juan_gandhi | наманьячил претотайп (Reply)

github.com/h2oai/h2o-3/blob/f6b96396b9205fd342794d8308ea2e8a43c1a03a/h2o-scala/src/test/scala/water/userapi/H2ODatasetTest.scala

  // End-to-end walk through the Citibike sample: load the CSV, prune columns,
  // make gender categorical, one-hot encode, and finally split by gender.
  test("Citibike, end to end") {
    // Number of rows the sample file is expected to contain.
    val expectedSize = 20000

    // Load the sample CSV into a dataset.
    val dataset = H2ODataset.readFile("smalldata/demos/citibike_20k.csv")

    // Drop the two station-name columns; nothing below uses them.
    dataset.removeColumn("start station name", "end station name")

    // Every record from the file must have been read.
    val actualSize = dataset.length
    assert(expectedSize == actualSize)

    // Treat gender as a categorical column and look up its domain; three
    // categories are expected ("male", "female" and "N/A" according to the
    // original author's note).
    dataset.makeCategorical("gender")
    val genderDomainSize = dataset.domainOf("gender").map(_.length)
    assert(Some(3) == genderDomainSize)

    // One-hot encode every applicable column except gender, which is still
    // needed below as the stratification key.
    val encoded = dataset.oneHotEncodeExcluding("gender")

    // The encoded dataset's domain is expected to have 15 entries.
    val encodedDomainSize = encoded.domain.map(_.length)
    assert(Some(15) == encodedDomainSize)

    // A quarter of the rows goes to the validation set, the rest to training.
    val validFraction = 0.25
    val expectedValidSize = (expectedSize * validFraction).toInt
    val expectedTrainSize = expectedSize - expectedValidSize

    // Stratified split on the gender column; 55555 is the random seed.
    // The test fails outright if no split is produced.
    val (train, valid) = encoded
      .stratifiedSplit("gender", validFraction, 55555)
      .getOrElse(fail("Failed to stratify by gender"))

    assert(expectedTrainSize == ETL.length(train))
    assert(expectedValidSize == ETL.length(valid))
  }