наманьячил претотайп
Apr. 6th, 2017 08:12 pm ![[personal profile]](https://www.dreamwidth.org/img/silk/identity/user.png)
github.com/h2oai/h2o-3/blob/f6b96396b9205fd342794d8308ea2e8a43c1a03a/h2o-scala/src/test/scala/water/userapi/H2ODatasetTest.scala
// End-to-end check of the dataset API on the Citibike sample: read the file,
// drop two columns, make "gender" categorical, one-hot encode the remaining
// columns, then do a stratified train/validation split and verify the sizes.
test("Citibike, end to end") {
  // Path of the sample file used by this test.
  val path = "smalldata/demos/citibike_20k.csv"

  // Read the file and produce a dataset from it.
  val dataset = H2ODataset.readFile(path)

  // Remove the two columns this test does not care about.
  dataset.removeColumn("start station name", "end station name")

  // The dataset must contain exactly this many rows.
  val expectedSize = 20000
  assert(expectedSize == dataset.length)

  // Convert the gender column to a categorical type; its domain is expected
  // to hold three values ("male", "female" and "N/A").
  dataset.makeCategorical("gender")
  val categories = dataset.domainOf("gender")
  assert(Some(3) == categories.map(_.length))

  // Apply one-hot encoding to all applicable columns except gender, which is
  // still needed below as the stratification column.
  val oneHot = dataset.oneHotEncodeExcluding("gender")

  // The domain of the encoded dataset is expected to have 15 entries.
  assert(Some(15) == oneHot.domain.map(_.length))

  // Stratified split: 0.75 of the rows go to train, 0.25 go to validation.
  val ratio = 0.25
  val expectedValidSize = (expectedSize * ratio).toInt
  val expectedTrainSize = expectedSize - expectedValidSize

  // Fixed random seed passed to the split.
  val seed = 55555

  // Stratify on the gender column; a missing result is a test failure.
  oneHot.stratifiedSplit("gender", ratio, seed) match {
    case Some((train, valid)) =>
      assert(expectedTrainSize == ETL.length(train))
      assert(expectedValidSize == ETL.length(valid))
    case None =>
      fail("Failed to stratify by gender")
  }
}