Apr. 6th, 2017
наманьячил претотайп
Apr. 6th, 2017 08:12 pm github.com/h2oai/h2o-3/blob/f6b96396b9205fd342794d8308ea2e8a43c1a03a/h2o-scala/src/test/scala/water/userapi/H2ODatasetTest.scala
test("Citibike, end to end") {
  // End-to-end walk through the user API on the Citibike sample:
  // load -> drop columns -> categorical conversion -> one-hot encoding -> stratified split.
  val samplePath = "smalldata/demos/citibike_20k.csv"
  // Number of rows the sample file is expected to contain.
  val rowCount = 20000

  // Load the sample file and drop the two station-name columns, which this test does not need.
  val bikes = H2ODataset.readFile(samplePath)
  bikes.removeColumn("start station name", "end station name")

  // Every record of the sample must have been read.
  assert(rowCount == bikes.length)

  // Turn "gender" into a categorical column; its domain is expected to have
  // three levels ("male", "female" and "N/A").
  bikes.makeCategorical("gender")
  val genderLevelCount = bikes.domainOf("gender").map(_.length)
  assert(Some(3) == genderLevelCount)

  // One-hot encode every applicable column except "gender", which is still needed
  // below as the stratification column. The resulting domain is expected to have 15 entries.
  val encoded = bikes.oneHotEncodeExcluding("gender")
  assert(Some(15) == encoded.domain.map(_.length))

  // Stratified split: a quarter of the rows go to validation, the rest to training.
  val validFraction = 0.25
  val validRows = (rowCount * validFraction).toInt
  val trainRows = rowCount - validRows

  // Split on "gender" with 55555 as the random seed; a missing result fails the test.
  val (train, valid) = encoded
    .stratifiedSplit("gender", validFraction, 55555)
    .getOrElse(fail("Failed to stratify by gender"))

  assert(trainRows == ETL.length(train))
  assert(validRows == ETL.length(valid))
}