529 lines
18 KiB
Clojure
Vendored
529 lines
18 KiB
Clojure
Vendored
(ns tech.v3.dataset.parse-test
|
|
(:require [clojure.test :refer [deftest is]]
|
|
[tech.v3.datatype :as dtype]
|
|
[tech.v3.datatype.functional :as dfn]
|
|
[tech.v3.datatype.bitmap :as bitmap]
|
|
[tech.v3.dataset :as ds]
|
|
[tech.v3.dataset.zip :as zip]
|
|
[tech.v3.dataset.column :as ds-col]
|
|
[tech.v3.dataset.protocols :as ds-proto]
|
|
[tech.v3.dataset.io.nippy]
|
|
[tech.v3.libs.arrow :as arrow]
|
|
[tech.v3.libs.clj-transit :as ds-transit]
|
|
[taoensso.nippy :as nippy]
|
|
[clojure.set :as set]
|
|
[clojure.java.io :as io])
|
|
(:import [com.univocity.parsers.csv CsvFormat CsvParserSettings CsvParser]
|
|
[java.nio.charset StandardCharsets]))
|
|
|
|
|
|
(def test-file
  "Relative path of the Ames house-prices training csv used by the parser tests."
  "test/data/ames-house-prices/train.csv")
|
|
|
|
|
|
(def missing-data
  "Sequence of [column-name missing-count] pairs, sorted by column name,
  describing the columns expected to contain missing values."
  (sort-by
   first
   (map (juxt :column-name :missing-count)
        [{:column-name "LotFrontage", :missing-count 259}
         {:column-name "Alley", :missing-count 1369}
         {:column-name "MasVnrType", :missing-count 8}
         {:column-name "MasVnrArea", :missing-count 8}
         {:column-name "BsmtQual", :missing-count 37}
         {:column-name "BsmtCond", :missing-count 37}
         {:column-name "BsmtExposure", :missing-count 38}
         {:column-name "BsmtFinType1", :missing-count 37}
         {:column-name "BsmtFinType2", :missing-count 38}
         {:column-name "Electrical", :missing-count 1}
         {:column-name "FireplaceQu", :missing-count 690}
         {:column-name "GarageType", :missing-count 81}
         {:column-name "GarageYrBlt", :missing-count 81}
         {:column-name "GarageFinish", :missing-count 81}
         {:column-name "GarageQual", :missing-count 81}
         {:column-name "GarageCond", :missing-count 81}
         {:column-name "PoolQC", :missing-count 1453}
         {:column-name "Fence", :missing-count 1179}
         {:column-name "MiscFeature", :missing-count 1406}])))
|
|
|
|
(def datatype-answers
  "Vector of [column-name expected-datatype] pairs, ordered by column name,
  that base-ames-parser-test compares against the datatypes detected when
  parsing `test-file`."
  [["1stFlrSF" :int16]       ["2ndFlrSF" :int16]
   ["3SsnPorch" :int16]      ["Alley" :string]
   ["BedroomAbvGr" :int16]   ["BldgType" :string]
   ["BsmtCond" :string]      ["BsmtExposure" :string]
   ["BsmtFinSF1" :int16]     ["BsmtFinSF2" :int16]
   ["BsmtFinType1" :string]  ["BsmtFinType2" :string]
   ["BsmtFullBath" :int16]   ["BsmtHalfBath" :int16]
   ["BsmtQual" :string]      ["BsmtUnfSF" :int16]
   ["CentralAir" :string]    ["Condition1" :string]
   ["Condition2" :string]    ["Electrical" :string]
   ["EnclosedPorch" :int16]  ["ExterCond" :string]
   ["ExterQual" :string]     ["Exterior1st" :string]
   ["Exterior2nd" :string]   ["Fence" :string]
   ["FireplaceQu" :string]   ["Fireplaces" :int16]
   ["Foundation" :string]    ["FullBath" :int16]
   ["Functional" :string]    ["GarageArea" :int16]
   ["GarageCars" :int16]     ["GarageCond" :string]
   ["GarageFinish" :string]  ["GarageQual" :string]
   ["GarageType" :string]    ["GarageYrBlt" :int16]
   ["GrLivArea" :int16]      ["HalfBath" :int16]
   ["Heating" :string]       ["HeatingQC" :string]
   ["HouseStyle" :string]    ["Id" :int16]
   ["KitchenAbvGr" :int16]   ["KitchenQual" :string]
   ["LandContour" :string]   ["LandSlope" :string]
   ["LotArea" :int32]        ["LotConfig" :string]
   ["LotFrontage" :int16]    ["LotShape" :string]
   ["LowQualFinSF" :int16]   ["MSSubClass" :int16]
   ["MSZoning" :string]      ["MasVnrArea" :int16]
   ["MasVnrType" :string]    ["MiscFeature" :string]
   ["MiscVal" :int16]        ["MoSold" :int16]
   ["Neighborhood" :string]  ["OpenPorchSF" :int16]
   ["OverallCond" :int16]    ["OverallQual" :int16]
   ["PavedDrive" :string]    ["PoolArea" :int16]
   ["PoolQC" :string]        ["RoofMatl" :string]
   ["RoofStyle" :string]     ["SaleCondition" :string]
   ["SalePrice" :int32]      ["SaleType" :string]
   ["ScreenPorch" :int16]    ["Street" :string]
   ["TotRmsAbvGrd" :int16]   ["TotalBsmtSF" :int16]
   ["Utilities" :string]     ["WoodDeckSF" :int16]
   ["YearBuilt" :int16]      ["YearRemodAdd" :int16]
   ["YrSold" :int16]])
|
|
|
|
|
|
(deftest base-ames-parser-test
  ;; Part one: parse the whole file with default options and compare the
  ;; detected column names, datatypes and set of columns-with-missing-values
  ;; against the expectations recorded in this namespace.
  (let [parsed (ds/->dataset test-file)
        columns (vals parsed)
        detected (mapv (juxt :name :datatype)
                       (sort-by :name (map meta columns)))]
    ;; Same set of column names as the expected answers.
    (is (= (set (map first datatype-answers))
           (set (map first detected))))

    ;; Per-column datatype comparison; any mismatches are reported in the
    ;; assertion message.
    (let [detected-by-name (into {} detected)
          mismatch (fn [[colname expected]]
                     (let [actual (detected-by-name colname)]
                       (when (not= actual expected)
                         {:name colname
                          :expected-datatype expected
                          :result-datatype actual})))
          differences (seq (remove nil? (map mismatch datatype-answers)))]
      (is (nil? differences)
          (str differences)))

    ;; Only the names of the columns containing missing values are compared.
    (let [name+missing-count (juxt ds-col/column-name
                                   (comp dtype/ecount ds-col/missing))
          result-missing-data (->> columns
                                   (map name+missing-count)
                                   (filter #(not= 0 (second %)))
                                   (sort-by first))]
      (is (= (set (map first missing-data))
             (set (map first result-missing-data))))))

  ;; Part two: restrict the parse to 100 records and three named columns.
  (let [limited (ds/->dataset
                 test-file
                 {:n-records 100
                  :column-whitelist ["Id" "SalePrice" "YearBuilt"]})]
    (is (= 3 (count limited)))
    ;; :n-records 100 is expected to yield exactly 100 dataset rows.
    (is (= 100 (ds/row-count limited)))))
|
|
|
|
|
|
(deftest base-ames-load-test
  ;; Checks that the :n-records and :column-whitelist options are honoured
  ;; when passed through ->dataset.
  (let [load-options {:n-records 100
                      :column-whitelist ["Id" "SalePrice" "YearBuilt"]}
        loaded (ds/->dataset test-file load-options)]
    (is (= 3 (ds/column-count loaded)))
    ;; :n-records 100 is expected to yield exactly 100 dataset rows.
    (is (= 100 (ds/row-count loaded)))))
|
|
|
|
|
|
(deftest specify-column-types
  (let [columns ["1stFlrSF" "2ndFlrSF" "3SsnPorch"]
        ;; Parse the first 100 records of the three columns with the given
        ;; :parser-fn value.
        parse-with (fn [parser-fn]
                     (ds/->dataset test-file
                                   {:n-records 100
                                    :column-whitelist columns
                                    :parser-fn parser-fn}))
        datatypes-of (fn [dataset]
                       (set (map dtype/get-datatype (vals dataset))))]
    ;; A single keyword applies to every column: everything parses as float32.
    (let [all-float (parse-with :float32)]
      (is (= #{:float32} (datatypes-of all-float)))
      (is (= 3 (ds/column-count all-float))))

    ;; A map of colname->datatype overrides only the named columns; the
    ;; third column is expected to come back as :int16.
    (let [mixed (parse-with {"1stFlrSF" :float32
                             "2ndFlrSF" :int32})]
      (is (= #{:float32 :int32 :int16} (datatypes-of mixed))))))
|
|
|
|
|
|
(deftest semi-colon-delimited-file
  ;; A character :separator of \; should split sample01.csv into 3 columns.
  (let [parsed (-> "test/data/sample01.csv"
                   (ds/->dataset {:separator \;}))]
    (is (= 3 (ds/column-count parsed)))))
|
|
|
|
|
|
(deftest tough-file
  ;; essential.csv is loaded with one initial row skipped and bad rows
  ;; dropped; 5 columns are expected.
  (let [load-options {:n-initial-skip-rows 1
                      :skip-bad-rows? true}
        parsed (ds/->dataset "test/data/essential.csv" load-options)]
    (is (= 5 (ds/column-count parsed)))))
|
|
|
|
|
|
(defn- make-essential-csv-parser
  "Builds a univocity CsvParser whose settings use \"\\n\" as the line
  separator, enable header extraction and ignore leading and trailing
  whitespace."
  []
  (let [settings (CsvParserSettings.)]
    (.. settings getFormat (setLineSeparator "\n"))
    (.setHeaderExtractionEnabled settings true)
    (.setIgnoreLeadingWhitespaces settings true)
    (.setIgnoreTrailingWhitespaces settings true)
    (CsvParser. settings)))
|
|
|
|
|
|
(deftest custom-csv-parser
  ;; Supplies a caller-built parser through :csv-parser instead of letting
  ;; ->dataset construct one.
  (let [parser (make-essential-csv-parser)
        parsed (ds/->dataset "test/data/essential.csv"
                             {:csv-parser parser
                              :skip-bad-rows? true})]
    (is (= 5 (ds/column-count parsed)))))
|
|
|
|
|
|
(deftest simple-write-test
  ;; Round-trips datasets through "test.tsv". The scratch file is removed in
  ;; a finally block so the test leaves nothing behind, matching the other
  ;; file-writing tests in this namespace.
  (try
    ;; Plain round trip: values written must equal values read back.
    ;; NOTE(review): this read uses :num-rows while every other read in this
    ;; file uses :n-records -- presumably both are accepted; confirm.
    (let [initial-ds (ds/->dataset
                      test-file
                      {:num-rows 20
                       :column-whitelist ["1stFlrSF" "2ndFlrSF" "3SsnPorch"]})
          _ (ds/write! initial-ds "test.tsv")
          new-ds (ds/->dataset "test.tsv")]
      (is (dfn/equals (initial-ds "1stFlrSF")
                      (new-ds "1stFlrSF")))
      (is (dfn/equals (initial-ds "2ndFlrSF")
                      (new-ds "2ndFlrSF"))))
    ;; Round trip with missing values: columns are selected by index here and
    ;; rows 2, 4, 7 and 9 of "1stFlrSF" are marked missing before writing.
    (let [missing-ds (-> (ds/->dataset
                          test-file
                          {:n-records 20
                           :column-whitelist [43 44 69]})
                         (ds/update-column
                          "1stFlrSF"
                          #(ds-col/set-missing % [2 4 7 9])))
          _ (ds/write! missing-ds "test.tsv")
          new-ds (ds/->dataset "test.tsv")]
      (is (dfn/equals (missing-ds "1stFlrSF")
                      (new-ds "1stFlrSF")))
      ;; The missing set must survive the write/read cycle.
      (is (= #{2 4 7 9}
             (set (ds-col/missing (new-ds "1stFlrSF"))))))
    (finally
      (.delete (java.io.File. "test.tsv")))))
|
|
|
|
|
|
(deftest date-time-format-test-1
  (let [date-datatype (fn [dataset]
                        (dtype/get-datatype (dataset "date")))]
    ;; Default detection for the stocks file.
    (is (= :packed-local-date
           (date-datatype (ds/->dataset "test/data/stocks.csv"))))
    ;; Default detection for the seattle temperatures file.
    (is (= :zoned-date-time
           (date-datatype (ds/->dataset "test/data/seattle-temps.csv"))))
    ;; An explicit :parser-fn entry overrides the detected datatype.
    (is (= :local-date
           (date-datatype (ds/->dataset "test/data/stocks.csv"
                                        {:parser-fn
                                         {"date" :local-date}}))))))
|
|
|
|
|
|
(deftest custom-reader
  ;; The input is a reader rather than a path, so the file type is given
  ;; explicitly via :file-type.
  (let [parsed (-> (io/reader "test/data/stocks.csv")
                   (ds/->dataset {:file-type :csv}))]
    (is (= 560 (ds/row-count parsed)))))
|
|
|
|
|
|
(defn verify-relaxed-parse
  "Shared assertions for the relaxed date-parsing tests.

  Given a dataset `ds` with a \"date\" column, asserts that the column's
  datatype is :packed-local-date and that the :unparsed-data entry of the
  column metadata holds exactly the values \"hello\" and \"1212\".

  Returns the result of the final assertion."
  [ds]
  (let [date-col (ds "date")
        ;; Only :unparsed-data is inspected; the previously bound (and never
        ;; used) :unparsed-indexes local and its type hints were dropped
        ;; because the hinted classes are not imported by this namespace.
        unparsed-data (:unparsed-data (meta date-col))]
    (is (= :packed-local-date (dtype/get-datatype date-col)))
    ;; Make sure unparsed data came through intact.
    (is (= #{"hello" "1212"}
           (set unparsed-data)))))
|
|
|
|
|
|
(deftest bad-csv-relaxed-1
  (let [parsed (ds/->dataset "test/data/stocks-bad-date.csv")
        date-col (parsed "date")]
    ;; Without a parser override the date column is expected to be :string.
    (is (= :string (dtype/get-datatype date-col)))
    ;; Both bad values must be present among the column's unique values.
    (is (= #{"hello" "1212"}
           (set/intersection #{"hello" "1212"}
                             (set (ds-col/unique date-col)))))
    ;; Re-parse the string column in relaxed mode and run the shared checks.
    (-> parsed
        (ds/update-column "date"
                          #(ds-col/parse-column
                            [:packed-local-date :relaxed?] %))
        (verify-relaxed-parse))))
|
|
|
|
|
|
(deftest bad-csv-relaxed-2
  ;; Same checks as bad-csv-relaxed-1, but the relaxed parser is requested
  ;; up front through :parser-fn.
  (let [relaxed-options {:parser-fn
                         {"date" [:packed-local-date :relaxed?]}}]
    (verify-relaxed-parse
     (ds/->dataset "test/data/stocks-bad-date.csv" relaxed-options))))
|
|
|
|
|
|
(deftest csv-keyword-colnames
  ;; With :key-fn keyword every column name should be a keyword.
  (let [column-names (-> "test/data/stocks.csv"
                         (ds/->dataset {:key-fn keyword})
                         (ds/column-names))]
    (is (every? keyword? column-names))))
|
|
|
|
|
|
(deftest parse-empty-column-name
  ;; The expected name set includes "column-0" alongside the four named
  ;; columns.
  (let [expected-names #{"column-0" "Urban Female" "Urban Male" "Rural Female" "Rural Male"}
        parsed (ds/->dataset "test/data/rcsv.csv")]
    (is (= expected-names
           (set (ds/column-names parsed))))))
|
|
|
|
|
|
(deftest parse-ip-addrs-as-string
  ;; The "ip" column must be detected as :string.
  (let [ip-column ((ds/->dataset "test/data/ip-addrs.csv") "ip")]
    (is (= :string (dtype/get-datatype ip-column)))))
|
|
|
|
|
|
(def arrow-file
  "Relative path of the iris feather test file."
  "test/data/iris.feather")

(def parquet-file
  "Relative path of the parquet test file read by the (disabled) parse-parquet test."
  "test/data/parquet/userdata1.parquet")
|
|
|
|
|
|
;; Disabled for now (the whole form is discarded by the #_ reader macro).
;; We will get back to this one; potentially there are good ways into
;; parquet via arrow.
#_(deftest parse-parquet
    (let [parsed (ds/->dataset parquet-file)
          datatypes (set (map dtype/get-datatype (vals parsed)))]
      (is (= 13 (ds/column-count parsed)))
      (is (= 1000 (ds/row-count parsed)))
      (is (= #{:local-date-time :float64 :int32 :string}
             datatypes))))
|
|
|
|
|
|
(deftest parse-ragged
  ;; ragged.csv is read without a header row, so columns get generated names
  ;; which :key-fn then turns into keywords.
  (let [parsed (ds/->dataset "test/data/ragged.csv"
                             {:header-row? false
                              :key-fn keyword})
        row-values (fn [row-idx]
                     (vec ((ds/value-reader parsed) row-idx)))]
    (is (= [:column-0 :column-1 :column-2 :column-3 :column-4 :column-5
            :column-6 :column-7 :column-8 :column-9 :column-10 :column-11]
           (vec (ds/column-names parsed))))
    (is (= 12 (ds/column-count parsed)))
    ;; Row 4 has a value in all twelve columns.
    (is (= [4 24 31 33 65 67 68 71 75 76 93 97]
           (row-values 4)))
    ;; Row 10 has six values; the remaining columns read back as nil.
    (is (= [10 33 51 66 67 84 nil nil nil nil nil nil]
           (row-values 10)))))
|
|
|
|
|
|
(deftest parse-small-doubles
  ;; 197 entries of the "pvalue" column are expected to differ from 0.0.
  (let [pvalues ((ds/->dataset "test/data/double_parse_test.csv") "pvalue")
        non-zero (filter (fn [v] (not= 0.0 v)) pvalues)]
    (is (= 197 (count non-zero)))))
|
|
|
|
|
|
(deftest string-separators
  (let [path "test/data/double_parse_test.csv"
        ;; A one-character string is accepted as the separator.
        pvalues ((ds/->dataset path {:separator ","}) "pvalue")]
    (is (= 197 (count (filter (fn [v] (not= 0.0 v)) pvalues))))
    ;; A two-character separator string must be rejected with a throw.
    (is (thrown? Throwable (ds/->dataset path {:separator ",n"})))))
|
|
|
|
|
|
(deftest quoted-column-data
  ;; With :quote? true both the header and the value are written wrapped in
  ;; double quotes. The output file is removed afterwards.
  (try
    (-> (ds/->dataset [{:a "onelongstring"}])
        (ds/write! "quoted.csv" {:quote? true}))
    (is (= "\"a\"\n\"onelongstring\"\n"
           (slurp "quoted.csv")))
    (finally
      (.delete (java.io.File. "quoted.csv")))))
|
|
|
|
|
|
(deftest text-data
  ;; Round-trips a :text column (two strings plus one empty row) through
  ;; csv, nippy and arrow (copying and in-place reads), checking that the
  ;; :text datatype survives each format. All scratch files are deleted in
  ;; the finally block.
  (try
    (let [ds (ds/->dataset [{:a "onestring"}
                            {:a "anotherstring"}
                            {}]
                           {:parser-fn :text})
          _ (is (= :text (-> (ds :a) meta :datatype)))
          _ (ds/write! ds "text.csv")
          _ (ds/write! ds "text.nippy")
          ;; csv round trip
          csv-ds (ds/->dataset "text.csv" {:parser-fn {"a" :text}
                                           :key-fn keyword})
          _ (is (= :text (-> (csv-ds :a) meta :datatype)))
          ;; The csv row-count check is intentionally left disabled.
          ;;_ (is (= 3 (ds/row-count csv-ds)))
          ;; nippy round trip
          nippy-ds (ds/->dataset "text.nippy")
          _ (is (= :text (-> (nippy-ds :a) meta :datatype)))
          _ (is (= 3 (ds/row-count nippy-ds)))
          ;; arrow round trip, copying read
          _ (arrow/write-dataset-to-stream! ds "text.arrow")
          ds-copy (arrow/read-stream-dataset-copying "text.arrow" {:key-fn keyword})
          _ (is (= :text (-> (ds-copy :a) meta :datatype)))
          ;; Check the arrow copy itself; this used to re-check nippy-ds.
          _ (is (= 3 (ds/row-count ds-copy)))
          ;; arrow round trip, in-place read (no :key-fn, so the column is "a")
          ds-inplace (arrow/read-stream-dataset-inplace "text.arrow")]
      (is (= :text (-> (ds-inplace "a") meta :datatype)))
      ;; Check the in-place dataset itself; this used to re-check nippy-ds.
      (is (= 3 (ds/row-count ds-inplace))))
    (finally
      (.delete (java.io.File. "text.csv"))
      (.delete (java.io.File. "text.nippy"))
      (.delete (java.io.File. "text.arrow")))))
|
|
|
|
|
|
(deftest custom-parse-method
  ;; Writes a single string column to csv, then reads it back with a custom
  ;; [datatype parse-fn] pair. The parse fn signals a missing value or a
  ;; parse failure by returning the corresponding namespaced keyword.
  (try
    (let [parse-long-or-flag (fn [str-val]
                               (case str-val
                                 "missing" :tech.v3.dataset/missing
                                 "parse-failure" :tech.v3.dataset/parse-failure
                                 (Long/parseLong str-val)))
          source (ds/->dataset {:a ["1" "missing" "parse-failure" "2" "3"]})
          _ (ds/write! source "custom-parse.csv")
          parsed (ds/->dataset "custom-parse.csv"
                               {:parser-fn {"a" [:int64 parse-long-or-flag]}})
          a-meta (meta (parsed "a"))]
      ;; Both flagged rows read back as nil.
      (is (= [1 nil nil 2 3]
             (vec (parsed "a"))))
      ;; Both flagged rows are reported as missing ...
      (is (= #{1 2} (set (ds/missing parsed))))
      ;; ... but only the parse failure is recorded as unparsed.
      (is (= #{2}
             (set (:unparsed-indexes a-meta))))
      (is (= ["parse-failure"]
             (vec (:unparsed-data a-meta)))))
    (finally
      (.delete (java.io.File. "custom-parse.csv")))))
|
|
|
|
|
|
(deftest stocks-v5
  ;; The "date" column of the stored stocks-v5.nippy file must match the
  ;; one obtained by parsing stocks.csv.
  (let [date-values (fn [path]
                      (vec ((ds/->dataset path) "date")))]
    (is (= (date-values "test/data/stocks-v5.nippy")
           (date-values "test/data/stocks.csv")))))
|
|
|
|
|
|
|
|
(deftest gzipped-input-stream-issue-247
  ;; Loading from a raw input stream with explicit :file-type and :gzipped?
  ;; must give the same row count as loading the same file by path.
  (let [path "test/data/ames-train.csv.gz"
        from-stream (ds/->dataset (io/input-stream path)
                                  {:file-type :csv
                                   :gzipped? true})
        from-path (ds/->dataset path)]
    (is (= (ds/row-count from-path) (ds/row-count from-stream)))))
|
|
|
|
|
|
(deftest pokemon-csv
  ;; The first "abilities" value contains quotes and a comma and must come
  ;; through as a single string.
  (let [abilities ((ds/->dataset "test/data/pokemon.csv") "abilities")]
    (is (= "['Overgrow', 'Chlorophyll']" (first abilities)))))
|
|
|
|
(deftest issue-292
  ;; Regression check: the file must parse into 3 columns.
  (let [parsed (ds/->dataset "test/data/issue-292.csv")
        n-columns (ds/column-count parsed)]
    (is (== 3 n-columns))))
|
|
|
|
|
|
(deftest json-test
  ;; Round-trips the stocks data through json. The "date" column is mapped
  ;; through `str` before writing. The output file is removed afterwards.
  (try
    (let [source (ds/column-map (ds/->dataset "test/data/stocks.csv")
                                "date" str ["date"])
          _ (ds/write! source "stocks.json")
          reloaded (ds/->dataset "stocks.json")]
      (is (= (vec (source "date")) (vec (reloaded "date"))))
      (is (dfn/equals (source "price") (reloaded "price"))))
    (finally
      (.delete (java.io.File. "stocks.json")))))
|
|
|
|
|
|
(deftest nippy-column
  ;; A single column frozen and thawed with nippy must keep its values and
  ;; still be a column.
  (let [column ((ds/->dataset {:a [1 2 3] :b [4 5 6]}) :a)
        round-tripped (-> column nippy/freeze nippy/thaw)]
    (is (dfn/equals column round-tripped))
    (is (ds-proto/is-column? round-tripped))))
|
|
|
|
|
|
(deftest empty-csv
  ;; empty-csv-header.csv is expected to produce 7 columns.
  (let [header-only (ds/->dataset "test/data/empty-csv-header.csv")]
    (is (= 7 (ds/column-count header-only))))
  ;; empty-csv.csv is expected to produce a dataset with no columns.
  (let [empty-ds (ds/->dataset "test/data/empty-csv.csv")]
    (is (= 0 (ds/column-count empty-ds)))
    (is (ds/dataset? empty-ds))))
|
|
|
|
|
|
(deftest comment-char
  (let [parsed (ds/->dataset "test/data/csv-comment.csv")
        rows (ds/rows parsed)
        last-row (rows -1)
        second-to-last-row (rows -2)]
    (is (= 5 (ds/row-count parsed)))
    ;; The final two rows are expected to be equal.
    (is (= last-row second-to-last-row))))
|
|
|
|
(deftest issue-304
  ;; After skipping 10 initial rows the column named "10" should start
  ;; with the value 11.
  (let [parsed (ds/->dataset "test/data/issue-292.csv" {:n-initial-skip-rows 10})
        column (parsed "10")]
    (is (= 11 (first column)))))
|
|
|
|
|
|
(deftest issue-362
  ;; unknown.zip is expected to yield two datasets.
  (let [datasets (zip/zipfile->dataset-seq "test/data/unknown.zip")
        n-datasets (count datasets)]
    (is (= 2 n-datasets))))
|
|
|
|
|
|
(deftest issue-388-transit-support
  ;; A dataset with a numeric and a keyword column must survive a transit
  ;; string round trip.
  (let [original (ds/->dataset {:a [1 2 3]
                                :b [:one :two :three]})
        round-tripped (-> original
                          ds-transit/dataset->transit-str
                          ds-transit/transit-str->dataset)]
    (is (= (original :a) (round-tripped :a)))
    (is (= (original :b) (round-tripped :b)))))
|
|
|
|
|
|
(deftest issue-434-transit-support
  ;; Same as the issue-388 round trip, plus a :packed-milli-instant column
  ;; built from two instants (transit encoding is milli instants).
  (let [instants [(java.time.Instant/now) (java.time.Instant/now)]
        original (ds/->dataset
                  {:a [1 2 3]
                   :b [:one :two :three]
                   :c (dtype/make-container :packed-milli-instant instants)})
        round-tripped (-> original
                          ds-transit/dataset->transit-str
                          ds-transit/transit-str->dataset)]
    (is (= (original :a) (round-tripped :a)))
    (is (= (original :b) (round-tripped :b)))
    (is (= (original :c) (round-tripped :c)))))
|
|
|
|
|
|
(deftest issue-414-json-parser-fn
  ;; Loading json with a :parser-fn entry for :time-period must still give
  ;; [1 2 3] for the "test" column.
  (let [parsed (ds/->dataset "test/data/local_date.json"
                             {:parser-fn {:time-period :local-date}})]
    (is (= [1 2 3] (get parsed "test")))))
|
|
|
|
(deftest dataset-parser-clear-packed-column
  ;; Add a row, clear the parser, add another row: after dereferencing the
  ;; parser the :date column must hold exactly one value.
  (let [parser (doto (ds/dataset-parser)
                 (ds-proto/add-row {:date (java.time.Instant/now)})
                 (ds-proto/ds-clear)
                 (ds-proto/add-row {:date (java.time.Instant/now)}))
        date-column ((deref parser) :date)]
    (is (= 1 (count date-column)))))
|