;; Files
;; 2026-02-08 11:20:43 -10:00
;;
;; 529 lines
;; 18 KiB
;; Clojure
;; Vendored

(ns tech.v3.dataset.parse-test
(:require [clojure.test :refer [deftest is]]
[tech.v3.datatype :as dtype]
[tech.v3.datatype.functional :as dfn]
[tech.v3.datatype.bitmap :as bitmap]
[tech.v3.dataset :as ds]
[tech.v3.dataset.zip :as zip]
[tech.v3.dataset.column :as ds-col]
[tech.v3.dataset.protocols :as ds-proto]
[tech.v3.dataset.io.nippy]
[tech.v3.libs.arrow :as arrow]
[tech.v3.libs.clj-transit :as ds-transit]
[taoensso.nippy :as nippy]
[clojure.set :as set]
[clojure.java.io :as io])
(:import [com.univocity.parsers.csv CsvFormat CsvParserSettings CsvParser]
[java.nio.charset StandardCharsets]))
(def test-file
  "Path of the Ames house-prices training CSV parsed by several tests in this namespace."
  "test/data/ames-house-prices/train.csv")
(def missing-data
  "Seq of [column-name missing-count] pairs ordered by column name.
  base-ames-parser-test compares only the column names against the parsed
  dataset; the counts are carried along but not asserted there."
  (->> {"LotFrontage"  259
        "Alley"        1369
        "MasVnrType"   8
        "MasVnrArea"   8
        "BsmtQual"     37
        "BsmtCond"     37
        "BsmtExposure" 38
        "BsmtFinType1" 37
        "BsmtFinType2" 38
        "Electrical"   1
        "FireplaceQu"  690
        "GarageType"   81
        "GarageYrBlt"  81
        "GarageFinish" 81
        "GarageQual"   81
        "GarageCond"   81
        "PoolQC"       1453
        "Fence"        1179
        "MiscFeature"  1406}
       ;; Plain vectors, matching the [name count] shape produced by juxt.
       (map (fn [[column-name missing-count]]
              [column-name missing-count]))
       (sort-by first)))
(def datatype-answers
  "Vector of [column-name datatype] pairs, sorted by column name, that
  base-ames-parser-test expects the parser to detect for the Ames CSV.
  The table is written grouped by datatype and flattened/sorted here."
  (->> {:int16  ["1stFlrSF" "2ndFlrSF" "3SsnPorch" "BedroomAbvGr"
                 "BsmtFinSF1" "BsmtFinSF2" "BsmtFullBath" "BsmtHalfBath"
                 "BsmtUnfSF" "EnclosedPorch" "Fireplaces" "FullBath"
                 "GarageArea" "GarageCars" "GarageYrBlt" "GrLivArea"
                 "HalfBath" "Id" "KitchenAbvGr" "LotFrontage"
                 "LowQualFinSF" "MSSubClass" "MasVnrArea" "MiscVal"
                 "MoSold" "OpenPorchSF" "OverallCond" "OverallQual"
                 "PoolArea" "ScreenPorch" "TotRmsAbvGrd" "TotalBsmtSF"
                 "WoodDeckSF" "YearBuilt" "YearRemodAdd" "YrSold"]
        :int32  ["LotArea" "SalePrice"]
        :string ["Alley" "BldgType" "BsmtCond" "BsmtExposure"
                 "BsmtFinType1" "BsmtFinType2" "BsmtQual" "CentralAir"
                 "Condition1" "Condition2" "Electrical" "ExterCond"
                 "ExterQual" "Exterior1st" "Exterior2nd" "Fence"
                 "FireplaceQu" "Foundation" "Functional" "GarageCond"
                 "GarageFinish" "GarageQual" "GarageType" "Heating"
                 "HeatingQC" "HouseStyle" "KitchenQual" "LandContour"
                 "LandSlope" "LotConfig" "LotShape" "MSZoning"
                 "MasVnrType" "MiscFeature" "Neighborhood" "PavedDrive"
                 "PoolQC" "RoofMatl" "RoofStyle" "SaleCondition"
                 "SaleType" "Street" "Utilities"]}
       (mapcat (fn [[datatype column-names]]
                 (map (fn [column-name] [column-name datatype])
                      column-names)))
       ;; Column names are unique, so sorting by name yields one fixed order.
       (sort-by first)
       vec))
;; Parses the Ames CSV with default options and compares the detected column
;; names, datatypes and set of columns with missing values against the
;; expectation tables above; then re-parses a 3-column, 100-record subset.
(deftest base-ames-parser-test
  (let [parsed (ds/->dataset test-file)
        detected (->> (vals parsed)
                      (map (comp (juxt :name :datatype) meta))
                      (sort-by first)
                      vec)]
    ;; Same set of column names on both sides.
    (is (= (into #{} (map first) datatype-answers)
           (into #{} (map first) detected)))
    ;; Collect every column whose detected datatype differs from the expected
    ;; one; nil (no differences) is the passing outcome.
    (let [detected-by-name (into {} detected)
          mismatches (seq
                      (for [[colname expected] datatype-answers
                            :let [actual (detected-by-name colname)]
                            :when (not= actual expected)]
                        {:name colname
                         :expected-datatype expected
                         :result-datatype actual}))]
      (is (nil? mismatches)
          (str mismatches)))
    ;; Only the names of columns with at least one missing value are compared.
    (let [columns-with-missing
          (->> (vals parsed)
               (keep (fn [column]
                       (let [n-missing (dtype/ecount (ds-col/missing column))]
                         (when-not (= 0 n-missing)
                           [(ds-col/column-name column) n-missing]))))
               (sort-by first))]
      (is (= (set (map first missing-data))
             (set (map first columns-with-missing))))))
  (let [subset-options {:n-records 100
                        :column-whitelist ["Id" "SalePrice" "YearBuilt"]}
        subset (ds/->dataset test-file subset-options)]
    (is (= 3 (count subset)))
    ;; NOTE(review): the previous comment here read "Header row accounts for
    ;; one", yet exactly 100 rows are expected - confirm the comment is stale.
    (is (= 100 (ds/row-count subset)))))
;; Checks that :n-records and :column-whitelist passed to ->dataset are
;; reflected in the shape of the returned dataset.
(deftest base-ames-load-test
  (let [load-options {:n-records 100
                      :column-whitelist ["Id" "SalePrice" "YearBuilt"]}
        subset (ds/->dataset test-file load-options)]
    (is (= 3 (ds/column-count subset)))
    ;; NOTE(review): the previous comment here read "Header row accounts for
    ;; one", yet exactly 100 rows are expected - confirm the comment is stale.
    (is (= 100 (ds/row-count subset)))))
;; Exercises :parser-fn given first as a single datatype keyword and then as
;; a map of column name -> datatype.
(deftest specify-column-types
  (let [columns ["1stFlrSF" "2ndFlrSF" "3SsnPorch"]
        load-with (fn [parser-fn]
                    (ds/->dataset test-file
                                  {:n-records 100
                                   :column-whitelist columns
                                   :parser-fn parser-fn}))
        datatypes-of (fn [dataset]
                       (into #{} (map dtype/get-datatype) (vals dataset)))]
    ;; A bare keyword: every column is expected to come back as :float32.
    (let [all-float (load-with :float32)]
      (is (= #{:float32}
             (datatypes-of all-float)))
      (is (= 3 (ds/column-count all-float))))
    ;; A map names two columns; the third is expected to be :int16.
    (let [mixed (load-with {"1stFlrSF" :float32
                            "2ndFlrSF" :int32})]
      (is (= #{:float32 :int32 :int16}
             (datatypes-of mixed))))))
;; Loads sample01.csv with a semicolon passed as :separator and expects
;; three columns.
(deftest semi-colon-delimited-file
  (let [parsed (-> "test/data/sample01.csv"
                   (ds/->dataset {:separator \;}))]
    (is (= 3 (ds/column-count parsed)))))
;; Loads essential.csv with :n-initial-skip-rows 1 and :skip-bad-rows? true
;; and expects five columns.
(deftest tough-file
  (let [load-options {:n-initial-skip-rows 1
                      :skip-bad-rows? true}
        parsed (ds/->dataset "test/data/essential.csv" load-options)]
    (is (= 5 (ds/column-count parsed)))))
(defn- make-essential-csv-parser
  "Builds a univocity CsvParser whose settings use \"\\n\" as the line
  separator, enable header extraction, and ignore leading and trailing
  whitespace."
  []
  (let [settings (CsvParserSettings.)]
    (.setLineSeparator (.getFormat settings) "\n")
    (.setHeaderExtractionEnabled settings true)
    (.setIgnoreLeadingWhitespaces settings true)
    (.setIgnoreTrailingWhitespaces settings true)
    (CsvParser. settings)))
;; Passes a caller-built CsvParser through the :csv-parser option and expects
;; essential.csv to load with five columns.
(deftest custom-csv-parser
  (let [parser (make-essential-csv-parser)
        parsed (ds/->dataset "test/data/essential.csv"
                             {:csv-parser parser
                              :skip-bad-rows? true})]
    (is (= 5 (ds/column-count parsed)))))
;; Round-trips datasets through "test.tsv": first a plain three-column
;; subset, then a subset in which rows 2, 4, 7 and 9 of "1stFlrSF" are marked
;; missing. The scratch file is removed in `finally`, matching the other
;; file-writing tests in this namespace, so it no longer lingers in the
;; working directory.
(deftest simple-write-test
  (try
    ;; NOTE(review): this call passes :num-rows while the rest of the file
    ;; uses :n-records - confirm both keys are honored by ->dataset.
    (let [initial-ds (ds/->dataset
                      test-file
                      {:num-rows 20
                       :column-whitelist ["1stFlrSF" "2ndFlrSF" "3SsnPorch"]})
          _ (ds/write! initial-ds "test.tsv")
          new-ds (ds/->dataset "test.tsv")]
      (is (dfn/equals (initial-ds "1stFlrSF")
                      (new-ds "1stFlrSF")))
      (is (dfn/equals (initial-ds "2ndFlrSF")
                      (new-ds "2ndFlrSF"))))
    ;; Here the whitelist is given as numbers rather than names; the selection
    ;; presumably includes "1stFlrSF", since that column is updated below.
    (let [missing-ds (-> (ds/->dataset
                          test-file
                          {:n-records 20
                           :column-whitelist [43 44 69]})
                         (ds/update-column
                          "1stFlrSF"
                          #(ds-col/set-missing % [2 4 7 9])))
          _ (ds/write! missing-ds "test.tsv")
          new-ds (ds/->dataset "test.tsv")]
      (is (dfn/equals (missing-ds "1stFlrSF")
                      (new-ds "1stFlrSF")))
      ;; The missing set must survive the write/read cycle.
      (is (= #{2 4 7 9}
             (set (ds-col/missing (new-ds "1stFlrSF"))))))
    (finally
      (.delete (java.io.File. "test.tsv")))))
;; Checks the datatype of the "date" column for stocks.csv and
;; seattle-temps.csv under default parsing, then for stocks.csv with an
;; explicit :local-date parser for that column.
(deftest date-time-format-test-1
  (let [date-datatype (fn [dataset]
                        (dtype/get-datatype (dataset "date")))]
    (let [stocks (ds/->dataset "test/data/stocks.csv")]
      (is (= :packed-local-date (date-datatype stocks))))
    (let [temps (ds/->dataset "test/data/seattle-temps.csv")]
      (is (= :zoned-date-time (date-datatype temps))))
    (let [stocks-local-date (ds/->dataset "test/data/stocks.csv"
                                          {:parser-fn {"date" :local-date}})]
      (is (= :local-date (date-datatype stocks-local-date))))))
;; Feeds ->dataset an io/reader instead of a path, with :file-type :csv given
;; explicitly, and expects 560 rows.
(deftest custom-reader
  (is (= 560
         (-> (io/reader "test/data/stocks.csv")
             (ds/->dataset {:file-type :csv})
             (ds/row-count)))))
(defn verify-relaxed-parse
  "Asserts that the \"date\" column of `ds` has datatype :packed-local-date
  and that the :unparsed-data entry of the column's metadata contains exactly
  the values \"hello\" and \"1212\" (compared as a set)."
  [ds]
  (let [date-col (ds "date")
        ;; Only :unparsed-data is inspected. The former `unparsed-indexes`
        ;; binding was never read, and its type hints named classes that are
        ;; not imported in the ns form.
        unparsed-data (:unparsed-data (meta date-col))]
    (is (= :packed-local-date (dtype/get-datatype date-col)))
    ;;Make sure unparsed data came through intact
    (is (= #{"hello" "1212"}
           (set unparsed-data)))))
;; With default parsing the "date" column of stocks-bad-date.csv is expected
;; to be :string and to contain both bad values; the column is then re-parsed
;; with [:packed-local-date :relaxed?] and checked by verify-relaxed-parse.
(deftest bad-csv-relaxed-1
  (let [raw (ds/->dataset "test/data/stocks-bad-date.csv")
        bad-values #{"hello" "1212"}]
    (is (= :string (dtype/get-datatype (raw "date"))))
    ;; Both bad values must be among the column's unique values.
    (is (= bad-values
           (set/intersection bad-values
                             (set (ds-col/unique (raw "date"))))))
    (let [reparse (fn [column]
                    (ds-col/parse-column [:packed-local-date :relaxed?]
                                         column))
          reparsed (ds/update-column raw "date" reparse)]
      (verify-relaxed-parse reparsed))))
;; Same check as bad-csv-relaxed-1, but the relaxed parser is requested at
;; load time through :parser-fn.
(deftest bad-csv-relaxed-2
  (let [relaxed-date {"date" [:packed-local-date :relaxed?]}
        parsed (ds/->dataset "test/data/stocks-bad-date.csv"
                             {:parser-fn relaxed-date})]
    (verify-relaxed-parse parsed)))
;; With :key-fn keyword every column name of the loaded dataset must be a
;; keyword.
(deftest csv-keyword-colnames
  (let [parsed (ds/->dataset "test/data/stocks.csv" {:key-fn keyword})
        colnames (ds/column-names parsed)]
    (is (every? keyword? colnames))))
;; rcsv.csv is expected to yield the four named columns plus one column
;; named "column-0".
(deftest parse-empty-column-name
  (let [parsed (ds/->dataset "test/data/rcsv.csv")
        expected-names #{"column-0"
                         "Urban Female" "Urban Male"
                         "Rural Female" "Rural Male"}]
    (is (= expected-names
           (set (ds/column-names parsed))))))
;; The "ip" column of ip-addrs.csv must be detected as :string.
(deftest parse-ip-addrs-as-string
  (let [parsed (ds/->dataset "test/data/ip-addrs.csv")
        ip-column (parsed "ip")]
    (is (= :string (dtype/get-datatype ip-column)))))
(def arrow-file
  "Path of the iris Feather fixture."
  "test/data/iris.feather")
(def parquet-file
  "Path of the parquet fixture referenced by the disabled parse-parquet test."
  "test/data/parquet/userdata1.parquet")
;; Disabled: the #_ reader macro discards the whole deftest form below, so it
;; is never defined or run.
;; Original note: we will get back to this one. Potentially there are good
;; ways into this via arrow.
;; When enabled it would expect 13 columns, 1000 rows and the listed set of
;; column datatypes for parquet-file.
#_(deftest parse-parquet
(let [ds (ds/->dataset parquet-file)]
(is (= 13 (ds/column-count ds)))
(is (= 1000 (ds/row-count ds)))
(is (= #{:local-date-time :float64 :int32 :string}
(->> (map dtype/get-datatype (vals ds))
set)))))
;; Loads ragged.csv without a header row and with keyword column names.
;; Expects twelve generated names :column-0 .. :column-11, and nil in the
;; trailing positions of row 10.
(deftest parse-ragged
  (let [ragged (ds/->dataset "test/data/ragged.csv"
                             {:header-row? false
                              :key-fn keyword})
        expected-names (mapv #(keyword (str "column-" %)) (range 12))
        row-values (fn [row-idx]
                     (vec ((ds/value-reader ragged) row-idx)))]
    (is (= expected-names
           (vec (ds/column-names ragged))))
    (is (= 12 (ds/column-count ragged)))
    (is (= [4 24 31 33 65 67 68 71 75 76 93 97]
           (row-values 4)))
    (is (= [10 33 51 66 67 84 nil nil nil nil nil nil]
           (row-values 10)))))
;; Expects 197 values of the "pvalue" column to differ from 0.0.
(deftest parse-small-doubles
  (let [parsed (ds/->dataset "test/data/double_parse_test.csv")
        nonzero (remove #(= 0.0 %) (parsed "pvalue"))]
    (is (= 197 (count nonzero)))))
;; :separator given as the string "," must load the file as usual, while the
;; two-character string ",n" must make ->dataset throw.
(deftest string-separators
  (let [path "test/data/double_parse_test.csv"
        parsed (ds/->dataset path {:separator ","})
        nonzero (remove #(= 0.0 %) (parsed "pvalue"))]
    (is (= 197 (count nonzero)))
    (is (thrown? Throwable (ds/->dataset path {:separator ",n"})))))
;; Writes a one-row dataset with :quote? true and checks the exact text of
;; the resulting file; the file is deleted afterwards.
(deftest quoted-column-data
  (let [out-path "quoted.csv"]
    (try
      (let [single-row (ds/->dataset [{:a "onelongstring"}])]
        (ds/write! single-row out-path {:quote? true})
        (is (= "\"a\"\n\"onelongstring\"\n"
               (slurp out-path))))
      (finally
        (.delete (io/file out-path))))))
;; Builds a three-row dataset (the last row is an empty map) parsed with
;; :parser-fn :text and checks that the :text datatype survives a round trip
;; through csv, nippy, and the Arrow copying and in-place readers. The row
;; count is checked for nippy and for both Arrow reads. The Arrow row-count
;; assertions previously re-checked `nippy-ds`; they now check the dataset
;; that was just read. All scratch files are deleted in `finally`.
(deftest text-data
  (try
    (let [ds (ds/->dataset [{:a "onestring"}
                            {:a "anotherstring"}
                            {}]
                           {:parser-fn :text})
          _ (is (= :text (-> (ds :a) meta :datatype)))
          _ (ds/write! ds "text.csv")
          _ (ds/write! ds "text.nippy")
          csv-ds (ds/->dataset "text.csv" {:parser-fn {"a" :text}
                                           :key-fn keyword})
          _ (is (= :text (-> (csv-ds :a) meta :datatype)))
          ;; The csv row-count check was already disabled in the original.
          ;;_ (is (= 3 (ds/row-count csv-ds)))
          nippy-ds (ds/->dataset "text.nippy")
          _ (is (= :text (-> (nippy-ds :a) meta :datatype)))
          _ (is (= 3 (ds/row-count nippy-ds)))
          _ (arrow/write-dataset-to-stream! ds "text.arrow")
          ds-copy (arrow/read-stream-dataset-copying "text.arrow" {:key-fn keyword})
          _ (is (= :text (-> (ds-copy :a) meta :datatype)))
          _ (is (= 3 (ds/row-count ds-copy)))
          ;; No :key-fn here, so the column is looked up by the string "a".
          ds-inplace (arrow/read-stream-dataset-inplace "text.arrow")]
      (is (= :text (-> (ds-inplace "a") meta :datatype)))
      (is (= 3 (ds/row-count ds-inplace))))
    (finally
      (.delete (java.io.File. "text.csv"))
      (.delete (java.io.File. "text.nippy"))
      (.delete (java.io.File. "text.arrow")))))
;; Writes a one-column csv of strings and reads it back with an
;; [:int64 parse-fn] parser for column "a". The parse fn returns
;; :tech.v3.dataset/missing or :tech.v3.dataset/parse-failure for the two
;; sentinel strings and a long otherwise. The test then checks the values,
;; the missing set, and the unparsed metadata of the column.
(deftest custom-parse-method
  (let [csv-path "custom-parse.csv"
        parse-long-or-marker (fn [str-val]
                               (case str-val
                                 "missing" :tech.v3.dataset/missing
                                 "parse-failure" :tech.v3.dataset/parse-failure
                                 (Long/parseLong str-val)))]
    (try
      (let [src-ds (ds/->dataset {:a ["1" "missing" "parse-failure" "2" "3"]})
            _ (ds/write! src-ds csv-path)
            parsed (ds/->dataset
                    csv-path
                    {:parser-fn {"a" [:int64 parse-long-or-marker]}})
            a-meta (fn [] (meta (parsed "a")))]
        (is (= [1 nil nil 2 3]
               (vec (parsed "a"))))
        ;; Both sentinel rows are expected to be reported as missing...
        (is (= #{1 2} (set (ds/missing parsed))))
        ;; ...but only the parse failure is recorded as unparsed.
        (is (= #{2}
               (set (:unparsed-indexes (a-meta)))))
        (is (= ["parse-failure"]
               (vec (:unparsed-data (a-meta))))))
      (finally
        (.delete (io/file csv-path))))))
;; The "date" column read from the stored stocks-v5.nippy file must equal the
;; one parsed from stocks.csv.
(deftest stocks-v5
  (let [from-nippy (ds/->dataset "test/data/stocks-v5.nippy")
        from-csv (ds/->dataset "test/data/stocks.csv")
        dates (fn [dataset] (vec (dataset "date")))]
    (is (= (dates from-nippy)
           (dates from-csv)))))
;; Loading the gzipped csv from an input stream (with :file-type :csv and
;; :gzipped? true) must give the same row count as loading it by path.
(deftest gzipped-input-stream-issue-247
  (let [gz-path "test/data/ames-train.csv.gz"
        from-stream (-> (io/input-stream gz-path)
                        (ds/->dataset {:file-type :csv
                                       :gzipped? true}))
        from-path (ds/->dataset gz-path)]
    (is (= (ds/row-count from-path) (ds/row-count from-stream)))))
;; The first "abilities" value of pokemon.csv must come through as the exact
;; bracketed, single-quoted string below.
(deftest pokemon-csv
  (let [pokemon (ds/->dataset "test/data/pokemon.csv")
        first-abilities (first (pokemon "abilities"))]
    (is (= "['Overgrow', 'Chlorophyll']" first-abilities))))
;; issue-292.csv must load with three columns.
(deftest issue-292
  (let [parsed (ds/->dataset "test/data/issue-292.csv")
        n-columns (ds/column-count parsed)]
    (is (== 3 n-columns))))
;; Converts the stocks "date" column to strings, writes the dataset to
;; stocks.json, reads it back, and compares the "date" and "price" columns.
;; The json file is deleted afterwards.
(deftest json-test
  (let [json-path "stocks.json"]
    (try
      (let [stocks (ds/column-map (ds/->dataset "test/data/stocks.csv")
                                  "date" str ["date"])
            _ (ds/write! stocks json-path)
            from-json (ds/->dataset json-path)]
        (is (= (vec (stocks "date")) (vec (from-json "date"))))
        (is (dfn/equals (stocks "price") (from-json "price"))))
      (finally
        (.delete (io/file json-path))))))
;; A single column frozen and thawed with nippy must equal the original and
;; still be recognized as a column.
(deftest nippy-column
  (let [source (ds/->dataset {:a [1 2 3] :b [4 5 6]})
        round-tripped (-> (source :a)
                          nippy/freeze
                          nippy/thaw)]
    (is (dfn/equals (source :a) round-tripped))
    (is (ds-proto/is-column? round-tripped))))
;; empty-csv-header.csv must still produce seven columns; empty-csv.csv must
;; produce a dataset with no columns.
(deftest empty-csv
  (let [header-only (ds/->dataset "test/data/empty-csv-header.csv")]
    (is (= 7 (ds/column-count header-only))))
  (let [empty-ds (ds/->dataset "test/data/empty-csv.csv")]
    (is (= 0 (ds/column-count empty-ds)))
    (is (ds/dataset? empty-ds))))
;; csv-comment.csv must load with five rows, and the rows at indexes -1 and
;; -2 (presumably the last two rows - confirm) must be equal.
(deftest comment-char
  (let [parsed (ds/->dataset "test/data/csv-comment.csv")
        row-at (ds/rows parsed)]
    (is (= 5 (ds/row-count parsed)))
    (is (= (row-at -1) (row-at -2)))))
;; With :n-initial-skip-rows 10 the dataset must have a column named "10"
;; whose first value is 11.
(deftest issue-304
  (let [parsed (ds/->dataset "test/data/issue-292.csv"
                             {:n-initial-skip-rows 10})
        first-value (first (parsed "10"))]
    (is (= 11 first-value))))
;; unknown.zip must yield a sequence of two datasets.
(deftest issue-362
  (let [datasets (zip/zipfile->dataset-seq "test/data/unknown.zip")
        n-datasets (count datasets)]
    (is (= 2 n-datasets))))
;; A dataset with a numeric and a keyword column must survive a round trip
;; through a transit string.
(deftest issue-388-transit-support
  (let [original (ds/->dataset {:a [1 2 3]
                                :b [:one :two :three]})
        round-tripped (-> original
                          ds-transit/dataset->transit-str
                          ds-transit/transit-str->dataset)]
    (is (= (original :a) (round-tripped :a)))
    (is (= (original :b) (round-tripped :b)))))
;; Like issue-388-transit-support, with an added :packed-milli-instant column.
(deftest issue-434-transit-support
  (let [;; transit encoding is milli instants
        instants (dtype/make-container
                  :packed-milli-instant
                  [(java.time.Instant/now) (java.time.Instant/now)])
        original (ds/->dataset {:a [1 2 3]
                                :b [:one :two :three]
                                :c instants})
        round-tripped (-> original
                          ds-transit/dataset->transit-str
                          ds-transit/transit-str->dataset)]
    (is (= (original :a) (round-tripped :a)))
    (is (= (original :b) (round-tripped :b)))
    (is (= (original :c) (round-tripped :c)))))
;; Loads local_date.json with a :parser-fn map keyed by :time-period and
;; expects the "test" column to equal [1 2 3].
(deftest issue-414-json-parser-fn
  (is (= [1 2 3]
         (-> "test/data/local_date.json"
             (ds/->dataset {:parser-fn {:time-period :local-date}})
             (get "test")))))
;; Adds a row holding an Instant, clears the parser, adds another such row,
;; and expects the dereferenced parser to hold exactly one :date value.
(deftest dataset-parser-clear-packed-column
  (let [parser (ds/dataset-parser)]
    (doto parser
      (ds-proto/add-row {:date (java.time.Instant/now)})
      (ds-proto/ds-clear)
      (ds-proto/add-row {:date (java.time.Instant/now)}))
    (is (= 1 (count ((deref parser) :date))))))