125 lines
4.5 KiB
Clojure
Vendored
125 lines
4.5 KiB
Clojure
Vendored
(ns tech.v3.libs.parquet-test
|
|
(:require [tech.v3.dataset :as ds]
|
|
[tech.v3.datatype :as dtype]
|
|
[tech.v3.datatype.functional :as dfn]
|
|
[tech.v3.libs.parquet :as parquet]
|
|
[tech.v3.dataset.utils :as ds-utils]
|
|
[tech.v3.dataset.column :as ds-col]
|
|
[tech.v3.datatype.datetime :as dtype-dt]
|
|
[clojure.test :refer [deftest is]]))
|
|
|
|
;; Runs at namespace load time: asks ds-utils to set the slf4j log level to :info
;; before any of the tests below execute.
(ds-utils/set-slf4j-log-level :info)
|
|
|
|
|
|
(deftest stocks-test
  ;; Round-trips the stocks CSV through a parquet file in the working directory
  ;; and compares the "symbol", "price" and "date" columns of both datasets.
  ;; The parquet file is removed afterwards whether or not the checks pass.
  (let [parquet-path "stocks.parquet"]
    (try
      (let [csv-ds (ds/->dataset "test/data/stocks.csv")]
        (ds/write! csv-ds parquet-path)
        (let [roundtrip-ds (ds/->dataset parquet-path)]
          ;; Values read back from parquet are stringified before comparing.
          (is (= (vec (csv-ds "symbol"))
                 (mapv str (roundtrip-ds "symbol"))))
          (is (dfn/equals (csv-ds "price")
                          (roundtrip-ds "price")))
          (is (= (vec (csv-ds "date"))
                 (vec (roundtrip-ds "date"))))))
      (finally
        (.delete (java.io.File. parquet-path))))))
|
|
|
|
|
|
(deftest userdata1-test
  ;; Loads the userdata1 parquet fixture, rewrites it as parquet, reloads it,
  ;; then writes and reloads it again as nippy. Selected columns must match the
  ;; original fixture after each hop. Both output files are deleted at the end.
  (try
    (let [fixture-ds (ds/->dataset "test/data/parquet/userdata1.parquet")]
      (ds/write! fixture-ds "userdata1.parquet")
      (let [parquet-ds (ds/->dataset "userdata1.parquet")]
        (ds/write! parquet-ds "userdata1.nippy")
        (let [nippy-ds (ds/->dataset "userdata1.nippy")]
          ;; fixture vs. parquet round-trip
          (doseq [col-name ["registration_dttm" "comments"]]
            (is (= (vec (fixture-ds col-name))
                   (vec (parquet-ds col-name)))))
          ;; fixture vs. nippy round-trip
          (is (= (vec (fixture-ds "comments"))
                 (vec (nippy-ds "comments")))))))
    (finally
      (doseq [^String out-path ["userdata1.parquet" "userdata1.nippy"]]
        (.delete (java.io.File. out-path))))))
|
|
|
|
|
|
(deftest whitelist-test
  ;; Loads the userdata1 fixture with a :column-whitelist of three names and
  ;; checks that the resulting dataset has exactly three columns.
  (let [wanted-columns ["first_name" "last_name" "gender"]
        filtered-ds (ds/->dataset "test/data/parquet/userdata1.parquet"
                                  {:column-whitelist wanted-columns})]
    (is (= 3 (ds/column-count filtered-ds)))))
|
|
|
|
|
|
(deftest ames-ds
  ;; Round-trips the Ames house-prices training CSV through parquet and checks
  ;; that the missing set of "LotFrontage", the values of "CentralAir" and the
  ;; values of "SalePrice" agree between the source and the reloaded dataset.
  (let [parquet-path "ames.parquet"]
    (try
      (let [csv-ds (ds/->dataset "test/data/ames-house-prices/train.csv")]
        (ds/write! csv-ds parquet-path)
        (let [reloaded-ds (ds/->dataset parquet-path)]
          (is (= (ds/missing (csv-ds "LotFrontage"))
                 (ds/missing (reloaded-ds "LotFrontage"))))
          (is (= (vec (csv-ds "CentralAir"))
                 (vec (reloaded-ds "CentralAir"))))
          (is (dfn/equals (csv-ds "SalePrice") (reloaded-ds "SalePrice")))))
      (finally
        (.delete (java.io.File. parquet-path))))))
|
|
|
|
|
|
(deftest uuid-test
  ;; Reads the uuid fixture with the "uuids" column parsed as :uuid, writes it
  ;; back out as parquet, reloads it with the same parser option, and checks
  ;; that the column metadata reports :uuid as the datatype in both datasets.
  (let [parse-opts {:parser-fn {"uuids" :uuid}}
        out-path "test-uuid.parquet"]
    (try
      (let [source-ds (ds/->dataset "test/data/uuid.parquet" parse-opts)]
        (ds/write! source-ds out-path)
        (let [reloaded-ds (ds/->dataset out-path parse-opts)]
          (is (= :uuid (-> (source-ds "uuids") meta :datatype)))
          (is (= :uuid (-> (reloaded-ds "uuids") meta :datatype)))))
      (finally
        (.delete (java.io.File. out-path))))))
|
|
|
|
|
|
(deftest missing-uint8-data
  ;; Use a large enough value that the system is forced to use uint8 columns, else
  ;; it will default to int8 columns based on the column data min/max.
  ;;
  ;; The column is marked missing at indexes [1 5]; the assertions below expect
  ;; only index 1 to be reported as missing, both before and after the parquet
  ;; round trip, and expect the :uint8 datatype to survive the round trip.
  (let [out-path "test.parquet"
        source-ds (-> (ds/->dataset {:a (dtype/make-container :uint8 [10 20 245])})
                      (ds/update-column :a (fn [col] (ds-col/set-missing col [1 5]))))]
    (try
      (parquet/ds->parquet source-ds out-path)
      (let [reloaded-ds (ds/->dataset out-path {:key-fn keyword})]
        (is (= 3 (ds/row-count reloaded-ds)))
        (is (= [1] (vec (dtype/->reader (ds/missing source-ds)))))
        (is (= :uint8 (dtype/elemwise-datatype (source-ds :a))))
        (is (= :uint8 (dtype/elemwise-datatype (reloaded-ds :a))))
        (is (= [1] (vec (dtype/->reader (ds/missing reloaded-ds))))))
      (finally
        (.delete (java.io.File. out-path))))))
|
|
|
|
|
|
(deftest nested-parquet
  ;; Loads the nested parquet fixture and checks the contents of three columns,
  ;; including the dotted column names "val.key_value.key" and
  ;; "val2.key_value.key", against their expected values.
  (let [nested-ds (ds/->dataset "test/data/nested.parquet")
        expectations [["id" [1 nil 2 nil 3 nil nil]]
                      ["val.key_value.key" ["a" "b" "a" "b" "a" "b" "c"]]
                      ["val2.key_value.key" ["va" "vb" nil nil "vb" nil nil]]]]
    (doseq [[col-name expected-values] expectations]
      (is (= expected-values (vec (nested-ds col-name)))))))
|
|
|
|
|
|
(deftest local-time
  ;; Builds a ten-row dataset whose :b column repeats a single LocalTime value,
  ;; writes it to parquet, reloads it with keyword column names, and checks that
  ;; the :b column is unchanged. The parquet file is deleted afterwards.
  (try
    (let [out-path "test.parquet"
          time-value (java.time.LocalTime/now)
          source-ds (ds/->dataset {:a (range 10)
                                   :b (repeat 10 time-value)})]
      (parquet/ds->parquet source-ds out-path)
      (let [reloaded-ds (ds/->dataset out-path {:key-fn keyword})]
        (is (= (vec (source-ds :b))
               (vec (reloaded-ds :b))))))
    (finally
      (.delete (java.io.File. "test.parquet")))))
|
|
|
|
|
|
(deftest decimaltable
  ;; Loads the decimal-table parquet fixture and checks that its "decimals"
  ;; column compares equal (via dfn/equals) to the two expected values.
  (let [decimal-ds (ds/->dataset "test/data/decimaltable.parquet")
        decimal-col (decimal-ds "decimals")
        expected [3.420 1.246]]
    (is (dfn/equals expected decimal-col))))
|
|
|
|
|
|
(deftest issue-401-paruet-missing-column
  ;; Named after issue 401: the 2024-03-03 parquet fixture must load with
  ;; four columns.
  (is (= 4 (-> "test/data/2024-03-03.parquet"
               ds/->dataset
               ds/column-count))))
|