Files
2026-02-08 11:20:43 -10:00

125 lines
4.5 KiB
Clojure
Vendored

(ns tech.v3.libs.parquet-test
(:require [tech.v3.dataset :as ds]
[tech.v3.datatype :as dtype]
[tech.v3.datatype.functional :as dfn]
[tech.v3.libs.parquet :as parquet]
[tech.v3.dataset.utils :as ds-utils]
[tech.v3.dataset.column :as ds-col]
[tech.v3.datatype.datetime :as dtype-dt]
[clojure.test :refer [deftest is]]))
;; Runs once when this namespace is loaded: sets the slf4j log level to :info.
;; NOTE(review): presumably this keeps the parquet reader/writer's debug
;; logging out of the test output -- confirm.
(ds-utils/set-slf4j-log-level :info)
(deftest stocks-test
  ;; Round-trips the stocks CSV through a parquet file and checks that the
  ;; symbol, price and date columns read back match the source data.
  (let [parquet-path "stocks.parquet"]
    (try
      (let [csv-ds (ds/->dataset "test/data/stocks.csv")
            _ (ds/write! csv-ds parquet-path)
            parquet-ds (ds/->dataset parquet-path)]
        ;; Symbols read back from parquet are stringified before comparing.
        (is (= (vec (csv-ds "symbol"))
               (mapv str (parquet-ds "symbol"))))
        (is (dfn/equals (csv-ds "price")
                        (parquet-ds "price")))
        (is (= (vec (csv-ds "date"))
               (vec (parquet-ds "date")))))
      (finally
        ;; Remove the temporary file whether or not the assertions passed.
        (.delete (java.io.File. parquet-path))))))
(deftest userdata1-test
  ;; Reads the userdata1 parquet fixture, writes it back out as parquet, then
  ;; writes that result as nippy, and compares selected columns across the
  ;; three datasets.
  (let [parquet-path "userdata1.parquet"
        nippy-path "userdata1.nippy"]
    (try
      (let [source-ds (ds/->dataset "test/data/parquet/userdata1.parquet")
            _ (ds/write! source-ds parquet-path)
            parquet-ds (ds/->dataset parquet-path)
            _ (ds/write! parquet-ds nippy-path)
            nippy-ds (ds/->dataset nippy-path)]
        (is (= (vec (source-ds "registration_dttm"))
               (vec (parquet-ds "registration_dttm"))))
        (is (= (vec (source-ds "comments"))
               (vec (parquet-ds "comments"))))
        (is (= (vec (source-ds "comments"))
               (vec (nippy-ds "comments")))))
      (finally
        ;; Both temporary files are removed even when an assertion throws.
        (.delete (java.io.File. parquet-path))
        (.delete (java.io.File. nippy-path))))))
(deftest whitelist-test
  ;; Loading with a :column-whitelist of three names must yield a dataset
  ;; with exactly three columns.
  (let [load-opts {:column-whitelist ["first_name" "last_name" "gender"]}
        filtered-ds (ds/->dataset "test/data/parquet/userdata1.parquet"
                                  load-opts)]
    (is (= 3 (ds/column-count filtered-ds)))))
(deftest ames-ds
  ;; Round-trips the Ames house-prices training CSV through parquet and
  ;; compares the missing set of LotFrontage plus the CentralAir and
  ;; SalePrice column values.
  (let [parquet-path "ames.parquet"]
    (try
      (let [csv-ds (ds/->dataset "test/data/ames-house-prices/train.csv")
            _ (ds/write! csv-ds parquet-path)
            parquet-ds (ds/->dataset parquet-path)]
        (is (= (ds/missing (csv-ds "LotFrontage"))
               (ds/missing (parquet-ds "LotFrontage"))))
        (is (= (vec (csv-ds "CentralAir"))
               (vec (parquet-ds "CentralAir"))))
        (is (dfn/equals (csv-ds "SalePrice") (parquet-ds "SalePrice"))))
      (finally
        ;; Always clean up the temporary parquet file.
        (.delete (java.io.File. parquet-path))))))
(deftest uuid-test
  ;; Loads the uuid fixture with the "uuids" column parsed as :uuid, writes it
  ;; back out, reloads it with the same parser setting, and checks the
  ;; :datatype recorded in the column metadata both times.
  (let [parquet-path "test-uuid.parquet"
        load-opts {:parser-fn {"uuids" :uuid}}]
    (try
      (let [source-ds (ds/->dataset "test/data/uuid.parquet" load-opts)
            _ (ds/write! source-ds parquet-path)
            reloaded-ds (ds/->dataset parquet-path load-opts)]
        (is (= :uuid (-> (source-ds "uuids") meta :datatype)))
        (is (= :uuid (-> (reloaded-ds "uuids") meta :datatype))))
      (finally
        ;; Always clean up the temporary parquet file.
        (.delete (java.io.File. parquet-path))))))
(deftest missing-uint8-data
  ;; Round-trips a :uint8 column containing one missing value through parquet
  ;; and checks that the row count, the datatype and the missing set survive.
  ;;
  ;; Use a large enough value (245) that the system is forced to use uint8
  ;; columns; otherwise it will default to int8 columns based on the column
  ;; data min/max.
  (let [ds (ds/->dataset {:a (dtype/make-container :uint8 [10 20 245])})
        ;; Mark only row 1 as missing. The column has three rows, so valid
        ;; indices are 0-2; the assertions below expect the missing set to be
        ;; exactly [1] both before and after the round trip.
        ds (ds/update-column ds :a #(ds-col/set-missing % [1]))]
    (try
      (parquet/ds->parquet ds "test.parquet")
      (let [nds (ds/->dataset "test.parquet" {:key-fn keyword})]
        (is (= 3 (ds/row-count nds)))
        (is (= [1] (vec (dtype/->reader (ds/missing ds)))))
        (is (= :uint8 (dtype/elemwise-datatype (ds :a))))
        (is (= :uint8 (dtype/elemwise-datatype (nds :a))))
        (is (= [1] (vec (dtype/->reader (ds/missing nds))))))
      (finally
        ;; Remove the temporary file whether or not the assertions passed.
        (.delete (java.io.File. "test.parquet"))))))
(deftest nested-parquet
  ;; Loads a parquet fixture with nested fields and checks the values of the
  ;; "id" column and of two dotted-path columns.
  (let [nested-ds (ds/->dataset "test/data/nested.parquet")
        column-values (fn [colname] (vec (nested-ds colname)))]
    (is (= [1 nil 2 nil 3 nil nil] (column-values "id")))
    (is (= ["a" "b" "a" "b" "a" "b" "c"] (column-values "val.key_value.key")))
    (is (= ["va" "vb" nil nil "vb" nil nil] (column-values "val2.key_value.key")))))
(deftest local-time
  ;; Writes a dataset whose :b column holds ten copies of one LocalTime value
  ;; to parquet and checks the column reads back equal.
  (let [parquet-path "test.parquet"]
    (try
      (let [source-ds (ds/->dataset {:a (range 10)
                                     :b (repeat 10 (java.time.LocalTime/now))})
            _ (parquet/ds->parquet source-ds parquet-path)
            ;; Column names are read back as keywords so :b can be looked up.
            reloaded-ds (ds/->dataset parquet-path {:key-fn keyword})]
        (is (= (vec (source-ds :b))
               (vec (reloaded-ds :b)))))
      (finally
        ;; Always clean up the temporary parquet file.
        (.delete (java.io.File. parquet-path))))))
(deftest decimaltable
  ;; Checks the "decimals" column of the decimal fixture against its two
  ;; expected values using dfn/equals.
  (let [decimal-ds (ds/->dataset "test/data/decimaltable.parquet")
        decimal-col (decimal-ds "decimals")
        expected [3.420 1.246]]
    (is (dfn/equals expected decimal-col))))
(deftest issue-401-paruet-missing-column
  ;; Regression check: the 2024-03-03 fixture must load with four columns.
  (let [loaded (ds/->dataset "test/data/2024-03-03.parquet")]
    (is (= 4 (ds/column-count loaded)))))