init research

This commit is contained in:
2026-02-08 11:20:43 -10:00
commit bdf064f54d
3041 changed files with 1592200 additions and 0 deletions
@@ -0,0 +1,189 @@
(ns tech.v3.dataset.categorical-test
(:require [tech.v3.dataset.categorical :as ds-cat]
[tech.v3.dataset.modelling :as ds-mod]
[tech.v3.datatype :as dtype]
[tech.v3.dataset.column-filters :as cf]
[clojure.test :refer [deftest is] :as t]
[tech.v3.dataset :as ds]))
;; Probability-distribution columns should round-trip to categorical labels:
;; per row, the name of the max-probability column becomes the label.
(deftest prediction
(is (= [:no :yes]
(->
(ds/->dataset {:yes [0.3 0.5] :no [0.7 0.5]})
(ds-mod/probability-distributions->label-column :val)
(ds-cat/reverse-map-categorical-xforms)
:val))))
;; Deriving the label column must leave the original probability columns
;; untouched.
(deftest prob-dist
(let [prob
(->
(ds/->dataset {:yes [0.3 0.5] :no [0.7 0.5]})
(ds-mod/probability-distributions->label-column :val)
(ds-cat/reverse-map-categorical-xforms))]
(is (= (:yes prob) [0.3 0.5]))
(is (= (:no prob) [0.7 0.5]))
(is (= (:val prob) [:no :yes]))))
;; categorical->number over a two-valued column assigns the codes 0 and 1.
(deftest cat-to-number
(is (=
(set
(->
(ds/->dataset {:x [:a :b] :y ["1" "0"]})
(ds/categorical->number [:y])
:y))
(set [0 1]))))
(defn- cat->num
  "Encode column :y of a fixed [:a :b :c :d] dataset via
  `ds/categorical->number` seeded with `table-args`, then return the
  resulting lookup table inverted (numeric code -> original value)."
  [table-args]
  (let [lookup (->
                (ds/->dataset {:y [:a :b :c :d]})
                (ds/categorical->number [:y] table-args)
                :y
                meta
                :categorical-map
                :lookup-table)]
    ;; Invert with core fns rather than clojure.set/map-invert: clojure.set
    ;; is not in this namespace's :require list, so the fully-qualified call
    ;; only works if some other loaded namespace happened to require it.
    (reduce-kv (fn [m k v] (assoc m v k)) {} lookup)))
;; NOTE: `deftest` takes no argument vector; the original `[]` after the
;; test name was a stray literal that was evaluated and discarded. Removed.
(deftest test-categorical->number
  ;; Explicit seed pairs pin the chosen codes; unseeded values receive
  ;; the remaining small integers.
  (is (= {5 :a, 2 :b, 0 :d, 1 :c}
         (cat->num [[:a 5] [:b 2]])))
  (is (= {5 :a, 0 :b, 1 :d, 2 :c}
         (cat->num [[:a 5] [:b 0]])))
  ;; Empty seed table: all codes assigned automatically.
  (is (= (cat->num [])
         {0 :d, 1 :c, 2 :a, 3 :b}))
  ;; Seeding a value absent from the column still reserves its code.
  (is (= (cat->num [[:not-present 1]])
         {1 :not-present, 0 :d, 2 :c, 3 :a, 4 :b}))
  (is (= (cat->num [[:a 1 :b 1]])
         {1 :a, 0 :d, 2 :c, 3 :b})))
;; Regression: mapping Survived ints to keywords and re-encoding via
;; cf/categorical must produce finite doubles (no NaN fill values).
(deftest cat-map-regression
(is (every? #(Double/isFinite %)
(-> (ds/->dataset "test/data/titanic.csv")
(ds/update-column "Survived"
(fn [col]
(let [val-map {0 :drowned
1 :survived}]
(dtype/emap val-map :keyword col))))
(ds/categorical->number cf/categorical)
(ds/column "Survived")))))
;; Codes for a 4-valued column should be exactly the integers 0..3.
(deftest categorical-assignments-are-integers
(is (= #{0 1 2 3}
(->
(ds/->dataset {:x1 [1 2 4 5 6 5 6 7]
:x2 [5 6 6 7 8 2 4 6]
:y [:a :b :b :a :c :a :b :d]})
(ds/categorical->number [:y])
(get :y)
distinct
set))))
;; Helper: build a two-row :target dataset from the given numeric values,
;; invert the supplied categorical map over it, and return whether the
;; inverted column equals `expected-result`.
(defn- =-invert-cat [target-1 target-2
lookup-one lookup-two
result-datatype
expected-result
]
(let [ds (ds/->dataset {:target [target-1 target-2]})
inverted
(ds-cat/invert-categorical-map ds
{:lookup-table {:one lookup-one
:two lookup-two},
:src-column :target,
:result-datatype result-datatype})
inverted-target (-> inverted :target)]
(= expected-result inverted-target)))
;(format "expected %s, found: %s" expected-result) (seq inverted-target)))
;; Inversion cases that currently succeed; the TODO-marked cases document
;; float targets being coerced onto int codes, which the author questions.
(deftest invert-cat--works
(is
(=-invert-cat 1 2
1 2
:int
[:one :two]))
; TODO - should pass ?
(is (=-invert-cat 1.0 2.0
1 2
:int
[:one :two]))
; TODO - should pass ?
(is (=-invert-cat 1.99999 2.99999
1 2
:int
[:one :two]))
; TODO - should pass ?
(is (=-invert-cat 1.2 1.3
1 2
:int
[:one :one])))
;; Inversion cases that must throw: lookup values that cannot be found
;; for the given target values / datatypes.
(deftest invert-cat--throws
(is (thrown? Exception
(=-invert-cat 1.0 2.0
1.0 2.0
:float
[:one :two])
;; => Execution error at tech.v3.dataset.categorical/invert-categorical-map$fn (categorical.clj:177).
;; Unable to find src value for numeric value 1.0
))
(is (thrown? Exception
(=-invert-cat 1 2
4 5
:int
[:one :two])))
;; => Execution error at tech.v3.dataset.categorical/invert-categorical-map$fn (categorical.clj:177).
;; Unable to find src value for numeric value 1
(is (thrown? Exception
(=-invert-cat 1 2
1.0 2.0
:int
[:one :two]))))
;; => Execution error at tech.v3.dataset.categorical/invert-categorical-map$fn (categorical.clj:177).
;; Unable to find src value for numeric value 1
;; Helper: derive a categorical map from a training dataset, attach it to a
;; raw numeric prediction column, and assert the reverse mapping recovers
;; the expected labels.
(defn- is-roundtrip-ok [raw-model-prediction]
(let [
train-ds
(->
(ds/->dataset {:target [:a :b :c]})
(ds/categorical->number [:target])
)
cat-map (-> train-ds :target meta :categorical-map)
prediction-ds
(->
(ds/->dataset {:target raw-model-prediction})
(ds/assoc-metadata [:target] :categorical-map cat-map)
(ds-cat/reverse-map-categorical-xforms))]
(is (= [:c :a :b] (:target prediction-ds)))
))
;; NOTE(review): the author expects only the exact-integer case to pass;
;; the float inputs below currently round-trip as well.
(deftest round-trip
;; only this should pass
(is-roundtrip-ok [0 1 2])
;; currently these all pass, while I would like them to all fail
(is-roundtrip-ok [0.0 1.2 2.2])
(is-roundtrip-ok [0.9 1.9 2.9])
(is-roundtrip-ok (float-array [0 1 2]))
(is-roundtrip-ok (float-array [0 1.9 2.9]))
(is-roundtrip-ok (double-array [0 1.5 2.2])))
+62
View File
@@ -0,0 +1,62 @@
(ns tech.v3.dataset.datetime-test
(:require [tech.v3.dataset :as ds]
[tech.v3.datatype :as dtype]
[tech.v3.datatype.datetime :as dtype-dt]
[clojure.test :refer [deftest is]]))
;; Converting the date column to epoch milliseconds should yield plain
;; numbers when read back through mapseq-reader.
(deftest epoch-millis-second-maps
(let [ds (-> (ds/->dataset "test/data/stocks.csv")
(ds/update-column "date" dtype-dt/datetime->milliseconds)
(ds/mapseq-reader))]
(is (number? (get (first ds) "date")))))
;; Sub-buffers of a packed local-date column keep the packed datatype.
(deftest datetime-column-datatype-test
(let [ds (ds/->dataset "test/data/stocks.csv")]
(is (= :packed-local-date
(-> (ds "date")
(dtype/->reader)
(dtype/sub-buffer 0 20)
(dtype/get-datatype))))))
;; Descriptive stats of a datetime column should themselves be datetimes.
(deftest stocks-descriptive-stats
(let [stocks (ds/->dataset "test/data/stocks.csv")
desc-stats (ds/descriptive-stats stocks)
date-only (-> (ds/filter-column desc-stats :col-name #(= "date" %))
(ds/mapseq-reader)
(first))]
(is (every? dtype-dt/datetime-datatype?
(map dtype/get-datatype
(vals (select-keys date-only [:min :mean :max])))))))
;; Same idea after casting dates to instants and requesting the full stat
;; set, including quartiles.
(deftest stocks-descriptive-stats-2
(let [stocks (-> (ds/->dataset "test/data/stocks.csv")
(ds/update-column "date" (partial dtype/emap
dtype-dt/local-date->instant
:instant)))
desc-stats (ds/descriptive-stats stocks {:stat-names (ds/all-descriptive-stats-names)})
date-only (-> (ds/filter-column desc-stats :col-name #(= "date" %))
(ds/mapseq-reader)
(first))]
(is (every? dtype-dt/datetime-datatype?
(map dtype/get-datatype
(vals (select-keys date-only [:min :mean :max
:quartile-1 :quartile-3])))))))
;; LocalDateTime values round-trip through a column via both nth and
;; dtype/get-value.
(deftest datetime-shenanigans-1
(is (= (java.time.LocalDateTime/of 2020 01 01 11 22 33)
(nth (ds/column
(ds/->dataset {:dt [(java.time.LocalDateTime/of 2020 01 01 11 22 33)
(java.time.LocalDateTime/of 2020 10 01 01 01 01)]})
:dt) 0)))
(is (= (java.time.LocalDateTime/of 2020 01 01 11 22 33)
(dtype/get-value
(ds/column
(ds/->dataset {:dt [(java.time.LocalDateTime/of 2020 01 01 11 22 33)
(java.time.LocalDateTime/of 2020 10 01 01 01 01)]})
:dt) 0))))
@@ -0,0 +1,57 @@
(ns tech.v3.dataset.format-sequence-test
(:require [tech.v3.dataset.format-sequence :refer [format-sequence]]
[clojure.test :refer [deftest is]]))
;; Fixture sequences exercising different numeric regimes for
;; format-sequence: magnitudes across many decades (a), varying decimal
;; precision (b), an irrational-looking step range (c), extreme exponents
;; (d/e/f), small decimals (g/h), special values + nil (i), and floats (j).
(def a [0.000001 0.00001 0.0001 0.001 0.01 0.1 0.0
1.0 10.0 100.0 1000.0 10000.0 100000.0])
(def b [10.0 10.1 10.11 10.111 10.1111 10.11111
1.0 1.1 1.11 1.111 1.1111 1.11111
0.0 0.1 0.11 0.111 0.1111 -0.11111])
(def c (range -5 4 0.8795833))
(def d [-1.0e-20 -1.334e-100 3.43e100 4.556e20
1.0e-20 1.334e-100 -3.43e100 -41.556e20
0.999e-300 -0.999e300])
(def e [-1.0e99 1.0e99])
(def f [-1.0e100 1.0e100])
(def g [0.002 0.0002 0.000333 0.1 -0.0003 0.0])
(def h [0.002 0.0002 0.00333 0.00001 -0.0003 0.022 0.0001])
(def i [10.0 ##NaN ##Inf ##-Inf 100 0.001 nil])
(def j (map float [39.81 36.35 43.22 28.37 25.45
-39.81 36.351 43.221 28.371 25.451]))
;; Golden-output regression tests: each expectation is the exact padded
;; string sequence format-sequence must emit for the fixture above,
;; optionally with explicit digits / exponent-threshold arguments.
(deftest regression-tests
(is (= (format-sequence j)
'(" 39.810" " 36.350" " 43.220" " 28.370" " 25.450" "-39.810" " 36.351" " 43.221" " 28.371" " 25.451")))
(is (= (format-sequence i 0 0)
'("1.0E+01" " NaN" " Inf" " -Inf" "1.0E+02" "1.0E-03" " NaN")))
(is (= (format-sequence a)
'(" 0.000001" " 0.000010" " 0.000100" " 0.001000" " 0.010000" " 0.100000" " 0.000000" " 1.000000" " 10.000000" " 100.000000" " 1000.000000" " 10000.000000" "100000.000000")))
(is (= (format-sequence a 5 4)
'("1.0E-06" "1.0E-05" "1.0E-04" "1.0E-03" "1.0E-02" "1.0E-01" "0.0E+00" "1.0E+00" "1.0E+01" "1.0E+02" "1.0E+03" "1.0E+04" "1.0E+05")))
(is (= (format-sequence b)
'("10.00000" "10.10000" "10.11000" "10.11100" "10.11110" "10.11111" " 1.00000" " 1.10000" " 1.11000" " 1.11100" " 1.11110" " 1.11111" " 0.00000" " 0.10000" " 0.11000" " 0.11100" " 0.11110" "-0.11111")))
(is (= (format-sequence b 5 2)
'(" 1.00000E+01" " 1.01000E+01" " 1.01100E+01" " 1.01110E+01" " 1.01111E+01" " 1.01111E+01" " 1.00000E+00" " 1.10000E+00" " 1.11000E+00" " 1.11100E+00" " 1.11110E+00" " 1.11111E+00" " 0.00000E+00" " 1.00000E-01" " 1.10000E-01" " 1.11000E-01" " 1.11100E-01" "-1.11110E-01")))
(is (= (format-sequence c)
'("-5.0000000" "-4.1204167" "-3.2408334" "-2.3612501" "-1.4816668" "-0.6020835" " 0.2774998" " 1.1570831" " 2.0366664" " 2.9162497" " 3.7958330")))
(is (= (format-sequence c 4)
'("-5.0000" "-4.1204" "-3.2408" "-2.3613" "-1.4817" "-0.6021" " 0.2775" " 1.1571" " 2.0367" " 2.9162" " 3.7958")))
(is (= (format-sequence c 4 0)
'("-5.0000E+00" "-4.1204E+00" "-3.2408E+00" "-2.3613E+00" "-1.4817E+00" "-6.0208E-01" " 2.7750E-01" " 1.1571E+00" " 2.0367E+00" " 2.9162E+00" " 3.7958E+00")))
(is (= (format-sequence d)
'("-1.0000E-020" "-1.3340E-100" " 3.4300E+100" " 4.5560E+020" " 1.0000E-020" " 1.3340E-100" "-3.4300E+100" "-4.1556E+021" " 9.9900E-301" "-9.9900E+299")))
(is (= (format-sequence e)
'("-1.0E+99" " 1.0E+99")))
(is (= (format-sequence f)
'("-1.0E+100" " 1.0E+100")))
(is (= (format-sequence g)
'(" 0.002000" " 0.000200" " 0.000333" " 0.100000" "-0.000300" " 0.000000")))
(is (= (format-sequence h)
'(" 0.00200" " 0.00020" " 0.00333" " 0.00001" "-0.00030" " 0.02200" " 0.00010")))
(is (= (format-sequence i)
'(" 10.000" " NaN" " Inf" " -Inf" "100.000" " 0.001" " NaN")))
(is (= (format-sequence i 0 0)
'("1.0E+01" " NaN" " Inf" " -Inf" "1.0E+02" "1.0E-03" " NaN")))
(is (= (format-sequence j)
'(" 39.810" " 36.350" " 43.220" " 28.370" " 25.450" "-39.810" " 36.351" " 43.221" " 28.371" " 25.451"))))
+37
View File
@@ -0,0 +1,37 @@
(ns tech.v3.dataset.github-test
(:require [tech.v3.dataset :as ds]
[tech.v3.datatype :as dtype]
[clojure.test :refer [deftest is]]))
;; Everything below is inside a top-level (comment ...) form and is never
;; compiled or run: a network-dependent test plus REPL timing experiments
;; against the flights14 dataset.
(comment
;;This sometimes returns a 500 error.
(deftest load-github-events
(let [ds (ds/->dataset "https://api.github.com/events" {:file-type :json
:key-fn keyword})]
(is (every? keyword? (ds/column-names ds)))
(is (= [8 30] (dtype/shape ds)))))
(do
(require '[tech.v3.datatype.functional :as dfn])
(require '[tech.v3.datatype.argops :as argops])
(require '[tech.v3.datatype.unary-pred :as un-pred])
(defonce flights (ds/->dataset "https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv")))
(time (-> (dfn/+ (flights "arr_delay")
(flights "dep_delay"))
(dfn/< 0)
(un-pred/bool-reader->indexes)
(dtype/ecount)))
;;Another way to get the same result is to use summation. Booleans are
;;interpreted very specifically below where false is 0 and 1 is true.
;;Double summation is very fast.
(time (-> (dfn/+ (flights "arr_delay")
(flights "dep_delay"))
(dfn/< 0)
(dfn/sum)))
)
+45
View File
@@ -0,0 +1,45 @@
(ns tech.v3.dataset.infer-test
(:require [tech.v3.dataset :as ds]
[tech.v3.datatype :as dtype]
[tech.v3.datatype.functional :as dfn]
[tech.v3.datatype.bitmap :as bitmap]
[clojure.test :refer [deftest is]]))
;; Datatype inference when assoc'ing raw sequences onto an empty dataset:
;; booleans stay boolean, ints widen to double in the presence of NaN,
;; nils become NaN (double) or missing entries, and an explicit
;; :tech.v3.dataset/missing bitmap overrides the inferred missing set.
(deftest simple-inference
(letfn [(inferred-equals [lhs rhs]
(let [test-col (-> (ds/->dataset [])
(assoc :testdata lhs)
(:testdata))]
(is (= (dtype/elemwise-datatype test-col)
(dtype/elemwise-datatype rhs)))
(is (every? identity (dfn/eq test-col rhs)) (vec lhs))))]
(inferred-equals [true false true false] (boolean-array [true false true false]))
(inferred-equals (list 0 Double/NaN 1.0) (double-array [0.0 Double/NaN 1.0]))
(inferred-equals #:tech.v3.dataset{:data [1 2 3 nil 4]
:force-datatype? true}
[1 2 3 nil 4])
(inferred-equals (list 0 Double/NaN 1.0 nil nil)
(double-array [0.0 Double/NaN 1.0 Double/NaN Double/NaN]))
(is (= #{2 4}
(set (ds/missing (-> (ds/->dataset [])
(assoc :test-data [1 2 nil 3 nil]))))))
(is (= #{2 4}
(set (ds/missing
(-> (ds/->dataset [])
(assoc :test-data #:tech.v3.dataset{:data [1 2 nil 3 nil]
:force-datatype? true}))))))
(is
(= #{}
(set (ds/missing
(-> (ds/->dataset [])
(assoc :test-data #:tech.v3.dataset{:data [1 2 nil 3 nil]
:force-datatype? true
:missing (bitmap/->bitmap)}))))))
(is
(= #{}
(set (ds/missing
(-> (ds/->dataset [])
(assoc :test-data #:tech.v3.dataset{:data [1 2 nil 3 nil]
:missing (bitmap/->bitmap)}))))))
))
+401
View File
@@ -0,0 +1,401 @@
(ns tech.v3.dataset.join-test
(:require [tech.v3.dataset :as ds]
[tech.v3.dataset.join :as ds-join]
[tech.v3.dataset.column :as ds-col]
[tech.v3.datatype :as dtype]
[tech.v3.datatype.packing :as packing]
[tech.v3.datatype.functional :as dfn]
[tech.v3.datatype.datetime :as dtype-dt]
[clojure.test :refer [deftest is testing]])
(:import [java.time LocalDate]))
;; hash-join basics: inner results line up across keys, and the
;; lhs-missing / rhs-missing index sets report unmatched row indices
;; when the corresponding option flag is set.
(deftest simple-join-test
(let [lhs (ds/->dataset {:a (range 10)
:b (range 10)})
rhs (ds/->dataset {:a (->> (range 10)
(mapcat (partial repeat 2))
(vec))
:c (->> (range 10)
(mapcat (partial repeat 2))
(vec))})
{:keys [inner rhs-missing]} (ds-join/hash-join :a lhs rhs)]
(is (dfn/equals (inner :a) (inner :b)))
(is (dfn/equals (inner :b) (inner :c)))
(is (empty? (seq rhs-missing))))
(let [lhs (ds/->dataset {:a (range 10)
:b (range 10)})
rhs (ds/->dataset {:a (->> (range 15)
(mapcat (partial repeat 2))
(vec))
:c (->> (range 15)
(mapcat (partial repeat 2))
(vec))})
{:keys [inner rhs-missing]} (ds-join/hash-join [:b :c] lhs rhs
{:rhs-missing? true})]
(is (dfn/equals (inner :a) (inner :b)))
(is (dfn/equals (inner :b) (inner :right.a)))
(is (= [20 21 22 23 24 25 26 27 28 29] (vec rhs-missing))))
(let [lhs (ds/->dataset {:a (range 15)
:b (range 15)})
rhs (ds/->dataset {:a (->> (range 10)
(mapcat (partial repeat 2))
(vec))
:c (->> (range 10)
(mapcat (partial repeat 2))
(vec))})
{:keys [inner lhs-missing]} (ds-join/hash-join :a lhs rhs
{:lhs-missing? true})]
(is (dfn/equals (inner :a) (inner :b)))
(is (dfn/equals (inner :b) (inner :c)))
(is (= [10 11 12 13 14] (vec lhs-missing)))))
;; Fixture: customers table from the w3schools SQL-join examples;
;; PostalCode is forced to :int16 to exercise short-typed join keys.
(defn lhs-customer-db
[]
(ds/->dataset [{"CustomerID" 1,
"CustomerName" "Alfreds Futterkiste",
"ContactName" "Maria Anders",
"Address" "Obere Str. 57",
"City" "Berlin",
"PostalCode" 12209,
"Country" "Germany"}
{"CustomerID" 2,
"CustomerName" "Ana Trujillo Emparedados y helados",
"ContactName" "Ana Trujillo",
"Address" "Avda. de la Constitución 2222",
"City" "México D.F.",
"PostalCode" 5021,
"Country" "Mexico"}
{"CustomerID" 3,
"CustomerName" "Antonio Moreno Taquería",
"ContactName" "Antonio Moreno",
"Address" "Mataderos 2312",
"City" "México D.F.",
"PostalCode" 5023,
"Country" "Mexico"}]
{:parser-fn {"PostalCode" :int16}}))
;; Fixture: matching orders table; only CustomerID 2 overlaps with the
;; customers above, the other orders reference unknown customers.
(defn rhs-customer-db
[]
(ds/->dataset [{"OrderID" 10308,
"CustomerID" 2,
"EmployeeID" 7,
"OrderDate" "1996-09-18",
"ShipperID" 3}
{"OrderID" 10309,
"CustomerID" 37,
"EmployeeID" 3,
"OrderDate" "1996-09-19",
"ShipperID" 1}
{"OrderID" 10310,
"CustomerID" 77,
"EmployeeID" 8,
"OrderDate" "1996-09-20",
"ShipperID" 2}]
{:parser-fn {"OrderID" :int16
"CustomerID" :int16
"EmployeeID" :int16
"ShipperID" :int16}}))
;; Join metadata must record one entry per source column on each side.
(deftest inner-join-test
(let [lhs (lhs-customer-db)
rhs (rhs-customer-db)
join-data (ds-join/inner-join "CustomerID" lhs rhs)
lhs-colname-map (:left-column-names (meta join-data))
rhs-colname-map (:right-column-names (meta join-data))]
(is (= (count lhs-colname-map)
(ds/column-count lhs)))
(is (= (count rhs-colname-map)
(ds/column-count rhs)))))
;;sample from https://www.w3schools.com/sql/sql_join_left.asp
;; Left join: the one matched customer is fully realized; all other rows
;; carry empty/missing RHS entries (-32768 is the :int16 missing marker).
(deftest left-join-test
(let [lhs (lhs-customer-db)
rhs (rhs-customer-db)
join-data (ds-join/left-join "CustomerID" lhs rhs)
recs (ds/mapseq-reader join-data)
empty-int? #{-32768}
empty-string? #{""}
empty-val? #(or (empty-int? %) (empty-string? %)
(nil? %))
realized (some #(when (= (get % "CustomerID") 2) %) recs)
unrealized (filter #(not= % realized) recs)
lhs-colname-map (:left-column-names (meta join-data))
rhs-colname-map (:right-column-names (meta join-data))]
(is (every? (complement empty-val?) (vals realized))
"Ana's record should be fully realized.")
(is (every? identity
(for [{:strs [OrderID OrderDate ShipperID]}
unrealized]
;;We can't do order date because they are dates
(every? empty-val? [OrderID ShipperID])))
"Everyone else should have missing entries from RHS.")
(is (= (count lhs-colname-map)
(ds/column-count lhs)))
(is (= (count rhs-colname-map)
(ds/column-count rhs)))))
;; Right join: all three orders survive; unmatched rows have missing
;; customer columns at indices 1 and 2.
(deftest right-join-test
(let [lhs (lhs-customer-db)
rhs (rhs-customer-db)
join-data (ds-join/right-join "CustomerID" lhs rhs)
lhs-colname-map (:left-column-names (meta join-data))
rhs-colname-map (:right-column-names (meta join-data))]
(is (= #{2 37 77} (set (join-data "right.CustomerID"))))
(is (= #{"Ana Trujillo" nil} (set (join-data "ContactName"))))
(is (= #{5021 nil} (set (map #(when % (int %)) (join-data "PostalCode")))))
(is (= #{1 2} (set (ds-col/missing (join-data "ContactName")))))
(is (= #{1 2} (set (ds-col/missing (join-data "PostalCode")))))
(is (= (count lhs-colname-map)
(ds/column-count lhs)))
(is (= (count rhs-colname-map)
(ds/column-count rhs)))))
;; Self-join must disambiguate duplicate column names (no collisions).
(deftest duplicate-column-test
(let [test-ds (ds/->dataset "test/data/ames-house-prices/train.csv"
{:column-whitelist ["SalePrice" "1stFlrSF" "2ndFlrSF"]
:n-records 5
:parser-fn {:SalePrice :float32}})
jt (ds-join/inner-join "1stFlrSF" test-ds test-ds)]
(is (= (ds/column-count jt)
(count (distinct (ds/column-names jt))))))
(let [test-ds (ds/->dataset "test/data/ames-house-prices/train.csv"
{:column-whitelist ["SalePrice" "1stFlrSF" "2ndFlrSF"]
:n-records 5
:parser-fn {:SalePrice :float32}})
jt (ds-join/inner-join ["1stFlrSF" "2ndFlrSF"] test-ds test-ds)]
(is (= (ds/column-count jt)
(count (distinct (ds/column-names jt)))))))
;; Tuple-valued column names (e.g. [:a :b]) must not produce nil names
;; in the joined result.
(deftest join-tuple-cname
(let [DS (ds/->dataset [{:a 11 [:a :b] 2}])
lj (ds-join/left-join :a DS DS)
rj (ds-join/right-join :a DS DS)
ljt (ds-join/left-join [[:a :b][:a :b]] DS DS)]
;;no nil column names
(is (every? identity (ds/column-names lj)))
(is (every? identity (ds/column-names rj)))
(is (every? identity (ds/column-names ljt)))))
(defn- drop-missing
  "Return `ds` with every row containing a missing value removed."
  [ds]
  (let [missing-rows (ds/missing ds)]
    (ds/drop-rows ds missing-rows)))
;; As-of joins with :< / :<= over plain ints, then over local dates
;; (packed columns are unpacked before comparison).
(deftest asof-lt
(let [ds-a (ds/->dataset {:a (range 10)})
ds-b (ds/->dataset {:a (dfn/* 2 (range 10))})
ds-bm (ds/->dataset {:a (dfn/- (dfn/* 2 (range 10)) 5)})
ds-bmm (ds/->dataset {:a (dfn/- (dfn/* 2 (range 10)) 14)})]
(is (= [2 2 4 4 6 6 8 8 10 10]
(vec ((ds-join/left-join-asof :a ds-a ds-b {:asof-op :<}) :right.a))))
(is (= [0 2 2 4 4 6 6 8 8 10]
(vec ((ds-join/left-join-asof :a ds-a ds-b {:asof-op :<=}) :right.a))))
(is (= [1 3 3 5 5 7 7 9 9 11]
(vec ((ds-join/left-join-asof :a ds-a ds-bm {:asof-op :<}) :right.a))))
(is (= [2 2 4 4 nil nil nil nil nil nil]
(vec ((ds-join/left-join-asof :a ds-a ds-bmm {:asof-op :<}) :right.a)))))
(let [cur-date (dtype-dt/local-date)
date-fn #(when %
(dtype-dt/plus-temporal-amount cur-date % :days))
ds-a (ds/->dataset {:a (date-fn (range 10))})
ds-b (ds/->dataset {:a (date-fn (dfn/* 2 (range 10)))})
ds-bm (ds/->dataset {:a (date-fn (dfn/- (dfn/* 2 (range 10)) 5))})
ds-bmm (ds/->dataset {:a (date-fn (dfn/- (dfn/* 2 (range 10)) 14))})]
(is (= (vec (date-fn [2 2 4 4 6 6 8 8 10 10]))
(vec (packing/unpack
((ds-join/left-join-asof :a ds-a ds-b {:asof-op :<}) :right.a)))))
(is (= (date-fn [0 2 2 4 4 6 6 8 8 10])
(vec (packing/unpack
((ds-join/left-join-asof :a ds-a ds-b {:asof-op :<=}) :right.a)))))
(is (= (date-fn [1 3 3 5 5 7 7 9 9 11])
(vec (packing/unpack
((ds-join/left-join-asof :a ds-a ds-bm {:asof-op :<}) :right.a)))))
(is (= (date-fn [2 2 4 4])
(vec (packing/unpack
((drop-missing (ds-join/left-join-asof
:a ds-a ds-bmm {:asof-op :<}))
:right.a)))))))
;; As-of joins with :> / :>=.
(deftest asof-gt
(let [ds-a (ds/->dataset {:a (range 10)})
ds-b (ds/->dataset {:a (dfn/* 2 (range 10))})
ds-bm (ds/->dataset {:a (dfn/- (dfn/* 2 (range 10)) 5)})
ds-bmm (ds/->dataset {:a (dfn/- (dfn/* 2 (range 10)) 14)})]
(is (= [nil 0 0 2 2 4 4 6 6 8]
(vec ((ds-join/left-join-asof :a ds-a ds-b {:asof-op :>}) :right.a))))
(is (= [0 0 2 2 4 4 6 6 8 8]
(vec ((ds-join/left-join-asof :a ds-a ds-b {:asof-op :>=}) :right.a))))
(is (= [-1 -1 1 1 3 3 5 5 7 7]
(vec ((ds-join/left-join-asof :a ds-a ds-bm {:asof-op :>}) :right.a))))
(is (= [-2 0 0 2 2 4 4 4 4 4]
(vec ((ds-join/left-join-asof :a ds-a ds-bmm {:asof-op :>}) :right.a))))))
;; As-of join with :nearest matches each left value to the closest right
;; value regardless of direction.
(deftest asof-nearest
(let [ds-a (ds/->dataset {:a (range 10)})
ds-b (ds/->dataset {:a (dfn/* 3 (range 10))})
ds-bm (ds/->dataset {:a (dfn/- (dfn/* 3 (range 10)) 5)})
ds-bmm (ds/->dataset {:a (dfn/- (dfn/* 3 (range 10)) 20)})]
(is (= [0 0 3 3 3 6 6 6 9 9]
(vec ((ds-join/left-join-asof :a ds-a ds-b {:asof-op :nearest})
:right.a))))
(is (= [1 1 1 4 4 4 7 7 7 10]
(vec ((ds-join/left-join-asof :a ds-a ds-bm {:asof-op :nearest})
:right.a))))
(is (= [1 1 1 4 4 4 7 7 7 7]
(vec ((ds-join/left-join-asof :a ds-a ds-bmm {:asof-op :nearest})
:right.a))))))
;; pandas-style merge: :outer, :left, :right, :inner and :cross behaviors
;; over a shared two-column key.
(deftest pd-merge
(let [ds-a (ds/->dataset {:a [:a :b :b :a :c]
:b (range 5)
:c (range 5)})
ds-b (ds/->dataset {:a [:a :b :a :b :d]
:b (range 5)
:c (range 6 11)})]
(is (= [0 1 2 3 4 nil nil nil]
(vec ((ds-join/pd-merge ds-a ds-b {:on [:a :b] :how :outer}) :c))))
(is (= [6 7 nil nil nil]
(vec ((ds-join/pd-merge ds-a ds-b {:on [:a :b] :how :left}) :right.c))))
(is (= [0 1 nil nil nil]
(vec ((ds-join/pd-merge ds-a ds-b {:on [:a :b] :how :right}) :left.c))))
(is (= [6 7]
(vec ((ds-join/pd-merge ds-a ds-b {:on [:a :b] :how :inner}) :right.c))))
(is (= [6 7 8 9 10 6 7 8 9 10 6 7 8 9 10 6 7 8 9 10 6 7 8 9 10]
(vec ((ds-join/pd-merge ds-a ds-b {:how :cross}) :right.c))))))
(deftest double-join
  ;; Left join where only some left rows match: unmatched right-hand
  ;; columns fill with nil while left-hand columns stay intact.
  (let [a (ds/->dataset [{:name "a" :a 1.0 :b 2.0}
                         {:name "b" :a 1.0 :b 2.0}
                         {:name "c" :a 1.0 :b 2.0}])
        b (ds/->dataset [{:name "a" :c 1.0}
                         {:name "b" :c 1.0}])
        ;; Compute the join once rather than re-running it for every
        ;; assertion; the original also ended with a dangling
        ;; (ds-join/left-join :name a b) whose result was discarded —
        ;; a REPL leftover, removed here.
        joined (ds-join/left-join :name a b)]
    (is (= [1.0 1.0 nil]
           (vec (joined :c))))
    (is (= ["a" "b" nil]
           (vec (joined :right.name))))
    (is (= [2.0 2.0 2.0]
           (vec (joined :b))))
    (is (= [1.0 1.0 1.0]
           (vec (joined :a))))
    (is (= ["a" "b" "c"]
           (vec (joined :name))))))
;; Narrowing the key column's int width must not change join results.
(deftest eraderna-left-join
(testing "Changing the type of int shouldn't break the join"
(let [a (-> (ds/->dataset [{:y 2022}]))
a' (-> a
(ds/column-cast :y :int16))
b (ds/->dataset [{:y 2022 :s "2022"}
{:y 2023 :s "2023"}])]
(is (=
((ds-join/left-join :y a b) :s)
((ds-join/left-join :y a' b) :s))))))
;; Cross join produces the full cartesian product in row-major order.
(deftest cross-join
(let [res (ds-join/pd-merge
(ds/->dataset {:a [1 2 3] :b [4 5 6]})
(ds/->dataset {:c [:a :b :c] :d [:x :y :z]})
{:how :cross})]
(is (= [1 1 1 2 2 2 3 3 3]
(res :a)))
(is (= [:a :b :c :a :b :c :a :b :c]
(res :c)))))
;; Outer merge with no overlapping keys keeps rows from both sides.
(deftest pd-merge-issue-302
(let [res (ds-join/pd-merge (ds/->dataset {:id ["a" "b"]
:x [1 2]})
(ds/->dataset {:id ["c"]
:y [3]})
{:on [:id] :how :outer})]
(is (= [nil nil 3] (vec (:y res))))))
;; Left join on LocalDate keys: matched row carries the right columns,
;; unmatched rows carry only the left columns.
(deftest left-join-dates
(is (= [{:a (LocalDate/of 2022 12 20)
:b 4,
:right.a (LocalDate/of 2022 12 20)
:c 5}
{:a (LocalDate/of 2022 12 28)
:b 3}
{:a (LocalDate/of 2022 12 30)
:b 4}]
(vec (ds/rows
(tech.v3.dataset.join/left-join
:a
(ds/->dataset [{:a (LocalDate/of 2022 12 28) :b 3}
{:a (LocalDate/of 2022 12 30) :b 4}
{:a (LocalDate/of 2022 12 20) :b 4}])
(ds/->dataset [{:a (LocalDate/of 2022 12 20) :c 5}
{:a (LocalDate/of 2022 10 20) :c 6}
{:a (LocalDate/of 2022 11 20) :c 7}])))))))
;; Character-typed join keys: \1 has no match on the right, so exactly
;; one row has a missing entry.
(deftest issue-361
(let [ds1 (ds/->dataset {:a '(\1 \2 \3 \4 \5 \6 \7 \8 \9)})
ds2 (ds/->dataset {:a '(\0 \9 \8 \7 \6 \5 \4 \3 \2)})
jds (ds-join/left-join :a ds1 ds2)]
(is (= 9 (ds/row-count jds)))
(is (= 1 (dtype/ecount (ds/missing jds))))))
;; nil join keys should match each other (nil = nil on the key column).
(deftest issue-377
(let [j (ds-join/left-join :a
(ds/->dataset {:a [nil 2]
:b [3 4]})
(ds/->dataset {:a [nil 4]
:b [6 7]}))]
(is (= [6 nil] (vec (j :right.b))))))
;; Mixed short/long key values must still hash-match across datasets.
(deftest short-types
(let [lds (ds/->dataset [{:i "foo" :y (short 2022)}])
rds (ds/->dataset [{:i "foo" :y 2022 :s "2022"}
{:i "foo" :y 2023 :s "2023"}])
jds (ds-join/pd-merge lds rds {:on [:i :y]})]
(is (= {:i "foo" :y 2022 :s "2022"}
(ds/row-at jds 0))))
(is (= 1 (ds/row-count
(ds-join/left-join :z
(ds/->dataset [{:z ["foo" (short 2022)]}])
(ds/->dataset [{:z ["foo" (long 2022)] :s "2022"}
{:z ["foo" (long 2023)] :s "2023"}]))))))
;; Degenerate many-to-many join: 10000 x 1000 identical keys must expand
;; to the full product without error.
(deftest issue-381
(let [make-row (fn [] {:row 1})
left (ds/->dataset (repeatedly 10000 make-row))
right (ds/->dataset (repeatedly 1000 make-row))
jds (ds-join/left-join :row left right)]
(is (= (* 10000 1000) (ds/row-count jds)))))
;; Inner merge result keeps both columns without duplication errors.
(deftest pd-merge-error
(let [ds1 (ds/->dataset {:customer ["A" "A" "A"]
:product ["A" "B" "C"]})
ds2 (ds/->dataset {:product ["B" "C"]})
mm (ds-join/pd-merge ds1 ds2 {:on :product :how :inner})]
(is (= #{:product :customer}
(set (ds/column-names mm))))))
;; Outer merge against an empty dataset must not throw, either way round.
(deftest pd-merge-issue-435
(is (ds-join/pd-merge (ds/empty-dataset)
(ds/->dataset {:t [0 1] :x [:a :b]})
{:on :t :how :outer}))
(is (ds-join/pd-merge (ds/->dataset {:t [0 1] :x [:a :b]})
(ds/empty-dataset)
{:on :t :how :outer})))
+222
View File
@@ -0,0 +1,222 @@
(ns tech.v3.dataset.mapseq-test
(:require [tech.v3.dataset :as ds]
[tech.v3.dataset.column :as ds-col]
[tech.v3.dataset.column-filters :as cf]
[tech.v3.dataset.math :as ds-math]
[tech.v3.dataset.modelling :as ds-mod]
[tech.v3.dataset.categorical :as ds-cat]
[tech.v3.dataset.test-utils :as test-utils]
[tech.v3.datatype :as dtype]
[tech.v3.datatype.functional :as dtype-fn]
[tech.v3.tensor :as dtt]
[clojure.set :as set]
[clojure.test :refer [deftest is testing]]))
;; End-to-end pipeline over the fruit map-seq fixture: drop label columns,
;; encode categoricals, min-max scale the numeric columns, mark the
;; inference target, then verify column shapes, label round-trips,
;; model-type metadata, grouping, scaling ranges, concat, rolling windows,
;; sorting and take-nth.
(deftest mapseq-classification-test
(let [src-ds (test-utils/mapseq-fruit-dataset)
dataset (ds/bind-> src-ds ds
(ds/remove-columns [:fruit-subtype :fruit-label])
(ds/categorical->number cf/categorical)
(ds/update (cf/difference ds (cf/categorical ds))
#(ds-math/transform-minmax % (ds-math/fit-minmax %)))
(ds-mod/set-inference-target :fruit-name))
mapseq-ds (ds/mapseq-reader (test-utils/mapseq-fruit-dataset))
src-keys (set (keys (first mapseq-ds)))
result-keys (->> (ds/columns dataset)
(map ds-col/column-name)
(set))
non-categorical (ds/column-names
(cf/difference dataset (cf/categorical dataset)))]
;; All columns share the fixture's row count.
(is (= #{59}
(->> (ds/columns dataset)
(map dtype/ecount)
set)))
;;Column names can be keywords.
(is (= src-keys
(set (->> (ds/columns src-ds)
(map ds-col/column-name)))))
(is (= (set/difference src-keys #{:fruit-subtype :fruit-label})
result-keys))
;; Map back from values to keys for labels. For tablesaw, column values
;; are never keywords.
(is (= (mapv :fruit-name mapseq-ds)
(vec (first (vals (ds-mod/labels dataset))))))
(is (= {:fruit-name :classification}
(ds-mod/model-type dataset)))
(is (= {:fruit-name :classification,
:mass :regression,
:width :regression,
:height :regression,
:color-score :regression}
(ds-mod/model-type dataset (ds/column-names dataset))))
;;Does the post-transformation value of fruit-name map to the
;;pre-transformation value of fruit-name?
(is (= (mapv :fruit-name mapseq-ds)
(->> (ds-cat/reverse-map-categorical-xforms dataset)
(ds/mapseq-reader)
(mapv :fruit-name))))
;; group-by-column must agree with a plain clojure group-by over mapseqs.
(is (= (as-> (ds/select dataset :all (range 10)) dataset
(ds/mapseq-reader dataset)
(group-by :fruit-name dataset))
(as-> (ds/select dataset :all (range 10)) ds
(ds/group-by-column ds :fruit-name)
(map (fn [[k group-ds]]
[k (vec (ds/mapseq-reader group-ds))])
ds)
(into {} ds))))
;;forward map from input value to encoded value.
;;After ETL, column values are all doubles
(let [apple-value (get (ds-mod/inference-target-label-map dataset) :apple)]
(is (= #{:apple}
(as-> dataset ds
(ds/filter ds #(= apple-value (:fruit-name %)))
;;Use full version of ->flyweight to do reverse mapping of numeric
;;fruit name back to input label.
(ds-cat/reverse-map-categorical-xforms ds)
(ds/mapseq-reader ds)
(map :fruit-name ds)
(set ds)))))
;; Ensure range map works
(is (= (vec (repeat (count non-categorical) [-0.5 0.5]))
(->> non-categorical
(mapv (fn [colname]
(let [{col-min :min
col-max :max} (-> (ds/column dataset colname)
(ds-col/stats [:min :max]))]
[col-min col-max]))))))
;;Concatenation should work
(is (= (mapv :fruit-name
(concat mapseq-ds mapseq-ds))
(->> (-> (ds/concat dataset dataset)
(ds-cat/reverse-map-categorical-xforms)
(ds/mapseq-reader))
(mapv :fruit-name))))
(let [new-ds (ds/bind-> (ds/->dataset (map hash-map (repeat :mass) (range 20))) dataset
;;The mean should happen in double or floating point space.
(assoc :mass-avg
(dtype-fn/fixed-rolling-window
(dtype/elemwise-cast (dataset :mass) :float64)
5 dtype-fn/mean)))]
(is (= [{:mass 0, :mass-avg 0.6}
{:mass 1, :mass-avg 1.2}
{:mass 2, :mass-avg 2.0}
{:mass 3, :mass-avg 3.0}
{:mass 4, :mass-avg 4.0}
{:mass 5, :mass-avg 5.0}
{:mass 6, :mass-avg 6.0}
{:mass 7, :mass-avg 7.0}
{:mass 8, :mass-avg 8.0}
{:mass 9, :mass-avg 9.0}]
(-> (ds/select new-ds [:mass :mass-avg] (range 10))
ds/mapseq-reader)))
(let [sorted-ds (ds/sort-by-column new-ds :mass-avg >)]
(is (= [{:mass 19, :mass-avg 18.4}
{:mass 18, :mass-avg 17.8}
{:mass 17, :mass-avg 17.0}
{:mass 16, :mass-avg 16.0}
{:mass 15, :mass-avg 15.0}
{:mass 14, :mass-avg 14.0}
{:mass 13, :mass-avg 13.0}
{:mass 12, :mass-avg 12.0}
{:mass 11, :mass-avg 11.0}
{:mass 10, :mass-avg 10.0}]
(-> (ds/select sorted-ds [:mass :mass-avg] (range 10))
ds/mapseq-reader)))))
(let [nth-db (ds/take-nth src-ds 5)]
(is (= [7 12] (dtype/shape nth-db)))
(is (= [{:mass 192.0, :width 8}
{:mass 80.0, :width 5}
{:mass 166.0, :width 6}
{:mass 156.0, :width 7}
{:mass 160.0, :width 7}
{:mass 356.0, :width 9}
{:mass 158.0, :width 7}
{:mass 150.0, :width 7}
{:mass 154.0, :width 7}
{:mass 186.0, :width 7}]
(->> (-> (ds/select nth-db [:mass :width] (range 10))
ds/mapseq-reader)
(map #(update % :width int))))))))
;; One-hot encoding of the target column: verifies the generated one-hot
;; table, resulting column set, label recovery, and per-column model types.
(deftest one-hot
(testing "Testing one-hot into multiple column groups"
(let [src-ds (test-utils/mapseq-fruit-dataset)
dataset (-> src-ds
(ds/remove-columns [:fruit-subtype :fruit-label])
(ds-mod/set-inference-target :fruit-name)
(ds/categorical->one-hot [:fruit-name]))]
(is (= {:one-hot-table
{:orange :fruit-name-orange,
:mandarin :fruit-name-mandarin,
:apple :fruit-name-apple,
:lemon :fruit-name-lemon},
:src-column :fruit-name,
:result-datatype :int64}
(into {} (first (ds-cat/dataset->one-hot-maps dataset)))))
(is (= #{:mass :fruit-name-orange :fruit-name-mandarin :width :fruit-name-apple :color-score
:fruit-name-lemon :height}
(->> (ds/columns dataset)
(map ds-col/column-name)
set)))
(is (= (->> (ds/mapseq-reader src-ds)
(take 20)
(mapv :fruit-name))
(->> (first (vals (ds-mod/labels dataset)))
(take 20)
vec)))
(is (= {:color-score :regression,
:fruit-name-orange :classification,
:fruit-name-lemon :classification,
:fruit-name-mandarin :classification,
:fruit-name-apple :classification,
:height :regression
:width :regression,
:mass :regression,
}
(ds-mod/model-type dataset (ds/column-names dataset)))))))
(deftest generalized-mapseq-ds
  ;; Map values stored in a column should infer as :persistent-map.
  (let [dataset (ds/->dataset [{:a 1 :b {:a 1 :b 2}}
                               {:a 2}])
        column-dtypes (into #{}
                            (map dtype/get-datatype)
                            (vals dataset))]
    (is (= #{:int64 :persistent-map} column-dtypes))))
;; Tensor values in map-seq rows should infer as :tensor columns.
(deftest tensors-in-mapseq
(let [ds (ds/->dataset [{:a (dtt/->tensor (partition 3 (range 9)))
:b "hello"}
{:a (dtt/->tensor (partition 3 (range 9)))
:b "goodbye"}])]
(is (= #{:tensor :string}
(set (map dtype/get-datatype (vals ds)))))))
;; Local-date parsing: a nil entry becomes exactly one missing value;
;; extreme-but-valid dates still parse.
(deftest datetime-missing
(let [ds (ds/->dataset [{:d "1971-01-01"}
{:d "1970-01-01"}
{:d nil}
{:d "0001-01-01"}]
{:parser-fn {:d :local-date}})]
(is (= 1 (dtype/ecount (ds-col/missing (ds :d)))))))
+68
View File
@@ -0,0 +1,68 @@
(ns tech.v3.dataset.math-test
(:require [tech.v3.dataset :as ds]
[tech.v3.dataset.math :as ds-math]
[tech.v3.dataset.tensor :as ds-tens]
[tech.v3.datatype :as dtype]
[tech.v3.datatype.datetime :as dtype-dt]
[tech.v3.datatype.functional :as dfn]
[tech.v3.tensor :as dtt]
[clojure.test :refer [deftest is]]))
;; Loess interpolation attaches an :interpolator to the result column's
;; metadata; requires the dataset sorted by the x column first.
(deftest basic-interp
(let [interp-ds (-> (ds/->dataset "test/data/stocks.csv")
(ds/filter-column "symbol" "MSFT")
;;The interpolate requires a sorted dataset
(ds/sort-by-column "date")
(ds-math/interpolate-loess "date" "price"
{:result-name "price-loess"}))]
(is (not (nil? (:interpolator (meta (interp-ds "price-loess"))))))))
;; fill-range-replace: densify :a to max step 2, filling :b by the default
;; strategy, with nil, and with an explicit {:value 20}; final case uses
;; local-date keys with a millisecond step.
(deftest fill-range-replace
(let [ds (-> (ds/->dataset {:a [1 5 10 15 20]
:b [2 2 nil 4 8]})
(ds-math/fill-range-replace :a 2))]
(is (dfn/equals
[1.0 3.0 5.0 6.66 8.33 10.0
11.66 13.33 15.0 16.66 18.33 20.0]
(vec (ds :a))
0.1))
(is (= [2 2 2 2 2 2 2 2 4 4 4 8]
(vec (ds :b)))))
(let [ds (-> (ds/->dataset {:a [1 5 10 15 20]
:b [2 2 nil 4 8]})
(ds-math/fill-range-replace :a 2 nil))]
(is (= [2 nil 2 nil nil nil nil nil 4 nil nil 8]
(vec (ds :b)))))
(let [ds (-> (ds/->dataset {:a [1 5 10 15 20]
:b [2 2 nil 4 8]})
(ds-math/fill-range-replace :a 2 :value 20))]
(is (= [2 20 2 20 20 20 20 20 4 20 20 8]
(vec (ds :b)))))
(let [ds (-> (ds/->dataset {:a (dtype-dt/plus-temporal-amount
(dtype-dt/local-date)
[1 5 10 15 20]
:days)
:b [2 2 nil 4 8]})
(ds-math/fill-range-replace :a (* 2 dtype-dt/milliseconds-in-day)
:value 20))]
(is (= [2 20 2 20 20 20 20 20 4 20 20 8]
(vec (ds :b))))))
(comment
(def test-ds (ds/->dataset {:a [7 4 6 8 8 7 5 9 7 8]
:b [4 1 3 6 5 2 3 5 4 2]
:c [3 8 5 1 7 9 3 8 5 2]}))
(def test-data (dtt/->tensor [[10 8 6 20 9]
[11 21 23 18 4]
[12 7 5 13 19]
[ 3 14 15 22 17]
[24 1 2 0 16]] :datatype :float64))
(def test-data (dtt/transpose (dtt/->tensor [[7 4 6 8 8 7 5 9 7 8]
[4 1 3 6 5 2 3 5 4 2]
[3 8 5 1 7 9 3 8 5 2]] :datatype :float64)
[1 0]))
)
+37
View File
@@ -0,0 +1,37 @@
(ns tech.v3.dataset.metamorph-test
(:require [tech.v3.dataset.metamorph :as ds-mm]
[tech.v3.dataset :as ds]
[clojure.test :as t :refer [deftest is]]))
;; Ames training data shared by every test in this namespace; loaded once at
;; namespace load time.
(def df
  (ds/->dataset "test/data/ames-train.csv.gz" {:key-fn keyword}))
(deftest call-with-df-1
  ;; A metamorph op called directly on a dataset returns a context map with
  ;; the dataset under :metamorph/data.
  (let [result ((ds-mm/set-inference-target :SalePrice) df)]
    (is (= [1 2 3 4 5]
           (take 5 (:Id (:metamorph/data result)))))))
(deftest call-with-df-2
  ;; Metamorph ops compose through the context map.
  (let [ctx ((ds-mm/rename-columns {:SalePrice :sale-price :Id :id})
             {:metamorph/data df})
        ctx ((ds-mm/set-inference-target :sale-price) ctx)]
    (is (= [1 2 3 4 5]
           (take 5 (:id (:metamorph/data ctx)))))))
(deftest brief
  ;; brief summarizes each column; the first (sorted) column's :min is pinned.
  (let [sorted-df (ds/select-columns df (sort (ds/column-names df)))
        summary ((ds-mm/brief) {:metamorph/data sorted-df})]
    (is (= 334.0
           (:min (first (:metamorph/data summary)))))))
+45
View File
@@ -0,0 +1,45 @@
(ns tech.v3.dataset.modelling-test
(:require [tech.v3.dataset.modelling :as modelling]
[tech.v3.dataset :as ds]
[tech.v3.dataset.categorical :as ds-cat]
[tech.v3.dataset.test-utils :as test-utils]
[tech.v3.datatype :as dtype]
[clojure.test :refer [deftest is]]))
(deftest k-fold-sanity
  ;; 5-fold split of the 59-row fruit dataset: train/test shapes must
  ;; partition the rows across folds.
  (let [folds (modelling/k-fold-datasets (test-utils/mapseq-fruit-dataset) 5 {})
        shapes-of (fn [k] (mapv #(dtype/shape (k %)) folds))]
    (is (= 5 (count folds)))
    (is (= [[7 47] [7 47] [7 47] [7 47] [7 48]]
           (shapes-of :train-ds)))
    (is (= [[7 12] [7 12] [7 12] [7 12] [7 11]]
           (shapes-of :test-ds)))))
(deftest train-test-split-sanity
  ;; Default split proportions on the fruit dataset.
  (let [{:keys [train-ds test-ds]} (modelling/train-test-split
                                    (test-utils/mapseq-fruit-dataset) {})]
    (is (= [7 41] (dtype/shape train-ds)))
    (is (= [7 18] (dtype/shape test-ds)))))
(deftest prob-dist->label-col
  ;; The argmax column of each probability-distribution row becomes the label.
  ;; Fix: the original wrapped ds/->dataset around an already-constructed
  ;; dataset; one construction suffices.
  (let [ds (ds/->dataset {:y-0 [0.0 0.5 0.3 0.1]
                          :y-1 [0.3 0.8 0.2 0.3]})
        prob-dist-ds (modelling/probability-distributions->label-column ds :y)
        label-ds (ds-cat/reverse-map-categorical-xforms prob-dist-ds)]
    (is (= [:y-1 :y-1 :y-0 :y-1]
           (label-ds :y)))))
(deftest issue-267-prob-dist-fail-on-nan-missing
  ;; Probability distributions containing NaN or a missing value must throw
  ;; rather than silently producing a label.  Uses the ns aliases (ds,
  ;; modelling) for consistency with the rest of the file instead of
  ;; fully-qualified symbols.
  (is (thrown? Throwable
               (-> (ds/->dataset {:y-0 [Double/NaN] :y-1 [0.3]})
                   (modelling/probability-distributions->label-column :y))))
  (is (thrown? Throwable
               (-> (ds/->dataset {:y-0 [nil] :y-1 [0.3]})
                   (modelling/probability-distributions->label-column :y)))))
@@ -0,0 +1,29 @@
(ns tech.v3.dataset.object-columns-test
(:require [clojure.test :refer [deftest is]]
[tech.v3.dataset :as ds]
[tech.v3.datatype :as dtype]
[tech.v3.datatype.datetime :as dtype-dt]
[tech.v3.tensor :as dtt]))
(deftest basic-object-columns
  ;; Persistent maps are stored as an object column of :persistent-map dtype.
  (let [row-maps (repeat 10 {:a 1 :b 2})
        src-ds (ds/->dataset {:a (range 10)
                              :b row-maps})
        b-col (src-ds :b)]
    (is (= :persistent-map (dtype/get-datatype b-col)))
    (is (= (vec row-maps)
           (vec (dtype/->reader b-col))))))
(deftest involved-object-columns
  ;; Mixed object inputs are detected per column: date strings stay strings,
  ;; durations pack, tensors stay objects of :tensor dtype.
  (let [src-ds (ds/->dataset
                {:dates ["2000-01-01" "2000-02-01" "2000-03-01"
                         "2000-04-01" "2000-05-01"]
                 :integers (range 5)
                 :durations (repeat 5 (dtype-dt/duration))
                 :doubles (map double (range 5))
                 :tensors (repeat 5 (dtt/->tensor (partition 2 (range 4))))})
        dtypes (into #{} (map dtype/get-datatype) (vals src-ds))]
    (is (= #{:float64 :string :int64 :tensor :packed-duration}
           dtypes))))
+528
View File
@@ -0,0 +1,528 @@
(ns tech.v3.dataset.parse-test
(:require [clojure.test :refer [deftest is]]
[tech.v3.datatype :as dtype]
[tech.v3.datatype.functional :as dfn]
[tech.v3.datatype.bitmap :as bitmap]
[tech.v3.dataset :as ds]
[tech.v3.dataset.zip :as zip]
[tech.v3.dataset.column :as ds-col]
[tech.v3.dataset.protocols :as ds-proto]
[tech.v3.dataset.io.nippy]
[tech.v3.libs.arrow :as arrow]
[tech.v3.libs.clj-transit :as ds-transit]
[taoensso.nippy :as nippy]
[clojure.set :as set]
[clojure.java.io :as io])
(:import [com.univocity.parsers.csv CsvFormat CsvParserSettings CsvParser]
[java.nio.charset StandardCharsets]))
;; The Ames house-price training csv exercises most of the type-detection
;; machinery and is used throughout this namespace.
(def test-file "test/data/ames-house-prices/train.csv")
;; Known [column-name missing-count] pairs for the Ames data, sorted by name.
(def missing-data
  (->> [{:column-name "LotFrontage", :missing-count 259}
        {:column-name "Alley", :missing-count 1369}
        {:column-name "MasVnrType", :missing-count 8}
        {:column-name "MasVnrArea", :missing-count 8}
        {:column-name "BsmtQual", :missing-count 37}
        {:column-name "BsmtCond", :missing-count 37}
        {:column-name "BsmtExposure", :missing-count 38}
        {:column-name "BsmtFinType1", :missing-count 37}
        {:column-name "BsmtFinType2", :missing-count 38}
        {:column-name "Electrical", :missing-count 1}
        {:column-name "FireplaceQu", :missing-count 690}
        {:column-name "GarageType", :missing-count 81}
        {:column-name "GarageYrBlt", :missing-count 81}
        {:column-name "GarageFinish", :missing-count 81}
        {:column-name "GarageQual", :missing-count 81}
        {:column-name "GarageCond", :missing-count 81}
        {:column-name "PoolQC", :missing-count 1453}
        {:column-name "Fence", :missing-count 1179}
        {:column-name "MiscFeature", :missing-count 1406}
        ]
       (map (juxt :column-name :missing-count))
       (sort-by first)))
;; Expected auto-detected [name datatype] for every Ames column.
(def datatype-answers
  [["1stFlrSF" :int16]
   ["2ndFlrSF" :int16]
   ["3SsnPorch" :int16]
   ["Alley" :string]
   ["BedroomAbvGr" :int16]
   ["BldgType" :string]
   ["BsmtCond" :string]
   ["BsmtExposure" :string]
   ["BsmtFinSF1" :int16]
   ["BsmtFinSF2" :int16]
   ["BsmtFinType1" :string]
   ["BsmtFinType2" :string]
   ["BsmtFullBath" :int16]
   ["BsmtHalfBath" :int16]
   ["BsmtQual" :string]
   ["BsmtUnfSF" :int16]
   ["CentralAir" :string]
   ["Condition1" :string]
   ["Condition2" :string]
   ["Electrical" :string]
   ["EnclosedPorch" :int16]
   ["ExterCond" :string]
   ["ExterQual" :string]
   ["Exterior1st" :string]
   ["Exterior2nd" :string]
   ["Fence" :string]
   ["FireplaceQu" :string]
   ["Fireplaces" :int16]
   ["Foundation" :string]
   ["FullBath" :int16]
   ["Functional" :string]
   ["GarageArea" :int16]
   ["GarageCars" :int16]
   ["GarageCond" :string]
   ["GarageFinish" :string]
   ["GarageQual" :string]
   ["GarageType" :string]
   ["GarageYrBlt" :int16]
   ["GrLivArea" :int16]
   ["HalfBath" :int16]
   ["Heating" :string]
   ["HeatingQC" :string]
   ["HouseStyle" :string]
   ["Id" :int16]
   ["KitchenAbvGr" :int16]
   ["KitchenQual" :string]
   ["LandContour" :string]
   ["LandSlope" :string]
   ["LotArea" :int32]
   ["LotConfig" :string]
   ["LotFrontage" :int16]
   ["LotShape" :string]
   ["LowQualFinSF" :int16]
   ["MSSubClass" :int16]
   ["MSZoning" :string]
   ["MasVnrArea" :int16]
   ["MasVnrType" :string]
   ["MiscFeature" :string]
   ["MiscVal" :int16]
   ["MoSold" :int16]
   ["Neighborhood" :string]
   ["OpenPorchSF" :int16]
   ["OverallCond" :int16]
   ["OverallQual" :int16]
   ["PavedDrive" :string]
   ["PoolArea" :int16]
   ["PoolQC" :string]
   ["RoofMatl" :string]
   ["RoofStyle" :string]
   ["SaleCondition" :string]
   ["SalePrice" :int32]
   ["SaleType" :string]
   ["ScreenPorch" :int16]
   ["Street" :string]
   ["TotRmsAbvGrd" :int16]
   ["TotalBsmtSF" :int16]
   ["Utilities" :string]
   ["WoodDeckSF" :int16]
   ["YearBuilt" :int16]
   ["YearRemodAdd" :int16]
   ["YrSold" :int16]])
(deftest base-ames-parser-test
  ;; End-to-end check of automatic type detection on the Ames data.
  (let [result (ds/->dataset test-file)
        dtypes (->> (vals result)
                    (map meta)
                    (sort-by :name)
                    (mapv (juxt :name :datatype)))]
    ;; All expected columns must be present...
    (is (= (set (map first datatype-answers))
           (set (map first dtypes))))
    ;; ...and each must have the expected detected datatype; any differences
    ;; are collected and reported in the assertion message.
    (let [dtype-map (into {} dtypes)
          differences (->> datatype-answers
                           (map (fn [[colname col-dtype]]
                                  (let [detected-dtype (dtype-map colname)]
                                    (when-not (= detected-dtype col-dtype)
                                      {:name colname
                                       :expected-datatype col-dtype
                                       :result-datatype detected-dtype}))))
                           (remove nil?)
                           seq)]
      (is (nil? differences)
          (str differences)))
    ;; Columns with missing data must match the known answer set.
    (let [result-missing-data (->> (vals result)
                                   (map (juxt ds-col/column-name
                                              (comp dtype/ecount ds-col/missing)))
                                   (remove #(= 0 (second %)))
                                   (sort-by first))]
      (is (= (set (map first missing-data))
             (set (map first result-missing-data))))))
  ;; :n-records and :column-whitelist restrict the parse.
  (let [result (ds/->dataset
                test-file
                {:n-records 100
                 :column-whitelist ["Id" "SalePrice" "YearBuilt"]})]
    (is (= 3 (count result)))
    ;;Header row accounts for one.
    (is (= 100 (ds/row-count result)))))
(deftest base-ames-load-test
  ;;Here we just test that the options correctly pass through ->dataset
  (let [options {:n-records 100
                 :column-whitelist ["Id" "SalePrice" "YearBuilt"]}
        result (ds/->dataset test-file options)]
    (is (= 3 (ds/column-count result)))
    ;;Header row accounts for one.
    (is (= 100 (ds/row-count result)))))
(deftest specify-column-types
  (let [whitelist ["1stFlrSF" "2ndFlrSF" "3SsnPorch"]
        col-dtypes (fn [ds] (into #{} (map dtype/get-datatype) (vals ds)))]
    ;; A keyword parser-fn coerces every column to that datatype.
    (let [all-float (ds/->dataset test-file
                                  {:n-records 100
                                   :column-whitelist whitelist
                                   :parser-fn :float32})]
      (is (= #{:float32} (col-dtypes all-float)))
      (is (= 3 (ds/column-count all-float))))
    ;; A map parser-fn overrides the datatype per column name; unlisted
    ;; columns keep their detected type.
    (let [mixed (ds/->dataset test-file
                              {:n-records 100
                               :column-whitelist whitelist
                               :parser-fn {"1stFlrSF" :float32
                                           "2ndFlrSF" :int32}})]
      (is (= #{:float32 :int32 :int16} (col-dtypes mixed))))))
(deftest semi-colon-delimited-file
  ;; A custom separator character is honored by the csv parser.
  (let [parsed (ds/->dataset "test/data/sample01.csv" {:separator \;})]
    (is (= 3 (ds/column-count parsed)))))
(deftest tough-file
  ;; Skipping the leading row and tolerating bad rows still yields all columns.
  (let [parsed (ds/->dataset "test/data/essential.csv"
                             {:n-initial-skip-rows 1
                              :skip-bad-rows? true})]
    (is (= 5 (ds/column-count parsed)))))
(defn- make-essential-csv-parser
  "Build a univocity CsvParser configured for the essential.csv fixture:
  newline line separator, header extraction enabled, and leading/trailing
  whitespace trimmed."
  []
  (-> (doto (CsvParserSettings.)
        (.. getFormat (setLineSeparator "\n"))
        (.setHeaderExtractionEnabled true)
        (.setIgnoreLeadingWhitespaces true)
        (.setIgnoreTrailingWhitespaces true))
      (CsvParser.)))
(deftest custom-csv-parser
  ;; A user-supplied univocity parser can be passed straight through.
  (let [parsed (ds/->dataset "test/data/essential.csv"
                             {:csv-parser (make-essential-csv-parser)
                              :skip-bad-rows? true})]
    (is (= 5 (ds/column-count parsed)))))
(deftest simple-write-test
  ;; Round-trip a dataset through write!/->dataset and verify both the data
  ;; and explicitly-set missing indexes survive.  Fixes: the temp file is now
  ;; deleted afterwards (consistent with the other write tests here) and the
  ;; first load uses :n-records like the rest of the namespace.
  (try
    (let [initial-ds (ds/->dataset
                      test-file
                      {:n-records 20
                       :column-whitelist ["1stFlrSF" "2ndFlrSF" "3SsnPorch"]})
          _ (ds/write! initial-ds "test.tsv")
          new-ds (ds/->dataset "test.tsv")]
      (is (dfn/equals (initial-ds "1stFlrSF")
                      (new-ds "1stFlrSF")))
      (is (dfn/equals (initial-ds "2ndFlrSF")
                      (new-ds "2ndFlrSF"))))
    ;; Columns may also be whitelisted by index; missing values round-trip.
    (let [missing-ds (-> (ds/->dataset
                          test-file
                          {:n-records 20
                           :column-whitelist [43 44 69]})
                         (ds/update-column
                          "1stFlrSF"
                          #(ds-col/set-missing % [2 4 7 9])))
          _ (ds/write! missing-ds "test.tsv")
          new-ds (ds/->dataset "test.tsv")]
      (is (dfn/equals (missing-ds "1stFlrSF")
                      (new-ds "1stFlrSF")))
      (is (= #{2 4 7 9}
             (set (ds-col/missing (new-ds "1stFlrSF"))))))
    (finally
      (.delete (java.io.File. "test.tsv")))))
(deftest date-time-format-test-1
  ;; Automatic detection picks packed/zoned datetime types; an explicit
  ;; parser-fn overrides the detected (packed) representation.
  (let [stock-ds (ds/->dataset "test/data/stocks.csv")
        temp-ds (ds/->dataset "test/data/seattle-temps.csv")
        typed-ds (ds/->dataset "test/data/stocks.csv"
                               {:parser-fn {"date" :local-date}})]
    (is (= :packed-local-date (dtype/get-datatype (stock-ds "date"))))
    (is (= :zoned-date-time (dtype/get-datatype (temp-ds "date"))))
    (is (= :local-date (dtype/get-datatype (typed-ds "date"))))))
(deftest custom-reader
  ;; ->dataset accepts an arbitrary java.io.Reader when :file-type is given.
  (let [parsed (ds/->dataset (io/reader "test/data/stocks.csv")
                             {:file-type :csv})]
    (is (= 560 (ds/row-count parsed)))))
(defn verify-relaxed-parse
  "Shared check that a relaxed parse produced a packed-local-date \"date\"
  column and preserved the unparseable source strings in the column metadata.
  Fix: dropped the ^List/^RoaringBitmap hints, which referenced classes not
  imported by this namespace, along with the unused unparsed-indexes binding."
  [ds]
  (let [date-col (ds "date")
        {:keys [unparsed-data]} (meta date-col)]
    (is (= :packed-local-date (dtype/get-datatype date-col)))
    ;;Make sure unparsed data came through intact
    (is (= #{"hello" "1212"}
           (set unparsed-data)))))
(deftest bad-csv-relaxed-1
  (let [ds (ds/->dataset "test/data/stocks-bad-date.csv")]
    ;; With no parser hint, a column containing junk values stays a string.
    (is (= :string (dtype/get-datatype (ds "date"))))
    ;; The junk values themselves survive the initial parse.
    (is (= #{"hello" "1212"}
           (set/intersection #{"hello" "1212"}
                             (set (ds-col/unique (ds "date"))))))
    ;; Re-parsing the column in relaxed mode recovers the dates and stashes
    ;; the junk in metadata.
    (let [updated-ds (ds/update-column
                      ds "date"
                      #(ds-col/parse-column [:packed-local-date :relaxed?] %))]
      (verify-relaxed-parse updated-ds))))
(deftest bad-csv-relaxed-2
  ;; Relaxed parsing can also be requested directly at load time.
  (verify-relaxed-parse
   (ds/->dataset "test/data/stocks-bad-date.csv"
                 {:parser-fn {"date" [:packed-local-date :relaxed?]}})))
(deftest csv-keyword-colnames
  ;; :key-fn applies to every parsed column name.
  (is (every? keyword?
              (-> (ds/->dataset "test/data/stocks.csv" {:key-fn keyword})
                  (ds/column-names)))))
(deftest parse-empty-column-name
  ;; An empty header cell is auto-named column-N.
  (is (= #{"column-0" "Urban Female" "Urban Male" "Rural Female" "Rural Male"}
         (-> (ds/->dataset "test/data/rcsv.csv")
             (ds/column-names)
             (set)))))
(deftest parse-ip-addrs-as-string
  ;; Dotted quads must not be mis-detected as numeric data.
  (let [addrs (ds/->dataset "test/data/ip-addrs.csv")]
    (is (= :string (dtype/get-datatype (addrs "ip"))))))
;; Sample binary-format fixtures for the (currently disabled) parquet test.
(def arrow-file "test/data/iris.feather")
(def parquet-file "test/data/parquet/userdata1.parquet")
;;We will get back to this one. Potentially there are good ways into this
;;via arrow.
#_(deftest parse-parquet
  (let [ds (ds/->dataset parquet-file)]
    (is (= 13 (ds/column-count ds)))
    (is (= 1000 (ds/row-count ds)))
    (is (= #{:local-date-time :float64 :int32 :string}
           (->> (map dtype/get-datatype (vals ds))
                set)))))
(deftest parse-ragged
  ;; Rows of differing widths pad out to the widest row with missing values.
  (let [ds (ds/->dataset "test/data/ragged.csv"
                         {:header-row? false
                          :key-fn keyword})]
    (is (= [:column-0 :column-1 :column-2 :column-3 :column-4 :column-5
            :column-6 :column-7 :column-8 :column-9 :column-10 :column-11]
           (vec (ds/column-names ds))))
    (is (= 12 (ds/column-count ds)))
    ;; A full-width row parses completely...
    (is (= [4 24 31 33 65 67 68 71 75 76 93 97]
           (vec ((ds/value-reader ds) 4))))
    ;; ...while short rows are nil-padded on the right.
    (is (= [10 33 51 66 67 84 nil nil nil nil nil nil]
           (vec ((ds/value-reader ds) 10))))))
(deftest parse-small-doubles
  ;; Very small doubles must not collapse to 0.0 during parsing.
  (let [parsed (ds/->dataset "test/data/double_parse_test.csv")]
    (is (= 197 (count (remove #(= 0.0 %) (parsed "pvalue")))))))
(deftest string-separators
  ;; A single-character string separator is accepted...
  (let [parsed (ds/->dataset "test/data/double_parse_test.csv" {:separator ","})]
    (is (= 197 (count (remove #(= 0.0 %) (parsed "pvalue"))))))
  ;; ...while a multi-character one is rejected.
  (is (thrown? Throwable (ds/->dataset "test/data/double_parse_test.csv"
                                       {:separator ",n"}))))
(deftest quoted-column-data
  ;; :quote? forces quoting of every written cell, including the header.
  (try
    (ds/write! (ds/->dataset [{:a "onelongstring"}]) "quoted.csv" {:quote? true})
    (is (= "\"a\"\n\"onelongstring\"\n"
           (slurp "quoted.csv")))
    (finally
      (.delete (java.io.File. "quoted.csv")))))
(deftest text-data
  ;; The :text datatype (including a trailing missing row) must survive csv,
  ;; nippy and arrow round trips.
  (try
    (let [ds (ds/->dataset [{:a "onestring"}
                            {:a "anotherstring"}
                            {}]
                           {:parser-fn :text})
          _ (is (= :text (-> (ds :a) meta :datatype)))
          _ (ds/write! ds "text.csv")
          _ (ds/write! ds "text.nippy")
          csv-ds (ds/->dataset "text.csv" {:parser-fn {"a" :text}
                                           :key-fn keyword})
          _ (is (= :text (-> (csv-ds :a) meta :datatype)))
          ;;_ (is (= 3 (ds/row-count csv-ds)))
          nippy-ds (ds/->dataset "text.nippy")
          _ (is (= :text (-> (nippy-ds :a) meta :datatype)))
          _ (is (= 3 (ds/row-count nippy-ds)))
          _ (arrow/write-dataset-to-stream! ds "text.arrow")
          ds-copy (arrow/read-stream-dataset-copying "text.arrow" {:key-fn keyword})
          _ (is (= :text (-> (ds-copy :a) meta :datatype)))
          ;; BUG FIX: the arrow branches previously re-asserted nippy-ds's
          ;; row count instead of checking their own datasets.
          _ (is (= 3 (ds/row-count ds-copy)))
          ds-inplace (arrow/read-stream-dataset-inplace "text.arrow")]
      (is (= :text (-> (ds-inplace "a") meta :datatype)))
      (is (= 3 (ds/row-count ds-inplace))))
    (finally
      (.delete (java.io.File. "text.csv"))
      (.delete (java.io.File. "text.nippy"))
      (.delete (java.io.File. "text.arrow")))))
(deftest custom-parse-method
  ;; A [datatype parse-fn] parser may signal a missing value or a parse
  ;; failure via the ::ds/missing and ::ds/parse-failure sentinel keywords.
  (try
    (let [src-ds (ds/->dataset {:a ["1" "missing" "parse-failure" "2" "3"]})
          _ (ds/write! src-ds "custom-parse.csv")
          ds (ds/->dataset
              "custom-parse.csv"
              {:parser-fn {"a" [:int64
                                (fn [str-val]
                                  (cond
                                    (= str-val "missing")
                                    :tech.v3.dataset/missing
                                    (= str-val "parse-failure")
                                    :tech.v3.dataset/parse-failure
                                    :else
                                    (Long/parseLong str-val)))]}})]
      ;; Both sentinels leave a missing cell...
      (is (= [1 nil nil 2 3]
             (vec (ds "a"))))
      (is (= #{1 2} (set (ds/missing ds))))
      ;; ...but only parse failures are recorded in the column metadata.
      (is (= #{2}
             (set (:unparsed-indexes (meta (ds "a"))))))
      (is (= ["parse-failure"]
             (vec (:unparsed-data (meta (ds "a")))))))
    (finally
      (.delete (java.io.File. "custom-parse.csv")))))
(deftest stocks-v5
  ;; Nippy files written by dataset v5 still load with identical date data.
  (let [v5-dates (vec ((ds/->dataset "test/data/stocks-v5.nippy") "date"))
        cur-dates (vec ((ds/->dataset "test/data/stocks.csv") "date"))]
    (is (= v5-dates cur-dates))))
(deftest gzipped-input-stream-issue-247
  ;; An explicitly-gzipped input stream parses the same as a path-based load.
  (let [stream-ds (ds/->dataset (io/input-stream "test/data/ames-train.csv.gz")
                                {:file-type :csv
                                 :gzipped? true})
        path-ds (ds/->dataset "test/data/ames-train.csv.gz")]
    (is (= (ds/row-count path-ds) (ds/row-count stream-ds)))))
(deftest pokemon-csv
  ;; Bracketed list text stays one string value, not a parsed collection.
  (let [abilities ((ds/->dataset "test/data/pokemon.csv") "abilities")]
    (is (= "['Overgrow', 'Chlorophyll']" (first abilities)))))
(deftest issue-292
  (let [parsed (ds/->dataset "test/data/issue-292.csv")]
    (is (== 3 (ds/column-count parsed)))))
(deftest json-test
  ;; Round-trip through JSON: stringified dates and numeric prices survive.
  (try
    (let [src (-> (ds/->dataset "test/data/stocks.csv")
                  (ds/column-map "date" str ["date"]))]
      (ds/write! src "stocks.json")
      (let [round-tripped (ds/->dataset "stocks.json")]
        (is (= (vec (src "date")) (vec (round-tripped "date"))))
        (is (dfn/equals (src "price") (round-tripped "price")))))
    (finally
      (.delete (java.io.File. "stocks.json")))))
(deftest nippy-column
  ;; A single column freezes/thaws via nippy and remains a column.
  (let [src-col ((ds/->dataset {:a [1 2 3] :b [4 5 6]}) :a)
        thawed (nippy/thaw (nippy/freeze src-col))]
    (is (dfn/equals src-col thawed))
    (is (ds-proto/is-column? thawed))))
(deftest empty-csv
  ;; A header-only csv yields its columns with zero rows...
  (let [header-only (ds/->dataset "test/data/empty-csv-header.csv")]
    (is (= 7 (ds/column-count header-only))))
  ;; ...while a fully empty file yields an empty but valid dataset.
  (let [empty-ds (ds/->dataset "test/data/empty-csv.csv")]
    (is (zero? (ds/column-count empty-ds)))
    (is (ds/dataset? empty-ds))))
(deftest comment-char
  ;; Comment lines are dropped from the parse; the expected row count and the
  ;; equality of the last two rows pin the fixture's content.
  (let [parsed (ds/->dataset "test/data/csv-comment.csv")
        rows (ds/rows parsed)]
    (is (= 5 (ds/row-count parsed)))
    (is (= (rows -1) (rows -2)))))
(deftest issue-304
  ;; :n-initial-skip-rows shifts which row becomes the header.
  (let [parsed (ds/->dataset "test/data/issue-292.csv" {:n-initial-skip-rows 10})]
    (is (= 11 (first (parsed "10"))))))
(deftest issue-362
  ;; Each entry of a zip archive becomes its own dataset.
  (is (= 2 (count (zip/zipfile->dataset-seq "test/data/unknown.zip")))))
(deftest issue-388-transit-support
  ;; Datasets round-trip through the transit string encoding.
  (let [src (ds/->dataset {:a [1 2 3]
                           :b [:one :two :three]})
        round-tripped (-> src
                          ds-transit/dataset->transit-str
                          ds-transit/transit-str->dataset)]
    (is (= (src :a) (round-tripped :a)))
    (is (= (src :b) (round-tripped :b)))))
(deftest issue-434-transit-support
  (let [src (ds/->dataset {:a [1 2 3]
                           :b [:one :two :three]
                           ;;transit encoding is milli instants
                           :c (dtype/make-container :packed-milli-instant
                                                    [(java.time.Instant/now)
                                                     (java.time.Instant/now)])})
        round-tripped (-> src
                          ds-transit/dataset->transit-str
                          ds-transit/transit-str->dataset)]
    (is (= (src :a) (round-tripped :a)))
    (is (= (src :b) (round-tripped :b)))
    (is (= (src :c) (round-tripped :c)))))
(deftest issue-414-json-parser-fn
  ;; :parser-fn applies to json input as well as csv.
  (let [parsed (ds/->dataset "test/data/local_date.json"
                             {:parser-fn {:time-period :local-date}})]
    (is (= [1 2 3] (parsed "test")))))
(deftest dataset-parser-clear-packed-column
  ;; ds-clear must reset packed (datetime) columns so rows added afterwards
  ;; start from an empty parser.
  (let [parser (ds/dataset-parser)]
    (ds-proto/add-row parser {:date (java.time.Instant/now)})
    (ds-proto/ds-clear parser)
    (ds-proto/add-row parser {:date (java.time.Instant/now)})
    (is (= 1 (count (@parser :date))))))
+13
View File
@@ -0,0 +1,13 @@
(ns tech.v3.dataset.parser-test
(:require [tech.v3.dataset :as ds]
[tech.v3.dataset.protocols :as ds-proto]
[clojure.test :refer [deftest is]]))
(deftest all-missing-ds
  ;; A parser fed only an empty row dereferences to a marker map carrying the
  ;; row count and an :all missing marker, not a dataset.
  (let [p (ds/dataset-parser)
        _ (ds-proto/add-row p {})
        ds @p]
    (is (not (ds/dataset? ds)))
    (is (= 1 (:tech.v3.dataset/row-count ds)))
    ;; BUG FIX: this check was previously the message argument of the `is`
    ;; above and therefore never asserted.
    (is (= :all (:tech.v3.dataset/missing ds)))))
+461
View File
@@ -0,0 +1,461 @@
(ns tech.v3.dataset.reductions-test
(:require [tech.v3.dataset.reductions :as ds-reduce]
[tech.v3.dataset :as ds]
[tech.v3.dataset.column :as ds-col]
[tech.v3.datatype.functional :as dfn]
[tech.v3.datatype :as dtype]
[tech.v3.datatype.datetime :as dtype-dt]
[tech.v3.datatype.argops :as argops]
[tech.v3.datatype.statistics :as stats]
[tech.v3.dataset.reductions.apache-data-sketch :as ds-sketch]
[tech.v3.dataset.categorical :as dsc]
[tech.v3.parallel.for :as pfor]
[ham-fisted.api :as hamf]
[ham-fisted.function :as hamf-fn]
[ham-fisted.reduce :as hamf-rf]
[ham-fisted.lazy-noncaching :as lznc]
[clojure.test :refer [deftest is]]
[clojure.core.protocols :as cl-proto])
(:import [tech.v3.datatype UnaryPredicate FastStruct$FMapEntry]
[java.time LocalDate YearMonth]
[ham_fisted Consumers$IncConsumer Reductions IAMapEntry]
[java.util ArrayList Map$Entry Arrays]))
(deftest simple-reduction
  ;; group-by-column-agg over a sequence of datasets must behave as if they
  ;; were concatenated: counts and sums triple, means are unchanged.
  ;; Fix: removed a leftover debug println of the aggregate dataset.
  (let [stocks (ds/->dataset "test/data/stocks.csv" {:key-fn keyword})
        agg-ds (-> (ds-reduce/group-by-column-agg
                    :symbol
                    {:n-elems (ds-reduce/row-count)
                     :price-avg (ds-reduce/mean :price)
                     :price-sum (ds-reduce/sum :price)
                     :symbol (ds-reduce/first-value :symbol)
                     :n-dates (ds-reduce/count-distinct :date :int32)}
                    [stocks stocks stocks])
                   (ds/sort-by-column :symbol))
        ;; Reference computation done the slow way on a single dataset.
        single-price (-> (->> (ds/group-by-column stocks :symbol)
                              (map (fn [[k ds]]
                                     {:symbol k
                                      :n-elems (ds/row-count ds)
                                      :price-sum (dfn/sum (ds :price))
                                      :price-avg (dfn/mean (ds :price))}))
                              (ds/->>dataset))
                         (ds/sort-by-column :symbol))]
    (is (= 5 (ds/row-count agg-ds)))
    (is (dfn/equals (agg-ds :n-elems)
                    (dfn/* 3 (single-price :n-elems))))
    (is (dfn/equals (agg-ds :price-sum)
                    (dfn/* 3 (single-price :price-sum))))
    (is (dfn/equals (agg-ds :price-avg)
                    (single-price :price-avg)))))
(deftest simple-reduction-filtered
  ;; :index-filter restricts the aggregation to rows passing a per-dataset
  ;; long-predicate; results must match filtering the data up front.
  (let [stocks (ds/->dataset "test/data/stocks.csv" {:key-fn keyword})
        agg-ds (-> (ds-reduce/group-by-column-agg
                    :symbol
                    {:n-elems (ds-reduce/row-count)
                     :price-avg (ds-reduce/mean :price)
                     :price-sum (ds-reduce/sum :price)
                     :symbol (ds-reduce/first-value :symbol)
                     :n-dates (ds-reduce/count-distinct :date :int32)}
                    {:index-filter (fn [dataset]
                                     (let [rdr (dtype/->reader (dataset :price))]
                                       (hamf-fn/long-predicate
                                        idx (> (.readDouble rdr idx) 100.0))))}
                    [stocks stocks stocks])
                   (ds/sort-by-column :symbol))
        ;; Reference: filter first, then group and aggregate the slow way.
        fstocks (ds/filter-column stocks :price #(> % 100.0))
        single-price (->
                      (->> (ds/group-by-column fstocks :symbol)
                           (map (fn [[k ds]]
                                  {:symbol k
                                   :n-elems (ds/row-count ds)
                                   :price-sum (dfn/sum (ds :price))
                                   :price-avg (dfn/mean (ds :price))}))
                           (ds/->>dataset))
                      (ds/sort-by-column :symbol))]
    ;; Only 4 of the 5 symbols survive the price filter.
    (is (= 4 (ds/row-count agg-ds)))
    (is (dfn/equals (agg-ds :n-elems)
                    (dfn/* 3 (single-price :n-elems))))
    (is (dfn/equals (agg-ds :price-sum)
                    (dfn/* 3 (single-price :price-sum))))
    (is (dfn/equals (agg-ds :price-avg)
                    (single-price :price-avg)))))
(deftest issue-201-incorrect-result-column-count
  ;; Regression: several reducers reading the same source column must each
  ;; yield their own result column (8 here), for grouped and plain aggregation.
  (let [stocks (ds/->dataset "test/data/stocks.csv" {:key-fn keyword})
        agg-ds (ds-reduce/group-by-column-agg
                :symbol
                {:n-elems (ds-reduce/row-count)
                 :price-avg (ds-reduce/mean :price)
                 :price-avg2 (ds-reduce/mean :price)
                 :price-avg3 (ds-reduce/mean :price)
                 :price-sum (ds-reduce/sum :price)
                 :price-med (ds-reduce/prob-median :price)
                 :symbol (ds-reduce/first-value :symbol)
                 :n-dates (ds-reduce/count-distinct :date :int32)}
                [stocks stocks stocks])
        simple-agg-ds (ds-reduce/aggregate
                       {:n-elems (ds-reduce/row-count)
                        :price-avg (ds-reduce/mean :price)
                        :price-avg2 (ds-reduce/mean :price)
                        :price-avg3 (ds-reduce/mean :price)
                        :price-sum (ds-reduce/sum :price)
                        :price-med (ds-reduce/prob-median :price)
                        :symbol (ds-reduce/first-value :symbol)
                        :n-dates (ds-reduce/count-distinct :date :int32)}
                       [stocks stocks stocks])]
    (is (= 8 (ds/column-count agg-ds)))
    (is (= 8 (ds/column-count simple-agg-ds)))))
(deftest data-sketches-test
  ;; Apache data-sketch reducers: HLL cardinality estimates are approximate,
  ;; hence the 0.1 tolerance against the exact counts (123 dates, 5 symbols).
  (let [stocks (ds/->dataset "test/data/stocks.csv" {:key-fn keyword})
        result (ds-reduce/aggregate
                {:n-elems (ds-reduce/row-count)
                 :n-dates (ds-reduce/count-distinct :date :int32)
                 :n-dates-hll (ds-sketch/prob-set-cardinality :date {:datatype :string})
                 :n-symbols-hll (ds-sketch/prob-set-cardinality
                                 :symbol {:datatype :string})
                 :quantiles (ds-sketch/prob-quantiles :price [0.25 0.5 0.75])
                 :cdfs (ds-sketch/prob-cdfs :price [50 100 150])
                 :pmfs (ds-sketch/prob-pmfs :price [50 100 150])}
                [stocks stocks stocks])
        {:keys [n-dates-hll n-symbols-hll]} (first (ds/mapseq-reader result))]
    (is (dfn/equals [123 5]
                    [n-dates-hll
                     n-symbols-hll]
                    0.1))))
(deftest reservoir-sampling-test
  ;; Mainly ensures the reservoir reducers run without throwing, on both
  ;; plain and grouped aggregations, for large and small inputs.
  (let [stocks (ds/->dataset "test/data/stocks.csv" {:key-fn keyword})
        ds-seq [stocks stocks stocks]
        small-ds-seq [(-> (ds/shuffle stocks)
                          (ds/select-rows (range 50)))]
        agg-map {:n-elems (ds-reduce/row-count)
                 :price-std (ds-reduce/reservoir-desc-stat
                             :price 100 :standard-deviation)
                 :sub-ds (ds-reduce/reservoir-dataset 100)}
        straight (ds-reduce/aggregate agg-map ds-seq)
        straight-small (ds-reduce/aggregate agg-map small-ds-seq)
        grouped (ds-reduce/group-by-column-agg :symbol agg-map ds-seq)
        ;; BUG FIX: previously re-aggregated ds-seq; the "small" variant is
        ;; meant to exercise the 50-row input.
        grouped-small (ds-reduce/group-by-column-agg :symbol agg-map small-ds-seq)]
    ;;Mainly ensuring that nothing throws.
    (is (every? #(or (= 3 (ds/column-count %))
                     (= 4 (ds/column-count %)))
                [straight straight-small
                 grouped grouped-small])))
  ;; Reservoir sampling must carry missing-value information into the sample.
  (let [missing-ds (ds/new-dataset [(ds-col/new-column
                                     :missing (range 1000)
                                     nil
                                     (->> (range 1000)
                                          (map (fn [^long idx]
                                                 (when (== 0 (rem idx 3))
                                                   idx)))
                                          (remove nil?)))])
        agg-ds
        (ds-reduce/aggregate {:sub-ds (ds-reduce/reservoir-dataset 50)}
                             [missing-ds])
        sub-ds (first (:sub-ds agg-ds))]
    ;;Make sure we carry the missing set across
    (is (not (.isEmpty ^org.roaringbitmap.RoaringBitmap (ds/missing sub-ds))))
    (is (every? #(or (nil? %)
                     (not= 0 (rem (long %) 3)))
                (:missing sub-ds)))))
(defn- create-otfrom-init-dataset
  "Build a randomized simulation dataset of n-rows rows, each with
  :simulation and :placement ids and a local-date interval [:start :end)
  up to n-expansion days long."
  [& [{:keys [n-simulations n-placements n-expansion n-rows]
       :or {n-simulations 100
            n-placements 50
            n-expansion 20
            n-rows 1000000}}]]
  (->> (for [idx (range n-rows)]
         (let [sd (.minusDays (dtype-dt/local-date) (+ 200 (rand-int 365)))
               ed (.plusDays sd (rand-int n-expansion))]
           {:simulation (rand-int n-simulations)
            :placement (rand-int n-placements)
            :start sd
            :end ed}))
       (ds/->>dataset)))
;;Slightly less efficient than implementing an inline IReduceInit impl is to create
;;a record with a custom IReduceInit implementation.
(defrecord YMC [year-month ^long count]
  clojure.lang.IReduceInit
  ;; Reduces over the record as [:year-month ym] then [:count n] map entries,
  ;; falling through to any extension-map keys when not already reduced.
  (reduce [this rfn init]
    (let [init (hamf/reduced-> rfn init
                               (clojure.lang.MapEntry/create :year-month year-month)
                               (clojure.lang.MapEntry/create :count count))]
      (if (and __extmap (not (reduced? init)))
        (reduce rfn init __extmap)
        init))))
;; Shared computeIfAbsent factory — returns a fresh counting consumer per key.
(def inc-cons-fn (hamf-fn/function k (Consumers$IncConsumer.)))
(defn- tally-days-as-year-months
  "Tally how many days of one row's [start, end) interval fall into each
  YearMonth.  Returns a custom IReduceInit whose elements are themselves
  reducible [:year-month ym] [:count n] pairs (one per YearMonth)."
  [{:keys [^LocalDate start ^LocalDate end]}]
  ;;Using a hash provider with equals semantics allows the hamf hashtable to
  ;;compete on equal terms with the java hashtable. In that we find that compute,
  ;;computeIfAbsent and reduce perform as fast as anything on the jvm when we are using
  ;;Object/equals and Object/hashCode for the map functionality.
  (let [tally (hamf/java-hashmap)]
    (dotimes [idx (.until start end java.time.temporal.ChronoUnit/DAYS)]
      (let [ym (YearMonth/from (.plusDays start idx))]
        ;;Compute if absent is ever so slightly faster than compute as it involves
        ;;less mutation of the original hashtable. It does, however, require the
        ;;value in the node itself to be mutable.
        (.inc ^Consumers$IncConsumer (.computeIfAbsent tally ym inc-cons-fn))))
    (hamf/custom-ireduce
     rfn acc
     (Reductions/iterReduce (.entrySet tally)
                            acc
                            (fn [acc ^Map$Entry kv]
                              (rfn acc
                                   (hamf/custom-ireduce
                                    rrfn aacc
                                    (-> aacc
                                        (rrfn (hamf/make-map-entry :year-month (.getKey kv)))
                                        (rrfn (hamf/make-map-entry :count (deref (.getValue kv))))))))))))
(defn- otfrom-pathway
  "Row-mapcat each interval row into per-YearMonth tallies, sum the counts per
  [:simulation :placement :year-month], then summarize count quantiles per
  [:placement :year-month]."
  [ds]
  (->> (ds/row-mapcat ds tally-days-as-year-months
                      ;;generate a sequence of datasets
                      {:result-type :as-seq})
       ;;sequence of datasets
       (ds-reduce/group-by-column-agg
        [:simulation :placement :year-month]
        {:count (ds-reduce/sum :count)})
       ;;single dataset - do joins and such here
       (#(let [ds %
               count (ds :count)]
           (assoc ds :count2 (dfn/sq count))))
       (ds-reduce/group-by-column-agg
        [:placement :year-month]
        {:min-count (ds-reduce/prob-quantile :count 0.0)
         :low-95-count (ds-reduce/prob-quantile :count 0.05)
         :q1-count (ds-reduce/prob-quantile :count 0.25)
         :median-count (ds-reduce/prob-quantile :count 0.50)
         :q3-count (ds-reduce/prob-quantile :count 0.75)
         :high-95-count (ds-reduce/prob-quantile :count 0.95)
         :max-count (ds-reduce/prob-quantile :count 1.0)
         :count (ds-reduce/sum :count)})))
(defn- tally-days-columnwise
  "Columnwise variant of tally-days-as-year-months: expands every row's
  [start, end) interval into parallel index/year-month/count buffers, then
  returns the row-expanded dataset with :year-month and :count columns."
  [ds]
  (let [starts (dtype/->buffer (ds :start))
        ends (dtype/->buffer (ds :end))
        n-rows (.lsize starts)
        indexes (dtype/prealloc-list :int64 n-rows)
        year-months (dtype/prealloc-list :object n-rows) ;;ArrayList works fine here also.
        counts (dtype/prealloc-list :int32 n-rows)
        incrementor (hamf-fn/bi-function k v
                                         (if v
                                           (unchecked-inc (long v))
                                           1))
        tally (hamf/java-hashmap)]
    ;;Loop through dataset and append results columnwise.
    (dotimes [row-idx n-rows]
      ;;minimize hashtable resize operations
      (.clear tally)
      (let [^LocalDate start (starts row-idx)
            ^LocalDate end (ends row-idx)
            nd (.until start end java.time.temporal.ChronoUnit/DAYS)]
        (dotimes [day-idx nd]
          (.inc ^Consumers$IncConsumer (.computeIfAbsent tally (YearMonth/from (.plusDays start day-idx)) inc-cons-fn)))
        ;; Append one (row-idx, year-month, count) triple per tallied month.
        (.forEach tally (hamf-fn/bi-consumer
                         k v
                         (.addLong indexes row-idx)
                         (.add year-months k)
                         (.add counts (deref v))))))
    (-> (ds/select-rows ds indexes)
        ;;avoid datatype and missing scans
        (assoc :year-month #:tech.v3.dataset{:data year-months
                                             :force-datatype? true
                                             :missing (tech.v3.datatype.bitmap/->bitmap)}
               :count counts))))
(defn- otfrom-columnwise-pathway
  "Same aggregation as otfrom-pathway but expanding rows with the columnwise
  tally via ds/pmap-ds instead of ds/row-mapcat."
  [ds]
  (->> (ds/pmap-ds ds tally-days-columnwise
                   ;;generate a sequence of datasets
                   {:result-type :as-seq})
       ;;sequence of datasets
       (ds-reduce/group-by-column-agg
        [:simulation :placement :year-month]
        {:count (ds-reduce/sum :count)})
       ;;single dataset - do joins and such here
       (#(let [ds %
               count (ds :count)]
           ;;return a sequence of datasets for next step
           [(assoc ds :count2 (dfn/sq count))]))
       (ds-reduce/group-by-column-agg
        [:placement :year-month]
        {:min-count (ds-reduce/prob-quantile :count 0.0)
         :low-95-count (ds-reduce/prob-quantile :count 0.05)
         :q1-count (ds-reduce/prob-quantile :count 0.25)
         :median-count (ds-reduce/prob-quantile :count 0.50)
         :q3-count (ds-reduce/prob-quantile :count 0.75)
         :high-95-count (ds-reduce/prob-quantile :count 0.95)
         :max-count (ds-reduce/prob-quantile :count 1.0)
         :count (ds-reduce/sum :count)})))
(deftest otfrom-pathway-test
  ;; Both pathway implementations must tally exactly the total number of
  ;; interval-days in the source dataset.  Timings are printed for manual
  ;; comparison only.
  (let [ds (create-otfrom-init-dataset)
        start (ds :start)
        end (ds :end)
        total-count (->> (dtype/emap #(dtype-dt/between %1 %2 :days) :int64 start end)
                         (dfn/sum))
        ;;warmup
        _ (do (otfrom-pathway ds)
              (otfrom-columnwise-pathway ds))
        _ (println "otfrom pathway timing")
        ofds (time (otfrom-pathway ds))
        _ (println "otfrom columnwise pathway timing")
        of-cwise-ds (time (otfrom-columnwise-pathway ds))
        ofsum (dfn/sum (ofds :count))
        of-cwise-sum (dfn/sum (of-cwise-ds :count))]
    (is (= ofsum total-count))
    (is (= of-cwise-sum total-count))))
(deftest issue-314
  ;; column-map over a distinct-set result column must preserve the per-row
  ;; collections.
  (let [src (ds/->dataset (mapv (fn [i] {:foo 'foo :value (str i)}) (range 3)))
        agg (ds-reduce/group-by-column-agg :foo
                                           {:foos (ds-reduce/distinct :value)}
                                           src)
        dstds (ds/column-map agg :foos-2 (fn [values] values) [:foos])]
    (is (= ["0" "1" "2"]
           (vec (first (dstds :foos-2)))))))
(deftest issue-312
  ;; count-distinct over a nippy-loaded dataset must yield a positive count.
  (let [src (ds/->dataset "test/data/example-genres.nippy")
        ds (ds-reduce/aggregate
            {:n-elems (ds-reduce/count-distinct :genre)}
            [src])]
    (is (pos? (first (ds :n-elems))))))
(deftest group-by-agg-changes-source
  ;; Regression: aggregating must leave the source dataset's internal column
  ;; map (.-colmap) untouched; ds2 is computed purely for its (absent) side
  ;; effects on ds.
  (let [ds (-> [{:job "Professional" :sex "Male" :age "[35-40)" :salary 3991.2}
                {:job "Professional" :sex "Male" :age "[35-40)" :salary 2364.6}
                {:job "Professional" :sex "Male" :age "[35-40)" :salary 3114.7}
                {:job "Artist" :sex "Female" :age "[35-35)" :salary 2345.1}
                {:job "Artist" :sex "Female" :age "[35-35)" :salary 4562.1}
                {:job "Artist" :sex "Female" :age "[35-35)" :salary 1214.1}
                {:job "Artist" :sex "Female" :age "[35-35)" :salary 4531.1}]
               (ds/->dataset)
               (assoc "salary (binned)" ["a" "b" "c" "d" "e" "f" "g"]))
        ds2 (ds-reduce/group-by-column-agg
             [:job :sex :age]
             {:fj (ds-reduce/row-count)}
             [ds])]
    (is (= #{:job :sex :age :salary "salary (binned)"}
           (set (keys (.-colmap ds)))))
    ))
(deftest maximum-test
  ;; The maximum reducer must agree with sorting and taking the last element.
  (let [ds (ds/->dataset {:x (repeatedly 100 rand)})
        expected-max (last (:x (ds/sort-by-column ds :x)))
        out-ds (ds-reduce/aggregate {:max-x (ds-reduce/maximum :x)} ds)]
    (is (= 1 (ds/row-count out-ds)))
    (is (= expected-max (first (:max-x out-ds))))))
;; REPL-only benchmarking scratchpad comparing custom primitive column
;; reducers; read but never evaluated as part of the test run.
(comment
  (do
    (defn max-int64
      [colname]
      (ds-reduce/reducer->column-reducer
       (hamf-rf/parallel-reducer (fn ^long [] Long/MIN_VALUE)
                                 (fn ^long [^long a ^long b] (Long/max a b))
                                 (fn ^long [^long a ^long b] (Long/max a b)))
       :int64
       colname))
    (defn sum-int64
      [colname]
      (ds-reduce/reducer->column-reducer
       (hamf-rf/parallel-reducer (fn ^long [] 0)
                                 (fn ^long [^long a ^long b] (Long/sum a b))
                                 (fn ^long [^long a ^long b] (Long/sum a b)))
       :int64
       colname))
    (defn max-float64
      [colname]
      (ds-reduce/reducer->column-reducer
       (hamf-rf/parallel-reducer (fn ^double [] 0.0)
                                 (fn ^double [^double a ^double b] (Double/max a b))
                                 (fn ^double [^double a ^double b] (Double/max a b)))
       :float64
       colname))
    (defn sum-float64
      [colname]
      (ds-reduce/reducer->column-reducer
       (hamf-rf/parallel-reducer (fn ^double [] 0.0)
                                 (fn ^double [^double a ^double b] (Double/sum a b))
                                 (fn ^double [^double a ^double b] (Double/sum a b)))
       :float64
       colname))
    (deftype FloatSumObj [^{:unsynchronized-mutable true
                            :tag double} dval]
      java.util.function.DoubleConsumer
      (accept [this v] (set! dval (+ dval v)))
      ham_fisted.Reducible
      (reduce [this other] (set! dval (+ dval (.- dval ^FloatSumObj other))))
      clojure.lang.IDeref
      (deref [this] dval))
    (defn sum-float64-consumer
      [colname]
      (ds-reduce/reducer->column-reducer
       (hamf-rf/double-consumer-reducer #(FloatSumObj. 0.0))
       :float64
       colname))
    (def n-rows 500000)
    (def ds (ds/->dataset (repeatedly n-rows
                                      (fn [] {:a (rand-int 50000)
                                              :b (rand-int 500)}))))
    (def one-hot (dsc/fit-one-hot ds :b)))
  (dotimes [idx 100]
    (time
     (ds-reduce/group-by-column-agg
      :a
      (into {} (for [col (-> one-hot :one-hot-table vals)
                     :when (not= col :a)]
                 {col (sum-float64 col)}))
      {:parser-fn :float64}
      (dsc/transform-one-hot ds one-hot))))
  (for [[name reducer] {:ds-reduce/sum ds-reduce/sum
                        :max-int64 max-int64
                        :sum-int64 sum-int64
                        :max-float64 max-float64
                        :sum-float64 sum-float64}]
    {name (repeatedly 3 (fn []
                          ))})
  )
+14
View File
@@ -0,0 +1,14 @@
(ns tech.v3.dataset.set-test
(:require [tech.v3.dataset :as ds]
[tech.v3.dataset.set :as ds-set]
[clojure.test :refer [deftest is]]))
(deftest union-intersection-test
  ;; Multiset semantics: duplicated rows are preserved by both operations.
  (let [left (ds/->dataset [{:a 1 :b 2} {:a 1 :b 2} {:a 2 :b 3}])
        right (ds/->dataset [{:a 1 :b 2} {:a 1 :b 2} {:a 3 :b 3}])
        rows-of (fn [set-op] (ds/rows (set-op [left right])))]
    (is (= [{:a 2, :b 3} {:a 3, :b 3} {:a 1, :b 2} {:a 1, :b 2}]
           (rows-of ds-set/reduce-union)))
    (is (= [{:a 1, :b 2} {:a 1, :b 2}]
           (rows-of ds-set/reduce-intersection)))))
+26
View File
@@ -0,0 +1,26 @@
(ns tech.v3.dataset.test-utils
(:require [tech.v3.io :as io]
[clojure.string :as s]
[tech.v3.dataset :as ds]
[camel-snake-kebab.core :refer [->kebab-case]]))
(defn load-mapseq-fruit-dataset
  "Parse the whitespace-delimited fruit data file into a dataset.
  The header row becomes kebab-case keyword column names; each cell is
  parsed as a double when possible, otherwise kebab-cased into a keyword."
  []
  (let [rows (->> (slurp (io/input-stream "test/data/fruit_data_with_colors.txt"))
                  (#(s/split % #"\n"))
                  (mapv #(s/split % #"\s+")))
        header (mapv (comp keyword ->kebab-case) (first rows))
        parse-cell (fn [^String cell]
                     (try
                       (Double/parseDouble cell)
                       (catch Throwable _
                         (keyword (->kebab-case cell)))))]
    (ds/->dataset
     (map (fn [row] (zipmap header (map parse-cell row)))
          (rest rows)))))
;; Memoized so repeated test runs parse the file only once.
(def mapseq-fruit-dataset (memoize load-mapseq-fruit-dataset))
@@ -0,0 +1,30 @@
(ns tech.v3.dataset.update-columns-test
(:require [tech.v3.dataset :as ds]
[tech.v3.dataset.column-filters :as cf]
[tech.v3.datatype.functional :as dfn]
[clojure.test :refer [deftest is]]))
(deftest update-columns-selector-fn
  ;; Standardizing ((x - mean) / stddev) must hit both numeric columns
  ;; :a and :b and leave the string column :c untouched, whether the
  ;; selector is a column-filter fn or an explicit name seq.
  ;; [5 6 7 8] standardizes to the same values as [1. 2. 3. 4.].
  (let [ds (ds/->dataset {:a [1. 2. 3. 4.]
                          :b [5 6 7 8]
                          :c ["A" "B" "C" "D"]})
        standardize #(dfn// (dfn/- % (dfn/mean %))
                            (dfn/standard-deviation %))
        expected [-1.16189 -0.38729 0.38729 1.16189]
        close? (fn [col]
                 (> 0.001 (Math/abs (reduce + (map - expected (vec col))))))]
    ;; Column-filter function used directly as the selector.
    (let [ds' (ds/update-columns ds cf/numeric standardize)]
      (is (close? (ds' :a)))
      ;; Was (ds' :d) -- a nonexistent column, which made this assertion
      ;; vacuously true; the second numeric column is :b.
      (is (close? (ds' :b)))
      (is (= ["A" "B" "C" "D"] (vec (ds' :c)))))
    ;; Explicit column-name sequence as the selector.
    (let [ds' (ds/update-columns ds (ds/column-names (cf/numeric ds))
                                 standardize)]
      (is (close? (ds' :a)))
      (is (close? (ds' :b)))
      (is (= ["A" "B" "C" "D"] (vec (ds' :c)))))))
File diff suppressed because it is too large Load Diff
+354
View File
@@ -0,0 +1,354 @@
(ns tech.v3.libs.arrow-test
(:require [tech.v3.libs.arrow :as arrow]
[tech.v3.dataset :as ds]
[tech.v3.dataset.column :as ds-col]
[tech.v3.dataset.impl.sparse-column :as sparse-col]
[tech.v3.datatype.functional :as dfn]
[tech.v3.datatype :as dtype]
[tech.v3.libs.parquet]
[tech.v3.datatype.datetime :as dtype-dt]
[tech.v3.resource :as resource]
[clojure.test :refer [deftest is]])
(:import [java.time LocalTime]
[tech.v3.dataset Text]
[java.util Map]
[java.io ByteArrayOutputStream ByteArrayInputStream]))
;; Quiet verbose arrow/parquet logging during the test run.
(tech.v3.dataset.utils/set-slf4j-log-level :info)
(defn supported-datatype-ds
  "Build a test dataset named :testtable with one column per datatype the
  arrow round-trip tests exercise: booleans, signed/unsigned integers,
  floats, strings, Text, instants, bigdec, dates, times and uuids.
  With no argument n defaults to 10.
  NOTE(review): the :boolean column always has exactly 10 entries regardless
  of n -- confirm the dataset builder's behavior when n != 10."
  ([n]
   (-> (ds/->dataset {:boolean [true false true true false false true false false true]
                      :bytes (byte-array (range n))
                      :ubytes (dtype/make-container :uint8 (dfn/rem (range n) 256))
                      :shorts (short-array (range n))
                      :ushorts (dtype/make-container :uint16 (range n))
                      :ints (int-array (range n))
                      :uints (dtype/make-container :uint32 (range n))
                      :longs (long-array (range n))
                      :floats (float-array (range n))
                      :doubles (double-array (range n))
                      :strings (map str (range n))
                      :text (map (comp #(Text. %) str) (range n))
                      :instants (repeatedly n dtype-dt/instant)
                      :bigdec (repeatedly n #(BigDecimal/valueOf (+ 100 (rand-int 1700)) 2))
                      ;; :bigint (let [rng (java.util.Random.)]
                      ;;           (repeatedly n #(BigInteger. 256 rng )))
                      ;;external formats often don't support dash-case
                      :local_dates (repeatedly n dtype-dt/local-date)
                      :local_times (repeatedly n dtype-dt/local-time)
                      :uuids (repeatedly n #(java.util.UUID/randomUUID))})
       (vary-meta assoc :name :testtable)))
  ([]
   (supported-datatype-ds 10)))
;; REPL-only forms used to (re)generate the compressed fixture files under
;; test/data; never run by the suite.
(comment
  (arrow/dataset->stream! (supported-datatype-ds 1000) "test/data/alldtypes.arrow-ipc-lz4"
                          {:compression :lz4})
  (arrow/dataset->stream! (supported-datatype-ds 1000) "test/data/alldtypes.arrow-ipc-zstd"
                          {:compression :zstd})
  (let [sds (supported-datatype-ds 1000)]
    (arrow/dataset-seq->stream! "test/data/alldtypes.arrow-file-zstd"
                                {:compression :zstd
                                 :format :file
                                 :strings-as-text? true}
                                [(ds/select-rows sds (range 500))
                                 ;;test when you have to add more string dictionary values
                                 (ds/select-rows sds (range 500 1000))]))
  (def ignored (arrow/stream->dataset-seq "test/data/alldtypes.arrow-file-zstd"))
  (def ignored (arrow/stream->dataset "test/data/alldtypes.arrow-ipc-zstd"))
  )
(deftest base-datatype-test
  ;; Round-trip every supported datatype through an arrow stream file and
  ;; verify both the mmap (in-place) and copying readers preserve each
  ;; column's element datatype and values.
  (try
    (resource/stack-resource-context
     (let [ds (supported-datatype-ds)
           _ (arrow/dataset->stream! ds "alldtypes.arrow")
           mmap-ds (arrow/stream->dataset "alldtypes.arrow" {:open-type :mmap
                                                             :key-fn keyword})
           copy-ds (arrow/stream->dataset "alldtypes.arrow" {:key-fn keyword})]
       (doseq [col (vals ds)]
         (let [cname ((meta col) :name)
               dt (dtype/elemwise-datatype col)
               inp-col (mmap-ds cname)
               cp-col (copy-ds cname)]
           (is (= dt (dtype/elemwise-datatype inp-col)) (str "inplace failure " cname))
           (is (= dt (dtype/elemwise-datatype cp-col)) (str "copy failure " cname))
           (is (= (vec col) (vec inp-col)) (str "inplace failure " cname))
           (is (= (vec col) (vec cp-col)) (str "copy failure " cname))))))
    (finally
      (.delete (java.io.File. "alldtypes.arrow")))))
(deftest base-sparse-datatype-test
  ;; Same round trip as base-datatype-test but with the dataset converted to
  ;; sparse columns first; sparsity must survive the write/read cycle for
  ;; both the mmap and copying readers.
  (try
    (resource/stack-resource-context
     (let [ds (sparse-col/->sparse-ds (supported-datatype-ds) 0.0)
           _ (arrow/dataset->stream! ds "alldtypes-sparse.arrow")
           mmap-ds (arrow/stream->dataset "alldtypes-sparse.arrow" {:open-type :mmap
                                                                    :key-fn keyword})
           copy-ds (arrow/stream->dataset "alldtypes-sparse.arrow" {:key-fn keyword})]
       (is (every? sparse-col/is-sparse? (.values ^Map ds)))
       (is (every? sparse-col/is-sparse? (.values ^Map mmap-ds)))
       (is (every? sparse-col/is-sparse? (.values ^Map copy-ds)))
       (doseq [col (vals ds)]
         (let [cname ((meta col) :name)
               dt (dtype/elemwise-datatype col)
               inp-col (mmap-ds cname)
               cp-col (copy-ds cname)]
           (is (= dt (dtype/elemwise-datatype inp-col)) (str "inplace failure " cname))
           (is (= dt (dtype/elemwise-datatype cp-col)) (str "copy failure " cname))
           (is (= (vec col) (vec inp-col)) (str "inplace failure " cname))
           (is (= (vec col) (vec cp-col)) (str "copy failure " cname))))))
    (finally
      (.delete (java.io.File. "alldtypes-sparse.arrow")))))
(deftest arrow-file-types
  ;; Feather/IPC file variants -- lz4-compressed, zstd-compressed and the
  ;; legacy v1 format -- should all read back with their full 1000 rows.
  (let [all-files ["test/data/alldtypes.arrow-feather" ;lz4
                   "test/data/alldtypes.arrow-feather-compressed" ;zstd
                   "test/data/alldtypes.arrow-feather-v1" ;v1
                   ]]
    (doseq [file all-files]
      (is (= 1000 (ds/row-count (arrow/stream->dataset file)))))
    ;; lz4 with dependent frames
    (is (= 31962 (ds/row-count (arrow/stream->dataset "test/data/tweets_sentiment.feather"))))))
(deftest base-ds-seq-test
  ;; Write three copies of the dataset as a multi-batch stream and verify
  ;; both readers return 3 batches whose last batch matches the source
  ;; column-for-column.
  (try
    (let [ds (supported-datatype-ds)
          _ (arrow/dataset-seq->stream! "alldtypes-seq.arrow" {:strings-as-text? false} [ds ds ds])
          mmap-ds-seq (arrow/stream->dataset-seq "alldtypes-seq.arrow" {:key-fn keyword
                                                                        :open-type :mmap})
          copy-ds-seq (arrow/stream->dataset-seq "alldtypes-seq.arrow" {:key-fn keyword})]
      (is (= 3 (count mmap-ds-seq)))
      (is (= 3 (count copy-ds-seq)))
      (let [mmap-ds (last mmap-ds-seq)
            copy-ds (last copy-ds-seq)]
        (doseq [col (vals ds)]
          (let [cname ((meta col) :name)
                dt (dtype/elemwise-datatype col)
                inp-col (mmap-ds cname)
                cp-col (copy-ds cname)]
            (is (= dt (dtype/elemwise-datatype inp-col)) (str "inplace failure " cname))
            (is (= dt (dtype/elemwise-datatype cp-col)) (str "copy failure " cname))
            (is (= (vec col) (vec inp-col)) (str "inplace failure " cname))
            (is (= (vec col) (vec cp-col)) (str "copy failure " cname))))))
    (finally
      (.delete (java.io.File. "alldtypes-seq.arrow")))))
(deftest simple-stocks
  ;; Round-trip stocks.csv through our arrow writer and compare against a
  ;; pyarrow-generated stream, exercising both the copying and mmap readers.
  (try
    (let [stocks (ds/->dataset "test/data/stocks.csv")
          _ (arrow/dataset->stream! stocks "temp.stocks.arrow")
          stocks-copying (arrow/stream->dataset "temp.stocks.arrow")
          stocks-inplace (arrow/stream->dataset "temp.stocks.arrow" {:open-type :mmap})
          pystocks-copying (arrow/stream->dataset "test/data/stocks.pyarrow.stream")
          ;; Bugfix: this previously re-read the file in (default) copying
          ;; mode, so the in-place/mmap path of the pyarrow file was never
          ;; actually exercised.
          pystocks-inplace (arrow/stream->dataset "test/data/stocks.pyarrow.stream"
                                                  {:open-type :mmap})]
      ;;This is here just to make sure that the data isn't cleaned up until it
      ;;actually can safely be cleaned up. This was a bug that caused datatype to
      ;;bump from 5.11 to 5.12
      (System/gc)
      (is (dfn/equals (stocks "price") (stocks-copying "price")))
      (is (dfn/equals (stocks "price") (stocks-inplace "price")))
      (is (dfn/equals (stocks "price") (pystocks-copying "price")))
      (is (dfn/equals (stocks "price") (pystocks-inplace "price")))
      (is (= (vec (stocks "symbol")) (vec (stocks-copying "symbol"))))
      (is (= (vec (stocks "symbol")) (vec (stocks-inplace "symbol"))))
      ;;python saves strings inline in the file - equivalent to :strings-as-text?
      ;;save option
      (is (= (vec (stocks "symbol")) (mapv str (pystocks-copying "symbol"))))
      (is (= (vec (stocks "symbol")) (mapv str (pystocks-inplace "symbol")))))
    (finally
      (.delete (java.io.File. "temp.stocks.arrow")))))
(deftest ames-house-prices
  ;; Larger mixed-type dataset round trip; also checks that missing-value
  ;; sets survive both our writer and a pyarrow-generated stream.
  (try
    (let [ames (ds/->dataset "test/data/ames-house-prices/train.csv")
          _ (arrow/dataset->stream! ames "temp.ames.arrow")
          ames-copying (arrow/stream->dataset "temp.ames.arrow")
          ames-inplace (arrow/stream->dataset "temp.ames.arrow" {:open-type :mmap})
          pyames-copying (arrow/stream->dataset "test/data/ames.pyarrow.stream")
          ;; Bugfix: previously opened in (default) copying mode, identical to
          ;; pyames-copying, so the mmap path of the pyarrow file was untested.
          pyames-inplace (arrow/stream->dataset "test/data/ames.pyarrow.stream"
                                                {:open-type :mmap})]
      (System/gc)
      (is (dfn/equals (ames "SalePrice") (ames-copying "SalePrice")))
      (is (dfn/equals (ames "SalePrice") (ames-inplace "SalePrice")))
      (is (= (ds-col/missing (ames "LotFrontage"))
             (ds-col/missing (ames-copying "LotFrontage"))))
      (is (= (ds-col/missing (ames "LotFrontage"))
             (ds-col/missing (ames-inplace "LotFrontage"))))
      (is (not= 0 (dtype/ecount (ds-col/missing (ames-inplace "LotFrontage")))))
      (is (dfn/equals (ames "SalePrice") (pyames-copying "SalePrice")))
      (is (dfn/equals (ames "SalePrice") (pyames-inplace "SalePrice")))
      (is (= (ds-col/missing (ames "LotFrontage"))
             (ds-col/missing (pyames-copying "LotFrontage"))))
      (is (= (ds-col/missing (ames "LotFrontage"))
             (ds-col/missing (pyames-inplace "LotFrontage")))))
    (finally
      (.delete (java.io.File. "temp.ames.arrow")))))
(deftest ames-compression-test
  ;; Write the same dataset uncompressed, zstd, sparse+zstd and lz4; print a
  ;; small table of the resulting file sizes and verify each variant reads
  ;; back equal data.
  (try
    (let [ames (ds/->dataset "test/data/ames-house-prices/train.csv")
          _ (arrow/dataset->stream! ames "ames-uncompressed.arrow")
          _ (arrow/dataset->stream! ames "ames-zstd.arrow" {:compression
                                                            {:compression-type :zstd
                                                             ;;default is 3
                                                             :level 5}})
          _ (arrow/dataset->stream! ames "ames-lz4.arrow" {:compression :lz4})
          _ (arrow/dataset->stream! (sparse-col/->sparse-ds ames)
                                    "ames-sparse-zstd.arrow" {:compression
                                                              {:compression-type :zstd
                                                               ;;default is 3
                                                               :level 5}})
          file-len (fn [path] (.length (java.io.File. (str path))))
          _ (println (ds/->dataset {:save-type [:uncompressed :zstd :sparse-zstd :lz4]
                                    :file-size [(file-len "ames-uncompressed.arrow")
                                                (file-len "ames-zstd.arrow")
                                                (file-len "ames-sparse-zstd.arrow")
                                                (file-len "ames-lz4.arrow")]}))
          uncomp (arrow/stream->dataset "ames-uncompressed.arrow")
          zstd (arrow/stream->dataset "ames-zstd.arrow")
          sparse-zstd (arrow/stream->dataset "ames-sparse-zstd.arrow")
          lz4 (arrow/stream->dataset "ames-lz4.arrow")]
      (System/gc)
      (is (dfn/equals (uncomp "SalePrice") (zstd "SalePrice")))
      (is (dfn/equals (uncomp "LotFrontage") (sparse-zstd "LotFrontage")))
      (is (dfn/equals (uncomp "SalePrice") (lz4 "SalePrice"))))
    (finally
      (.delete (java.io.File. "ames-uncompressed.arrow"))
      (.delete (java.io.File. "ames-zstd.arrow"))
      (.delete (java.io.File. "ames-sparse-zstd.arrow"))
      (.delete (java.io.File. "ames-lz4.arrow")))))
(deftest date-arrow-test
  ;; With :integer-datetime-types? the date column stays as raw epoch-day
  ;; integers; without it the same file parses into packed local dates.
  (let [date-data (arrow/read-stream-dataset-copying "test/data/with_date.arrow"
                                                     {:integer-datetime-types? true})]
    (is (= [18070 18072 18063]
           (date-data "date")))
    (is (= :epoch-days (dtype/elemwise-datatype (date-data "date")))))
  (let [date-data (arrow/read-stream-dataset-copying "test/data/with_date.arrow")]
    (is (= (mapv #(java.time.LocalDate/parse %)
                 ["2019-06-23" "2019-06-25" "2019-06-16"])
           (date-data "date")))
    (is (= :packed-local-date (dtype/elemwise-datatype (date-data "date"))))))
(deftest odd-parquet-crash
  ;; Regression: round-trip a parquet file that previously crashed the arrow
  ;; writer and verify the missing-value sets survive.
  (let [src (ds/->dataset "test/data/part-00000-74d3eb51-bc9c-4ba5-9d13-9e0d71eea31f.c000.snappy.parquet")]
    (try
      (arrow/write-dataset-to-stream! src "test.arrow")
      (is (= (ds/missing src)
             (ds/missing (arrow/read-stream-dataset-copying "test.arrow"))))
      (finally
        (.delete (java.io.File. "test.arrow"))))))
(deftest failed-R-file
  ;; Copying and in-place readers must agree on this R-generated IPC stream.
  (let [path "test/data/part-8981.ipc_stream"]
    (is (= (vec (ds/column-names (arrow/read-stream-dataset-copying path)))
           (vec (ds/column-names (arrow/read-stream-dataset-inplace path)))))))
(deftest large-var-char-file
  ;; LargeVarChar columns: both read paths must agree on names and data.
  (let [path "test/data/largeVarChar.ipc"
        by-copy (arrow/read-stream-dataset-copying path)
        in-place (arrow/read-stream-dataset-inplace path)]
    (is (= (vec (ds/column-names by-copy))
           (vec (ds/column-names in-place))))
    (is (= (vec (first (ds/columns by-copy)))
           (vec (first (ds/columns in-place)))))))
(deftest uuid-test
  ;; UUIDs from a python-generated extension-typed arrow file parse directly
  ;; to :uuid, and a parquet-sourced uuid column survives an arrow stream
  ;; round trip with both readers agreeing.
  (let [py-uuid (ds/->dataset "test/data/uuid_ext.arrow" {:key-fn keyword})]
    (is (= :uuid (dtype/elemwise-datatype (py-uuid :id))))
    (is (= (mapv #(java.util.UUID/fromString %)
                 ["8be643d6-0df7-4e5e-837c-f94170c87914"
                  "24bc9cf4-e2e8-444f-bb2d-82394f33ff76"
                  "e8149e1b-aef6-4671-b1b4-3b7a21eed92a"])
           (py-uuid :id))))
  (try
    (let [uuid-ds (ds/->dataset "test/data/uuid.parquet"
                                {:parser-fn {"uuids" :uuid}})
          _ (arrow/write-dataset-to-stream! uuid-ds "test-uuid.arrow")
          copying-ds (arrow/read-stream-dataset-copying "test-uuid.arrow")
          inplace-ds (arrow/read-stream-dataset-inplace "test-uuid.arrow")]
      (is (= :uuid ((comp :datatype meta) (copying-ds "uuids"))))
      (is (= :uuid ((comp :datatype meta) (inplace-ds "uuids"))))
      (is (= (vec (copying-ds "uuids"))
             (vec (inplace-ds "uuids"))))
      (is (= (vec (uuid-ds "uuids"))
             (vec (copying-ds "uuids")))))
    (finally
      (.delete (java.io.File. "test-uuid.arrow")))))
(deftest local-time
  ;; LocalTime round trip through an arrow stream; both readers should
  ;; produce :packed-local-time and identical values.
  (try
    (let [ds (ds/->dataset {"a" (range 10)
                            "b" (repeat 10 (java.time.LocalTime/now))})
          _ (arrow/write-dataset-to-stream! ds "test-local-time.arrow")
          copying-ds (arrow/read-stream-dataset-copying "test-local-time.arrow")
          inplace-ds (arrow/read-stream-dataset-inplace "test-local-time.arrow")]
      (is (= :packed-local-time (dtype/elemwise-datatype (copying-ds "b"))))
      (is (= :packed-local-time (dtype/elemwise-datatype (inplace-ds "b"))))
      (is (= (vec (copying-ds "b"))
             (vec (inplace-ds "b"))))
      ;;Making a primitive container will use the packed data.
      (is (= (vec (ds "b"))
             (vec (copying-ds "b")))))
    (finally
      (.delete (java.io.File. "test-local-time.arrow")))))
(deftest string-arrow
  ;; Double round-trip a string column through in-memory lz4 streams:
  ;; write -> read -> write the read result -> read again, then compare the
  ;; final data against the original.
  (let [dataset (ds/->dataset [{"col1" "a"}] {:parser-fn :string})
        baos (ByteArrayOutputStream.)]
    (resource/stack-resource-context
     (arrow/dataset->stream! dataset baos {:compression :lz4})
     (let [written-bytes (.toByteArray baos)
           arrow-ds-rtt (arrow/stream->dataset written-bytes)
           _ (.reset baos)
           _ (arrow/dataset->stream! arrow-ds-rtt baos {:compression :lz4})
           b2 (.toByteArray baos)
           final-ds (arrow/stream->dataset b2)]
       (is (= (vec (dataset "col1"))
              (vec (final-ds "col1"))))))))
(deftest nullcol
  ;; An all-null column reads back with every row index marked missing.
  (let [dataset (arrow/stream->dataset "test/data/withnullcol.arrow")
        n-rows (ds/row-count dataset)]
    (is (= (vec (range n-rows))
           (vec (ds/missing (dataset "nullcol")))))))
(deftest list-datatypes-read-only
  ;; Arrow list columns are readable; each cell comes back as a sequence.
  (let [loaded (ds/->dataset "test/data/arrow_list.arrow")
        cells (loaded "class-name")]
    (is (= [["dog" "car"]
            ["dog" "flower"]
            ["car" "flower"]]
           (mapv vec cells)))))
(deftest empty-array-dataset
  ;; A zero-record arrow stream yields nil rather than an empty dataset.
  (let [result (arrow/stream->dataset "test/data/empty.arrow")]
    (is (nil? result))))
+28
View File
@@ -0,0 +1,28 @@
(ns tech.v3.libs.csv-test
(:require [clojure.test :refer [deftest is testing]]
[tech.v3.dataset :as ds]
[tech.v3.dataset.io.csv :as csv-parse]))
;; CSV fixture whose header row contains repeated column names.
(def duplicate-headers-file "test/data/duplicate-headers.csv")
(deftest ensure-unique-headers-test
  (testing "that all headers are are forced to be unique"
    ;; `(is expr msg)` treats its second argument as a failure message, so
    ;; the previous `(is (ds/column-count ds) 7)` forms only asserted
    ;; truthiness; assert the actual counts instead.
    (let [ds (ds/->dataset duplicate-headers-file
                           {:ensure-unique-column-names? true})]
      (is (= 7 (ds/column-count ds)))
      (is (= 7 (count (set (ds/column-names ds))))))
    (let [ds (csv-parse/csv->dataset duplicate-headers-file
                                     {:ensure-unique-column-names? true})]
      (is (= 7 (ds/column-count ds)))
      (is (= 7 (count (set (ds/column-names ds)))))))
  (testing "that exception is thrown on duplicate headers"
    (is (thrown? RuntimeException (ds/->dataset duplicate-headers-file))))
  (testing "that custom postfix-fn works correctly"
    (let [ds (ds/->dataset duplicate-headers-file
                           {:ensure-unique-column-names? true
                            :unique-column-name-fn (fn [col-idx colname] (str colname "::" col-idx))})]
      (is (some? (ds/column ds "column::2")))
      (is (some? (ds/column ds "column::4")))
      (is (some? (ds/column ds "column-1::6"))))))
+99
View File
@@ -0,0 +1,99 @@
(ns tech.v3.libs.fastexcel-test
(:require [tech.v3.libs.fastexcel :as xlsx-parse]
[tech.v3.dataset :as ds]
[tech.v3.datatype :as dtype]
[clojure.test :refer [deftest is testing]]))
;; Spreadsheet fixtures shared by the tests below.
(def xls-file "test/data/file_example_XLS_1000.xls")
(def xlsx-file "test/data/file_example_XLSX_1000.xlsx")
(def sparse-file "test/data/sparsefile.xlsx")
(def stocks-file "test/data/stocks.xlsx")
;; stocks.xlsx variant whose date column fails datetime parsing.
(def stocks-bad-date-file "test/data/stocks-bad-date.xlsx")
;; Workbook whose header row repeats column names.
(def duplicate-headers-file "test/data/duplicate-headers.xlsx")
(deftest happy-path-parse-test
  ;; First sheet of the sample workbook: 1000 rows x 8 columns of
  ;; float64/string data with the expected header names.
  (let [sheet (first (xlsx-parse/workbook->datasets xlsx-file))
        colnames (set (ds/column-names sheet))
        dtypes (into #{} (map dtype/get-datatype) (ds/columns sheet))]
    (is (= #{"column-0" "Age" "Country" "First Name" "Gender" "Date" "Last Name" "Id"}
           colnames))
    (is (= #{:float64 :string} dtypes))
    (is (= 1000 (ds/row-count sheet)))
    (is (= 8 (ds/column-count sheet)))))
(deftest sparse-file-parse-test
  ;; 8x8 sheet where several columns are entirely empty: empty columns show
  ;; all-row missing sets and the six populated cells survive in order.
  (let [ds (first (xlsx-parse/workbook->datasets sparse-file))]
    (is (= 8 (ds/row-count ds)))
    (is (= 8 (ds/column-count ds)))
    (is (every? #(= (set (range 8)) %)
                (map (comp set ds/missing ds) ["column-0" "a" "column-6"])))
    (is (= [1.0 1.0 1.0 "a" 2.0 23.0]
           (->> (ds/columns ds)
                (mapcat (comp dtype/->reader ds/drop-missing))
                vec)))))
(deftest datetime-test
  ;; A parser-fn override forces the date column to packed-local-date.
  (let [sheet (first (xlsx-parse/workbook->datasets
                      stocks-file
                      {:parser-fn {"date" :packed-local-date}}))
        date-col (sheet "date")]
    (is (= :packed-local-date (dtype/get-datatype date-col)))))
(deftest bad-datetime-test
  ;; Unparseable dates fall back to a string column; all 29 cells are Strings.
  (let [sheet (first (xlsx-parse/workbook->datasets stocks-bad-date-file))
        date-col (sheet "date")]
    (is (= :string (dtype/get-datatype date-col)))
    (is (= {java.lang.String 29}
           (frequencies (map type date-col))))))
(deftest skip-rows-test
  ;; :n-initial-skip-rows jumps past preamble rows so the real header row is
  ;; used; a trailing empty column still gets a generated name.
  (let [ds (ds/->dataset "test/data/holdings-daily-us-en-mdy.xlsx"
                         {:n-initial-skip-rows 4
                          :parser-fn {"Identifier" :string
                                      "Weight" :float64}})]
    ;;column-8 had no data
    (is (= #{:float64 :string :boolean}
           (set (map dtype/get-datatype (vals ds)))))
    (is (= ["Name"
            "Ticker"
            "Identifier"
            "SEDOL"
            "Weight"
            "Sector"
            "Shares Held"
            "Local Currency"
            "column-8"]
           (vec (ds/column-names ds))))))
(deftest ensure-unique-headers-test
  (testing "that all headers are are forced to be unique"
    ;; `(is expr msg)` treats its second argument as a failure message, so
    ;; the previous `(is (ds/column-count ds) 7)` forms only asserted
    ;; truthiness; assert the actual counts instead.
    (let [ds (ds/->dataset duplicate-headers-file
                           {:ensure-unique-column-names? true})]
      (is (= 7 (ds/column-count ds)))
      (is (= 7 (count (set (ds/column-names ds))))))
    (let [ds (first (xlsx-parse/workbook->datasets duplicate-headers-file
                                                   {:ensure-unique-column-names? true}))]
      (is (= 7 (ds/column-count ds)))
      (is (= 7 (count (set (ds/column-names ds)))))))
  (testing "that exception is thrown on duplicate headers"
    (is (thrown? RuntimeException (ds/->dataset duplicate-headers-file)))
    (is (thrown? RuntimeException (xlsx-parse/workbook->datasets duplicate-headers-file))))
  (testing "that custom postfix-fn works correctly"
    (let [ds (ds/->dataset duplicate-headers-file
                           {:ensure-unique-column-names? true
                            :unique-column-name-fn (fn [col-idx colname] (str colname "::" col-idx))})]
      (is (some? (ds/column ds "column::2")))
      (is (some? (ds/column ds "column::4")))
      (is (some? (ds/column ds "column-1::6"))))))
(deftest number-colname
  ;; A numeric header cell becomes a double column name, not a string.
  (let [loaded (ds/->dataset "test/data/number_column.xlsx")]
    (is (= 0.0 (first (ds/column-names loaded))))))
+124
View File
@@ -0,0 +1,124 @@
(ns tech.v3.libs.parquet-test
(:require [tech.v3.dataset :as ds]
[tech.v3.datatype :as dtype]
[tech.v3.datatype.functional :as dfn]
[tech.v3.libs.parquet :as parquet]
[tech.v3.dataset.utils :as ds-utils]
[tech.v3.dataset.column :as ds-col]
[tech.v3.datatype.datetime :as dtype-dt]
[clojure.test :refer [deftest is]]))
;; Quiet verbose parquet logging during the test run.
(ds-utils/set-slf4j-log-level :info)
(deftest stocks-test
  ;; CSV -> parquet -> dataset round trip: strings, doubles and dates all
  ;; survive.  The symbol column is normalized through `str` before
  ;; comparison -- presumably it reads back as a text-like type; confirm.
  (try
    (let [stocks (ds/->dataset "test/data/stocks.csv")
          _ (ds/write! stocks "stocks.parquet")
          stocks-p (ds/->dataset "stocks.parquet")]
      (is (= (vec (stocks "symbol"))
             (mapv str (stocks-p "symbol"))))
      (is (dfn/equals (stocks "price")
                      (stocks-p "price")))
      (is (= (vec (stocks "date"))
             (vec (stocks-p "date")))))
    (finally
      (.delete (java.io.File. "stocks.parquet")))))
(deftest userdata1-test
  ;; parquet -> parquet -> nippy chain: datetime and nullable text columns
  ;; must survive each serialization step unchanged.
  (try
    (let [testd (ds/->dataset "test/data/parquet/userdata1.parquet")
          _ (ds/write! testd "userdata1.parquet")
          newd (ds/->dataset "userdata1.parquet")
          _ (ds/write! newd "userdata1.nippy")
          nippy-d (ds/->dataset "userdata1.nippy")]
      (is (= (vec (testd "registration_dttm"))
             (vec (newd "registration_dttm"))))
      (is (= (vec (testd "comments"))
             (vec (newd "comments"))))
      (is (= (vec (testd "comments"))
             (vec (nippy-d "comments")))))
    (finally
      (.delete (java.io.File. "userdata1.parquet"))
      (.delete (java.io.File. "userdata1.nippy")))))
(deftest whitelist-test
  ;; :column-whitelist restricts the parse to exactly the named columns.
  (let [selected ["first_name" "last_name" "gender"]
        loaded (ds/->dataset "test/data/parquet/userdata1.parquet"
                             {:column-whitelist selected})]
    (is (= 3 (ds/column-count loaded)))))
(deftest ames-ds
  ;; CSV -> parquet round trip preserving the missing set, a string column
  ;; and the numeric target column.
  (try
    (let [ames (ds/->dataset "test/data/ames-house-prices/train.csv")
          _ (ds/write! ames "ames.parquet")
          newd (ds/->dataset "ames.parquet")]
      (is (= (ds/missing (ames "LotFrontage"))
             (ds/missing (newd "LotFrontage"))))
      (is (= (vec (ames "CentralAir"))
             (vec (newd "CentralAir"))))
      (is (dfn/equals (ames "SalePrice") (newd "SalePrice"))))
    (finally
      (.delete (java.io.File. "ames.parquet")))))
(deftest uuid-test
  ;; A uuid-parsed column written to parquet reads back as :uuid when the
  ;; parser is told to expect uuids again.
  (try
    (let [uuid-ds (ds/->dataset "test/data/uuid.parquet"
                                {:parser-fn {"uuids" :uuid}})
          _ (ds/write! uuid-ds "test-uuid.parquet")
          new-ds (ds/->dataset "test-uuid.parquet"
                               {:parser-fn {"uuids" :uuid}})]
      (is (= :uuid ((comp :datatype meta) (uuid-ds "uuids"))))
      (is (= :uuid ((comp :datatype meta) (new-ds "uuids")))))
    (finally
      (.delete (java.io.File. "test-uuid.parquet")))))
(deftest missing-uint8-data
  ;;Use a large enough value that the system is forced to use uint8 columns else
  ;;it will default to int8 columns based on the column data min/max
  (let [ds (ds/->dataset {:a (dtype/make-container :uint8 [10 20 245])})
        ;; NOTE(review): index 5 is out of range for a 3-row column and the
        ;; assertions below only ever see [1] -- presumably set-missing drops
        ;; out-of-range indexes; confirm.
        ds (ds/update-column ds :a #(ds-col/set-missing % [1 5]))]
    (try
      (parquet/ds->parquet ds "test.parquet")
      (let [nds (ds/->dataset "test.parquet" {:key-fn keyword})]
        (is (= 3 (ds/row-count nds)))
        (is (= [1] (vec (dtype/->reader (ds/missing ds)))))
        (is (= :uint8 (dtype/elemwise-datatype (ds :a))))
        (is (= :uint8 (dtype/elemwise-datatype (nds :a))))
        (is (= [1] (vec (dtype/->reader (ds/missing nds))))))
      (finally
        (.delete (java.io.File. "test.parquet"))))))
(deftest nested-parquet
  ;; Nested map columns flatten into dotted key_value column paths on read.
  (let [loaded (ds/->dataset "test/data/nested.parquet")
        col-vec (fn [colname] (vec (loaded colname)))]
    (is (= [1 nil 2 nil 3 nil nil] (col-vec "id")))
    (is (= ["a" "b" "a" "b" "a" "b" "c"] (col-vec "val.key_value.key")))
    (is (= ["va" "vb" nil nil "vb" nil nil] (col-vec "val2.key_value.key")))))
(deftest local-time
  ;; LocalTime values must survive a parquet round trip.
  (try
    (let [ds (ds/->dataset {:a (range 10)
                            :b (repeat 10 (java.time.LocalTime/now))})
          _ (parquet/ds->parquet ds "test.parquet")
          pds (ds/->dataset "test.parquet" {:key-fn keyword})]
      (is (= (vec (ds :b))
             (vec (pds :b)))))
    (finally
      (.delete (java.io.File. "test.parquet")))))
(deftest decimaltable
  ;; Parquet decimal columns read back numerically equal to their values.
  (let [loaded (ds/->dataset "test/data/decimaltable.parquet")]
    (is (dfn/equals [3.420 1.246] (loaded "decimals")))))
;; Regression for issue #401: a parquet file with a missing column must still
;; yield all 4 columns.  ("paruet" typo is preserved in the test name.)
(deftest issue-401-paruet-missing-column
  (is (= 4 (ds/column-count (ds/->dataset "test/data/2024-03-03.parquet")))))
+115
View File
@@ -0,0 +1,115 @@
(ns tech.v3.libs.poi-test
(:require [tech.v3.libs.poi :as xlsx-parse]
[tech.v3.dataset :as ds]
[tech.v3.dataset.column :as ds-col]
[tech.v3.datatype.functional :as dfn]
[tech.v3.datatype :as dtype]
[clojure.test :refer [deftest is testing]]))
;; Spreadsheet fixtures shared by the tests below.
(def xls-file "test/data/file_example_XLS_1000.xls")
(def xlsx-file "test/data/file_example_XLSX_1000.xlsx")
(def sparse-file "test/data/sparsefile.xlsx")
(def stocks-file "test/data/stocks.xlsx")
;; Workbook whose header row repeats column names.
(def duplicate-headers-file "test/data/duplicate-headers.xls")
(deftest happy-path-parse-test
  ;; Parse the same workbook twice to verify repeated reads are stable and
  ;; produce identical names, shapes and numeric data.
  (let [ds (first (xlsx-parse/workbook->datasets xlsx-file))
        ds2 (first (xlsx-parse/workbook->datasets xlsx-file))]
    (is (= #{"column-0" "Age" "Country" "First Name" "Gender" "Date" "Last Name" "Id"}
           (set (ds/column-names ds))))
    (is (= #{"column-0" "Age" "Country" "First Name" "Gender" "Date" "Last Name" "Id"}
           (set (ds/column-names ds2))))
    (is (= #{:float64 :string}
           (set (map dtype/get-datatype (ds/columns ds)))))
    (is (= 1000 (ds/row-count ds)))
    (is (= 1000 (ds/row-count ds2)))
    (is (= 8 (ds/column-count ds)))
    (is (= 8 (ds/column-count ds2)))
    (is (dfn/equals (ds "Age") (ds2 "Age")))
    (is (dfn/equals (ds "Id") (ds2 "Id")))))
(deftest sparse-file-parse-test
  ;; 8x8 sheet where several columns are entirely empty: empty columns show
  ;; all-row missing sets and the six populated cells survive in order.
  (let [ds (first (xlsx-parse/workbook->datasets sparse-file))]
    (is (= 8 (ds/row-count ds)))
    (is (= 8 (ds/column-count ds)))
    (is (every? #(= (set (range 8)) %)
                (map (comp set ds-col/missing ds) ["column-0" "a" "column-6"])))
    (is (= [1.0 1.0 1.0 "a" 2.0 23.0]
           (->> (ds/columns ds)
                (mapcat (comp dtype/->reader ds/drop-missing))
                vec)))))
(deftest datetime-test
  ;; A parser-fn override forces the date column to packed-local-date.
  (let [sheet (first (xlsx-parse/workbook->datasets
                      stocks-file
                      {:parser-fn {"date" :packed-local-date}}))
        date-col (sheet "date")]
    (is (= :packed-local-date (dtype/get-datatype date-col)))))
(deftest custom-parser-test
  ;; A [datatype format-string] parser-fn pair parses the Date column with an
  ;; explicit dd/MM/yyyy pattern.
  (let [sheet (first (xlsx-parse/workbook->datasets
                      xls-file
                      {:parser-fn {"Date" [:local-date "dd/MM/yyyy"]}}))]
    (is (= :local-date (dtype/get-datatype (sheet "Date"))))))
(deftest integer-field-test
  ;; A parser-fn override forces the Id column to int64.
  (let [sheet (first (xlsx-parse/workbook->datasets
                      xls-file
                      {:parser-fn {"Id" :int64}}))]
    (is (= :int64 (dtype/get-datatype (sheet "Id"))))))
(deftest xls-keyword-colnames
  ;; :key-fn keyword applied to xls headers; all but the first column name
  ;; become keywords.
  (let [ds (first (xlsx-parse/workbook->datasets
                   xls-file
                   {:key-fn keyword}))]
    ;;The first column is an integer so keyword returns nil for that.
    ;;This is also a good example in that the system produces keywords with spaces
    ;;in them...that definitely isn't ideal.
    (is (every? keyword? (rest (ds/column-names ds))))))
(deftest key-fn-number-columns
  ;; :key-fn keyword must not produce nil column names, even for headers
  ;; containing spaces.
  (let [sheet (first (xlsx-parse/workbook->datasets xlsx-file {:key-fn keyword}))
        names (ds/column-names sheet)]
    (is (= 0 (count (filter nil? names))))
    (is (= #{:column-0 :Age :Country (keyword "First Name") :Gender :Date
             (keyword "Last Name") (keyword "Id")}
           (set names)))))
(deftest auto-infer-dates
  ;; Date cells are auto-detected without any parser-fn hints.
  (let [sheet (first (xlsx-parse/workbook->datasets "test/data/stocks-with-dates.xlsx"))
        dtypes (set (map (comp :datatype meta) (vals sheet)))]
    (is (= #{:string :packed-local-date :float64} dtypes))))
(deftest ensure-unique-headers-test
  (testing "that all headers are are forced to be unique"
    ;; `(is expr msg)` treats its second argument as a failure message, so
    ;; the previous `(is (ds/column-count ds) 7)` forms only asserted
    ;; truthiness; assert the actual counts instead.
    (let [ds (ds/->dataset duplicate-headers-file
                           {:ensure-unique-column-names? true})]
      (is (= 7 (ds/column-count ds)))
      (is (= 7 (count (set (ds/column-names ds))))))
    (let [ds (first (xlsx-parse/workbook->datasets duplicate-headers-file
                                                   {:ensure-unique-column-names? true}))]
      (is (= 7 (ds/column-count ds)))
      (is (= 7 (count (set (ds/column-names ds)))))))
  (testing "that exception is thrown on duplicate headers"
    (is (thrown? RuntimeException (ds/->dataset duplicate-headers-file)))
    (is (thrown? RuntimeException (xlsx-parse/workbook->datasets duplicate-headers-file))))
  (testing "that custom postfix-fn works correctly"
    (let [ds (ds/->dataset duplicate-headers-file
                           {:ensure-unique-column-names? true
                            :unique-column-name-fn (fn [col-idx colname] (str colname "::" col-idx))})]
      (is (some? (ds/column ds "column::2")))
      (is (some? (ds/column ds "column::4")))
      (is (some? (ds/column ds "column-1::6"))))))