init research
This commit is contained in:
@@ -0,0 +1,189 @@
|
||||
(ns tech.v3.dataset.categorical-test
|
||||
(:require [tech.v3.dataset.categorical :as ds-cat]
|
||||
[tech.v3.dataset.modelling :as ds-mod]
|
||||
[tech.v3.datatype :as dtype]
|
||||
[tech.v3.dataset.column-filters :as cf]
|
||||
[clojure.test :refer [deftest is] :as t]
|
||||
[tech.v3.dataset :as ds]))
|
||||
|
||||
|
||||
(deftest prediction
  ;; A probability-distribution dataset is collapsed into a label column,
  ;; then the categorical mapping is reversed back to keyword labels.
  (let [probs    (ds/->dataset {:yes [0.3 0.5] :no [0.7 0.5]})
        labeled  (ds-mod/probability-distributions->label-column probs :val)
        decoded  (ds-cat/reverse-map-categorical-xforms labeled)]
    (is (= [:no :yes] (:val decoded)))))
|
||||
|
||||
|
||||
|
||||
(deftest prob-dist
  ;; Reversing the categorical xform keeps the original probability columns
  ;; intact while :val carries the winning label for each row.
  (let [prob (-> {:yes [0.3 0.5] :no [0.7 0.5]}
                 (ds/->dataset)
                 (ds-mod/probability-distributions->label-column :val)
                 (ds-cat/reverse-map-categorical-xforms))]
    (is (= [0.3 0.5] (:yes prob)))
    (is (= [0.7 0.5] (:no prob)))
    (is (= [:no :yes] (:val prob)))))
|
||||
|
||||
|
||||
|
||||
(deftest cat-to-number
  ;; String categories are encoded onto the distinct numeric codes 0 and 1.
  (let [coded (-> (ds/->dataset {:x [:a :b] :y ["1" "0"]})
                  (ds/categorical->number [:y])
                  :y)]
    (is (= #{0 1} (set coded)))))
|
||||
|
||||
|
||||
|
||||
|
||||
(defn- cat->num
  "Encode a fixed [:a :b :c :d] column using `table-args` as the seed
  lookup table, then return the inverted table (numeric code -> category)."
  [table-args]
  (let [encoded (-> (ds/->dataset {:y [:a :b :c :d]})
                    (ds/categorical->number [:y] table-args))]
    (clojure.set/map-invert
     (-> encoded :y meta :categorical-map :lookup-table))))
|
||||
|
||||
|
||||
(deftest test-categorical->number
  ;; NOTE: `deftest` takes no argument vector -- the original carried a stray
  ;; [] after the test name which was evaluated as a dead expression; removed.
  ;; Explicitly listed [category code] pairs pin those codes; categories not
  ;; listed receive the remaining free codes (see assertions below).
  (is (= {5 :a, 2 :b, 0 :d, 1 :c}
         (cat->num [[:a 5] [:b 2]])))
  (is (= {5 :a, 0 :b, 1 :d, 2 :c}
         (cat->num [[:a 5] [:b 0]])))
  (is (= (cat->num [])
         {0 :d, 1 :c, 2 :a, 3 :b}))
  (is (= (cat->num [[:not-present 1]])
         {1 :not-present, 0 :d, 2 :c, 3 :a, 4 :b}))
  (is (= (cat->num [[:a 1 :b 1]])
         {1 :a, 0 :d, 2 :c, 3 :b})))
|
||||
|
||||
|
||||
(deftest cat-map-regression
  ;; Mapping "Survived" 0/1 onto keywords and re-encoding every categorical
  ;; column must yield only finite numeric codes.
  (is (every? #(Double/isFinite %)
              (-> (ds/->dataset "test/data/titanic.csv")
                  (ds/update-column "Survived"
                                    (fn [col]
                                      (dtype/emap {0 :drowned 1 :survived}
                                                  :keyword col)))
                  (ds/categorical->number cf/categorical)
                  (ds/column "Survived")))))
|
||||
(deftest categorical-assignments-are-integers
  ;; Four distinct categories must be coded as exactly the integers 0-3.
  (let [coded (-> (ds/->dataset {:x1 [1 2 4 5 6 5 6 7]
                                 :x2 [5 6 6 7 8 2 4 6]
                                 :y  [:a :b :b :a :c :a :b :d]})
                  (ds/categorical->number [:y])
                  :y)]
    (is (= #{0 1 2 3} (set (distinct coded))))))
|
||||
|
||||
|
||||
(defn- =-invert-cat
  "Build a two-row :target dataset from the given values, invert it through a
  categorical map {:one lookup-one, :two lookup-two}, and return true when
  the inverted :target column equals `expected-result`."
  [target-1 target-2 lookup-one lookup-two result-datatype expected-result]
  (let [ds       (ds/->dataset {:target [target-1 target-2]})
        cat-map  {:lookup-table {:one lookup-one
                                 :two lookup-two}
                  :src-column :target
                  :result-datatype result-datatype}
        inverted (ds-cat/invert-categorical-map ds cat-map)]
    (= expected-result (:target inverted))))
|
||||
|
||||
(deftest invert-cat--works
  ;; Exact integer targets invert cleanly.
  (is (=-invert-cat 1 2
                    1 2
                    :int [:one :two]))
  ;; TODO - should pass ?
  (is (=-invert-cat 1.0 2.0
                    1 2
                    :int [:one :two]))
  ;; TODO - should pass ?
  (is (=-invert-cat 1.99999 2.99999
                    1 2
                    :int [:one :two]))
  ;; TODO - should pass ?
  (is (=-invert-cat 1.2 1.3
                    1 2
                    :int [:one :one])))
|
||||
|
||||
(deftest invert-cat--throws
  ;; Float lookup values cannot be resolved back to source values:
  ;; "Unable to find src value for numeric value 1.0"
  (is (thrown? Exception
               (=-invert-cat 1.0 2.0
                             1.0 2.0
                             :float [:one :two])))
  ;; Targets 1/2 have no entry in a {4 5} lookup table:
  ;; "Unable to find src value for numeric value 1"
  (is (thrown? Exception
               (=-invert-cat 1 2
                             4 5
                             :int [:one :two])))
  ;; Integer targets against float lookup values also fail to resolve:
  ;; "Unable to find src value for numeric value 1"
  (is (thrown? Exception
               (=-invert-cat 1 2
                             1.0 2.0
                             :int [:one :two]))))
|
||||
|
||||
|
||||
(defn- is-roundtrip-ok
  "Train a categorical map on [:a :b :c], attach that map to a column built
  from `raw-model-prediction`, reverse the mapping, and assert the decoded
  labels come out as [:c :a :b]."
  [raw-model-prediction]
  (let [train-ds      (-> (ds/->dataset {:target [:a :b :c]})
                          (ds/categorical->number [:target]))
        cat-map       (-> train-ds :target meta :categorical-map)
        prediction-ds (-> (ds/->dataset {:target raw-model-prediction})
                          (ds/assoc-metadata [:target] :categorical-map cat-map)
                          (ds-cat/reverse-map-categorical-xforms))]
    (is (= [:c :a :b] (:target prediction-ds)))))
|
||||
|
||||
|
||||
(deftest round-trip
  ;; only this should pass
  (is-roundtrip-ok [0 1 2])
  ;; currently these all pass, while I would like them to all fail
  (is-roundtrip-ok [0.0 1.2 2.2])
  (is-roundtrip-ok [0.9 1.9 2.9])
  (is-roundtrip-ok (float-array [0 1 2]))
  (is-roundtrip-ok (float-array [0 1.9 2.9]))
  (is-roundtrip-ok (double-array [0 1.5 2.2])))
|
||||
|
||||
@@ -0,0 +1,62 @@
|
||||
(ns tech.v3.dataset.datetime-test
|
||||
(:require [tech.v3.dataset :as ds]
|
||||
[tech.v3.datatype :as dtype]
|
||||
[tech.v3.datatype.datetime :as dtype-dt]
|
||||
[clojure.test :refer [deftest is]]))
|
||||
|
||||
|
||||
(deftest epoch-millis-second-maps
  ;; After converting "date" to epoch milliseconds, map-seq rows expose the
  ;; value as a plain number.
  (let [rows (-> (ds/->dataset "test/data/stocks.csv")
                 (ds/update-column "date" dtype-dt/datetime->milliseconds)
                 (ds/mapseq-reader))]
    (is (number? (get (first rows) "date")))))
|
||||
|
||||
|
||||
(deftest datetime-column-datatype-test
  ;; A sub-buffer of a packed date column keeps the :packed-local-date dtype.
  (let [stocks (ds/->dataset "test/data/stocks.csv")
        window (-> (stocks "date")
                   (dtype/->reader)
                   (dtype/sub-buffer 0 20))]
    (is (= :packed-local-date (dtype/get-datatype window)))))
|
||||
|
||||
|
||||
(deftest stocks-descriptive-stats
  ;; min/mean/max computed over a date column must themselves be
  ;; datetime-typed values.
  (let [stocks   (ds/->dataset "test/data/stocks.csv")
        date-row (->> (ds/filter-column (ds/descriptive-stats stocks)
                                        :col-name #(= "date" %))
                      (ds/mapseq-reader)
                      (first))]
    (is (every? dtype-dt/datetime-datatype?
                (map dtype/get-datatype
                     (vals (select-keys date-row [:min :mean :max])))))))
|
||||
|
||||
|
||||
(deftest stocks-descriptive-stats-2
  ;; Same invariant as stocks-descriptive-stats, but over instants and the
  ;; full stat-name set, additionally checking quartiles.
  (let [stocks     (-> (ds/->dataset "test/data/stocks.csv")
                       (ds/update-column "date"
                                         (partial dtype/emap
                                                  dtype-dt/local-date->instant
                                                  :instant)))
        desc-stats (ds/descriptive-stats
                    stocks {:stat-names (ds/all-descriptive-stats-names)})
        date-row   (->> (ds/filter-column desc-stats :col-name #(= "date" %))
                        (ds/mapseq-reader)
                        (first))]
    (is (every? dtype-dt/datetime-datatype?
                (map dtype/get-datatype
                     (vals (select-keys date-row
                                        [:min :mean :max
                                         :quartile-1 :quartile-3])))))))
|
||||
|
||||
|
||||
(deftest datetime-shenanigans-1
  ;; LocalDateTime values survive a round trip into a dataset column,
  ;; whether read back via nth or via dtype/get-value.
  (let [dt-a (java.time.LocalDateTime/of 2020 01 01 11 22 33)
        dt-b (java.time.LocalDateTime/of 2020 10 01 01 01 01)
        col  (ds/column (ds/->dataset {:dt [dt-a dt-b]}) :dt)]
    (is (= dt-a (nth col 0)))
    (is (= dt-a (dtype/get-value col 0)))))
|
||||
@@ -0,0 +1,57 @@
|
||||
(ns tech.v3.dataset.format-sequence-test
|
||||
(:require [tech.v3.dataset.format-sequence :refer [format-sequence]]
|
||||
[clojure.test :refer [deftest is]]))
|
||||
|
||||
|
||||
;; Fixture sequences for format-sequence: magnitudes spanning 1e-6..1e5 (a),
;; varying decimal precision (b), a fractional range (c), extreme exponents
;; (d/e/f), small decimals (g/h), special values incl. NaN/Inf/nil (i), and
;; single-precision floats (j).
(def a [0.000001 0.00001 0.0001 0.001 0.01 0.1 0.0
        1.0 10.0 100.0 1000.0 10000.0 100000.0])
(def b [10.0 10.1 10.11 10.111 10.1111 10.11111
        1.0 1.1 1.11 1.111 1.1111 1.11111
        0.0 0.1 0.11 0.111 0.1111 -0.11111])
(def c (range -5 4 0.8795833))
(def d [-1.0e-20 -1.334e-100 3.43e100 4.556e20
        1.0e-20 1.334e-100 -3.43e100 -41.556e20
        0.999e-300 -0.999e300])
(def e [-1.0e99 1.0e99])
(def f [-1.0e100 1.0e100])
(def g [0.002 0.0002 0.000333 0.1 -0.0003 0.0])
(def h [0.002 0.0002 0.00333 0.00001 -0.0003 0.022 0.0001])
(def i [10.0 ##NaN ##Inf ##-Inf 100 0.001 nil])
(def j (map float [39.81 36.35 43.22 28.37 25.45
                   -39.81 36.351 43.221 28.371 25.451]))
|
||||
|
||||
(deftest regression-tests
  ;; Pinned output strings for format-sequence over the fixture vectors; any
  ;; change in padding, rounding, or notation selection fails these.
  (is (= (format-sequence j)
         '(" 39.810" " 36.350" " 43.220" " 28.370" " 25.450" "-39.810" " 36.351" " 43.221" " 28.371" " 25.451")))
  (is (= (format-sequence i 0 0)
         '("1.0E+01" " NaN" " Inf" " -Inf" "1.0E+02" "1.0E-03" " NaN")))
  (is (= (format-sequence a)
         '(" 0.000001" " 0.000010" " 0.000100" " 0.001000" " 0.010000" " 0.100000" " 0.000000" " 1.000000" " 10.000000" " 100.000000" " 1000.000000" " 10000.000000" "100000.000000")))
  (is (= (format-sequence a 5 4)
         '("1.0E-06" "1.0E-05" "1.0E-04" "1.0E-03" "1.0E-02" "1.0E-01" "0.0E+00" "1.0E+00" "1.0E+01" "1.0E+02" "1.0E+03" "1.0E+04" "1.0E+05")))
  (is (= (format-sequence b)
         '("10.00000" "10.10000" "10.11000" "10.11100" "10.11110" "10.11111" " 1.00000" " 1.10000" " 1.11000" " 1.11100" " 1.11110" " 1.11111" " 0.00000" " 0.10000" " 0.11000" " 0.11100" " 0.11110" "-0.11111")))
  (is (= (format-sequence b 5 2)
         '(" 1.00000E+01" " 1.01000E+01" " 1.01100E+01" " 1.01110E+01" " 1.01111E+01" " 1.01111E+01" " 1.00000E+00" " 1.10000E+00" " 1.11000E+00" " 1.11100E+00" " 1.11110E+00" " 1.11111E+00" " 0.00000E+00" " 1.00000E-01" " 1.10000E-01" " 1.11000E-01" " 1.11100E-01" "-1.11110E-01")))
  (is (= (format-sequence c)
         '("-5.0000000" "-4.1204167" "-3.2408334" "-2.3612501" "-1.4816668" "-0.6020835" " 0.2774998" " 1.1570831" " 2.0366664" " 2.9162497" " 3.7958330")))
  (is (= (format-sequence c 4)
         '("-5.0000" "-4.1204" "-3.2408" "-2.3613" "-1.4817" "-0.6021" " 0.2775" " 1.1571" " 2.0367" " 2.9162" " 3.7958")))
  (is (= (format-sequence c 4 0)
         '("-5.0000E+00" "-4.1204E+00" "-3.2408E+00" "-2.3613E+00" "-1.4817E+00" "-6.0208E-01" " 2.7750E-01" " 1.1571E+00" " 2.0367E+00" " 2.9162E+00" " 3.7958E+00")))
  (is (= (format-sequence d)
         '("-1.0000E-020" "-1.3340E-100" " 3.4300E+100" " 4.5560E+020" " 1.0000E-020" " 1.3340E-100" "-3.4300E+100" "-4.1556E+021" " 9.9900E-301" "-9.9900E+299")))
  (is (= (format-sequence e)
         '("-1.0E+99" " 1.0E+99")))
  (is (= (format-sequence f)
         '("-1.0E+100" " 1.0E+100")))
  (is (= (format-sequence g)
         '(" 0.002000" " 0.000200" " 0.000333" " 0.100000" "-0.000300" " 0.000000")))
  (is (= (format-sequence h)
         '(" 0.00200" " 0.00020" " 0.00333" " 0.00001" "-0.00030" " 0.02200" " 0.00010")))
  (is (= (format-sequence i)
         '(" 10.000" " NaN" " Inf" " -Inf" "100.000" " 0.001" " NaN")))
  (is (= (format-sequence i 0 0)
         '("1.0E+01" " NaN" " Inf" " -Inf" "1.0E+02" "1.0E-03" " NaN")))
  (is (= (format-sequence j)
         '(" 39.810" " 36.350" " 43.220" " 28.370" " 25.450" "-39.810" " 36.351" " 43.221" " 28.371" " 25.451"))))
|
||||
@@ -0,0 +1,37 @@
|
||||
(ns tech.v3.dataset.github-test
|
||||
(:require [tech.v3.dataset :as ds]
|
||||
[tech.v3.datatype :as dtype]
|
||||
[clojure.test :refer [deftest is]]))
|
||||
|
||||
|
||||
|
||||
(comment
  ;; REPL-only exploration block -- nothing here is evaluated at load time.

  ;;This sometimes returns a 500 error.
  (deftest load-github-events
    (let [ds (ds/->dataset "https://api.github.com/events"
                           {:file-type :json
                            :key-fn keyword})]
      (is (every? keyword? (ds/column-names ds)))
      (is (= [8 30] (dtype/shape ds)))))

  (do
    (require '[tech.v3.datatype.functional :as dfn])
    (require '[tech.v3.datatype.argops :as argops])
    (require '[tech.v3.datatype.unary-pred :as un-pred])
    (defonce flights (ds/->dataset "https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv")))

  ;; Count flights whose combined delay is negative via boolean indexes.
  (time (-> (dfn/+ (flights "arr_delay")
                   (flights "dep_delay"))
            (dfn/< 0)
            (un-pred/bool-reader->indexes)
            (dtype/ecount)))

  ;;Another way to get the same result is to use summation. Booleans are
  ;;interpreted very specifically below where false is 0 and 1 is true.
  ;;Double summation is very fast.
  (time (-> (dfn/+ (flights "arr_delay")
                   (flights "dep_delay"))
            (dfn/< 0)
            (dfn/sum))))
|
||||
@@ -0,0 +1,45 @@
|
||||
(ns tech.v3.dataset.infer-test
|
||||
(:require [tech.v3.dataset :as ds]
|
||||
[tech.v3.datatype :as dtype]
|
||||
[tech.v3.datatype.functional :as dfn]
|
||||
[tech.v3.datatype.bitmap :as bitmap]
|
||||
[clojure.test :refer [deftest is]]))
|
||||
|
||||
|
||||
(deftest simple-inference
  ;; Column datatype and values are inferred from whatever is assoc'd onto an
  ;; empty dataset; nil entries become missing values unless an explicit
  ;; :missing bitmap overrides that.
  (letfn [(inferred-equals [lhs rhs]
            (let [test-col (-> (ds/->dataset [])
                               (assoc :testdata lhs)
                               (:testdata))]
              (is (= (dtype/elemwise-datatype test-col)
                     (dtype/elemwise-datatype rhs)))
              (is (every? identity (dfn/eq test-col rhs)) (vec lhs))))]
    (inferred-equals [true false true false]
                     (boolean-array [true false true false]))
    (inferred-equals (list 0 Double/NaN 1.0)
                     (double-array [0.0 Double/NaN 1.0]))
    (inferred-equals #:tech.v3.dataset{:data [1 2 3 nil 4]
                                       :force-datatype? true}
                     [1 2 3 nil 4])
    (inferred-equals (list 0 Double/NaN 1.0 nil nil)
                     (double-array [0.0 Double/NaN 1.0 Double/NaN Double/NaN]))
    ;; nils in plain data are tracked as missing indexes.
    (is (= #{2 4}
           (set (ds/missing (-> (ds/->dataset [])
                                (assoc :test-data [1 2 nil 3 nil]))))))
    (is (= #{2 4}
           (set (ds/missing
                 (-> (ds/->dataset [])
                     (assoc :test-data
                            #:tech.v3.dataset{:data [1 2 nil 3 nil]
                                              :force-datatype? true}))))))
    ;; An explicit empty :missing bitmap suppresses missing-value inference.
    (is (= #{}
           (set (ds/missing
                 (-> (ds/->dataset [])
                     (assoc :test-data
                            #:tech.v3.dataset{:data [1 2 nil 3 nil]
                                              :force-datatype? true
                                              :missing (bitmap/->bitmap)}))))))
    (is (= #{}
           (set (ds/missing
                 (-> (ds/->dataset [])
                     (assoc :test-data
                            #:tech.v3.dataset{:data [1 2 nil 3 nil]
                                              :missing (bitmap/->bitmap)}))))))))
|
||||
+401
@@ -0,0 +1,401 @@
|
||||
(ns tech.v3.dataset.join-test
|
||||
(:require [tech.v3.dataset :as ds]
|
||||
[tech.v3.dataset.join :as ds-join]
|
||||
[tech.v3.dataset.column :as ds-col]
|
||||
[tech.v3.datatype :as dtype]
|
||||
[tech.v3.datatype.packing :as packing]
|
||||
[tech.v3.datatype.functional :as dfn]
|
||||
[tech.v3.datatype.datetime :as dtype-dt]
|
||||
[clojure.test :refer [deftest is testing]])
|
||||
(:import [java.time LocalDate]))
|
||||
|
||||
|
||||
(deftest simple-join-test
  ;; hash-join over equal, longer-rhs, and longer-lhs key ranges.
  (let [dup-range (fn [n] (vec (mapcat (partial repeat 2) (range n))))]
    ;; Same key range on both sides: everything joins, nothing missing.
    (let [lhs (ds/->dataset {:a (range 10) :b (range 10)})
          rhs (ds/->dataset {:a (dup-range 10) :c (dup-range 10)})
          {:keys [inner rhs-missing]} (ds-join/hash-join :a lhs rhs)]
      (is (dfn/equals (inner :a) (inner :b)))
      (is (dfn/equals (inner :b) (inner :c)))
      (is (empty? (seq rhs-missing))))
    ;; rhs extends past lhs: rhs rows 20-29 have no lhs partner.
    (let [lhs (ds/->dataset {:a (range 10) :b (range 10)})
          rhs (ds/->dataset {:a (dup-range 15) :c (dup-range 15)})
          {:keys [inner rhs-missing]} (ds-join/hash-join [:b :c] lhs rhs
                                                         {:rhs-missing? true})]
      (is (dfn/equals (inner :a) (inner :b)))
      (is (dfn/equals (inner :b) (inner :right.a)))
      (is (= [20 21 22 23 24 25 26 27 28 29] (vec rhs-missing))))
    ;; lhs extends past rhs: lhs rows 10-14 have no rhs partner.
    (let [lhs (ds/->dataset {:a (range 15) :b (range 15)})
          rhs (ds/->dataset {:a (dup-range 10) :c (dup-range 10)})
          {:keys [inner lhs-missing]} (ds-join/hash-join :a lhs rhs
                                                         {:lhs-missing? true})]
      (is (dfn/equals (inner :a) (inner :b)))
      (is (dfn/equals (inner :b) (inner :c)))
      (is (= [10 11 12 13 14] (vec lhs-missing))))))
|
||||
|
||||
(defn lhs-customer-db
  "Three-row customer table (w3schools join sample); PostalCode is parsed
  as :int16."
  []
  (ds/->dataset [{"CustomerID" 1,
                  "CustomerName" "Alfreds Futterkiste",
                  "ContactName" "Maria Anders",
                  "Address" "Obere Str. 57",
                  "City" "Berlin",
                  "PostalCode" 12209,
                  "Country" "Germany"}
                 {"CustomerID" 2,
                  "CustomerName" "Ana Trujillo Emparedados y helados",
                  "ContactName" "Ana Trujillo",
                  "Address" "Avda. de la Constitución 2222",
                  "City" "México D.F.",
                  "PostalCode" 5021,
                  "Country" "Mexico"}
                 {"CustomerID" 3,
                  "CustomerName" "Antonio Moreno Taquería",
                  "ContactName" "Antonio Moreno",
                  "Address" "Mataderos 2312",
                  "City" "México D.F.",
                  "PostalCode" 5023,
                  "Country" "Mexico"}]
                {:parser-fn {"PostalCode" :int16}}))
|
||||
|
||||
(defn rhs-customer-db
  "Three-row orders table (w3schools join sample); numeric id columns are
  parsed as :int16."
  []
  (ds/->dataset [{"OrderID" 10308,
                  "CustomerID" 2,
                  "EmployeeID" 7,
                  "OrderDate" "1996-09-18",
                  "ShipperID" 3}
                 {"OrderID" 10309,
                  "CustomerID" 37,
                  "EmployeeID" 3,
                  "OrderDate" "1996-09-19",
                  "ShipperID" 1}
                 {"OrderID" 10310,
                  "CustomerID" 77,
                  "EmployeeID" 8,
                  "OrderDate" "1996-09-20",
                  "ShipperID" 2}]
                {:parser-fn {"OrderID" :int16
                             "CustomerID" :int16
                             "EmployeeID" :int16
                             "ShipperID" :int16}}))
|
||||
|
||||
|
||||
(deftest inner-join-test
  ;; Join metadata maps every source column on each side.
  (let [lhs    (lhs-customer-db)
        rhs    (rhs-customer-db)
        joined (ds-join/inner-join "CustomerID" lhs rhs)
        {:keys [left-column-names right-column-names]} (meta joined)]
    (is (= (count left-column-names) (ds/column-count lhs)))
    (is (= (count right-column-names) (ds/column-count rhs)))))
|
||||
|
||||
|
||||
;;sample from https://www.w3schools.com/sql/sql_join_left.asp
(deftest left-join-test
  (let [lhs           (lhs-customer-db)
        rhs           (rhs-customer-db)
        join-data     (ds-join/left-join "CustomerID" lhs rhs)
        recs          (ds/mapseq-reader join-data)
        empty-int?    #{-32768}
        empty-string? #{""}
        empty-val?    (fn [v] (or (empty-int? v) (empty-string? v) (nil? v)))
        realized      (some #(when (= 2 (get % "CustomerID")) %) recs)
        unrealized    (remove #(= % realized) recs)]
    (is (every? (complement empty-val?) (vals realized))
        "Ana's record should be fully realized.")
    (is (every? identity
                (for [{:strs [OrderID OrderDate ShipperID]} unrealized]
                  ;;We can't do order date because they are dates
                  (every? empty-val? [OrderID ShipperID])))
        "Everyone else should have missing entries from RHS.")
    (is (= (count (:left-column-names (meta join-data)))
           (ds/column-count lhs)))
    (is (= (count (:right-column-names (meta join-data)))
           (ds/column-count rhs)))))
|
||||
|
||||
|
||||
(deftest right-join-test
  ;; Every rhs CustomerID survives the right join; unmatched rows show up as
  ;; missing on the left-hand columns.
  (let [lhs       (lhs-customer-db)
        rhs       (rhs-customer-db)
        join-data (ds-join/right-join "CustomerID" lhs rhs)
        {:keys [left-column-names right-column-names]} (meta join-data)]
    (is (= #{2 37 77} (set (join-data "right.CustomerID"))))
    (is (= #{"Ana Trujillo" nil} (set (join-data "ContactName"))))
    (is (= #{5021 nil} (set (map #(when % (int %)) (join-data "PostalCode")))))
    (is (= #{1 2} (set (ds-col/missing (join-data "ContactName")))))
    (is (= #{1 2} (set (ds-col/missing (join-data "PostalCode")))))
    (is (= (count left-column-names) (ds/column-count lhs)))
    (is (= (count right-column-names) (ds/column-count rhs)))))
|
||||
|
||||
|
||||
(deftest duplicate-column-test
  ;; Self-joins must uniquify column names for both scalar and tuple keys.
  (let [test-ds       (ds/->dataset "test/data/ames-house-prices/train.csv"
                                    {:column-whitelist ["SalePrice" "1stFlrSF" "2ndFlrSF"]
                                     :n-records 5
                                     :parser-fn {:SalePrice :float32}})
        unique-names? (fn [jds]
                        (= (ds/column-count jds)
                           (count (distinct (ds/column-names jds)))))]
    (is (unique-names? (ds-join/inner-join "1stFlrSF" test-ds test-ds)))
    (is (unique-names? (ds-join/inner-join ["1stFlrSF" "2ndFlrSF"]
                                           test-ds test-ds)))))
|
||||
|
||||
|
||||
(deftest join-tuple-cname
  ;; Tuple-valued column names must not produce nil names in join output.
  (let [DS (ds/->dataset [{:a 11 [:a :b] 2}])]
    (doseq [joined [(ds-join/left-join :a DS DS)
                    (ds-join/right-join :a DS DS)
                    (ds-join/left-join [[:a :b] [:a :b]] DS DS)]]
      ;;no nil column names
      (is (every? identity (ds/column-names joined))))))
|
||||
|
||||
|
||||
(defn- drop-missing
  "Remove every row that has at least one missing value."
  [dataset]
  (ds/drop-rows dataset (ds/missing dataset)))
|
||||
|
||||
|
||||
(deftest asof-lt
  ;; Numeric as-of joins with the :< and :<= operators.
  (let [ds-a   (ds/->dataset {:a (range 10)})
        ds-b   (ds/->dataset {:a (dfn/* 2 (range 10))})
        ds-bm  (ds/->dataset {:a (dfn/- (dfn/* 2 (range 10)) 5)})
        ds-bmm (ds/->dataset {:a (dfn/- (dfn/* 2 (range 10)) 14)})
        asof   (fn [rhs op]
                 (vec ((ds-join/left-join-asof :a ds-a rhs {:asof-op op})
                       :right.a)))]
    (is (= [2 2 4 4 6 6 8 8 10 10] (asof ds-b :<)))
    (is (= [0 2 2 4 4 6 6 8 8 10] (asof ds-b :<=)))
    (is (= [1 3 3 5 5 7 7 9 9 11] (asof ds-bm :<)))
    (is (= [2 2 4 4 nil nil nil nil nil nil] (asof ds-bmm :<))))

  ;; The same joins expressed over (packed) local dates.
  (let [cur-date (dtype-dt/local-date)
        date-fn  #(when %
                    (dtype-dt/plus-temporal-amount cur-date % :days))
        ds-a     (ds/->dataset {:a (date-fn (range 10))})
        ds-b     (ds/->dataset {:a (date-fn (dfn/* 2 (range 10)))})
        ds-bm    (ds/->dataset {:a (date-fn (dfn/- (dfn/* 2 (range 10)) 5))})
        ds-bmm   (ds/->dataset {:a (date-fn (dfn/- (dfn/* 2 (range 10)) 14))})]
    (is (= (vec (date-fn [2 2 4 4 6 6 8 8 10 10]))
           (vec (packing/unpack
                 ((ds-join/left-join-asof :a ds-a ds-b {:asof-op :<})
                  :right.a)))))
    (is (= (date-fn [0 2 2 4 4 6 6 8 8 10])
           (vec (packing/unpack
                 ((ds-join/left-join-asof :a ds-a ds-b {:asof-op :<=})
                  :right.a)))))
    (is (= (date-fn [1 3 3 5 5 7 7 9 9 11])
           (vec (packing/unpack
                 ((ds-join/left-join-asof :a ds-a ds-bm {:asof-op :<})
                  :right.a)))))
    ;; Rows with no as-of partner are dropped before comparison here.
    (is (= (date-fn [2 2 4 4])
           (vec (packing/unpack
                 ((drop-missing (ds-join/left-join-asof
                                 :a ds-a ds-bmm {:asof-op :<}))
                  :right.a)))))))
|
||||
|
||||
|
||||
(deftest asof-gt
  ;; Numeric as-of joins with the :> and :>= operators; negative expected
  ;; values come from the shifted right-hand tables.
  (let [ds-a   (ds/->dataset {:a (range 10)})
        ds-b   (ds/->dataset {:a (dfn/* 2 (range 10))})
        ds-bm  (ds/->dataset {:a (dfn/- (dfn/* 2 (range 10)) 5)})
        ds-bmm (ds/->dataset {:a (dfn/- (dfn/* 2 (range 10)) 14)})
        asof   (fn [rhs op]
                 (vec ((ds-join/left-join-asof :a ds-a rhs {:asof-op op})
                       :right.a)))]
    (is (= [nil 0 0 2 2 4 4 6 6 8] (asof ds-b :>)))
    (is (= [0 0 2 2 4 4 6 6 8 8] (asof ds-b :>=)))
    (is (= [-1 -1 1 1 3 3 5 5 7 7] (asof ds-bm :>)))
    (is (= [-2 0 0 2 2 4 4 4 4 4] (asof ds-bmm :>)))))
|
||||
|
||||
|
||||
(deftest asof-nearest
  ;; :nearest picks the right-hand value closest to each left-hand key.
  (let [ds-a   (ds/->dataset {:a (range 10)})
        ds-b   (ds/->dataset {:a (dfn/* 3 (range 10))})
        ds-bm  (ds/->dataset {:a (dfn/- (dfn/* 3 (range 10)) 5)})
        ds-bmm (ds/->dataset {:a (dfn/- (dfn/* 3 (range 10)) 20)})
        asof   (fn [rhs]
                 (vec ((ds-join/left-join-asof :a ds-a rhs {:asof-op :nearest})
                       :right.a)))]
    (is (= [0 0 3 3 3 6 6 6 9 9] (asof ds-b)))
    (is (= [1 1 1 4 4 4 7 7 7 10] (asof ds-bm)))
    (is (= [1 1 1 4 4 4 7 7 7 7] (asof ds-bmm)))))
|
||||
|
||||
|
||||
(deftest pd-merge
  ;; pandas-style merge across :outer/:left/:right/:inner/:cross strategies.
  (let [ds-a      (ds/->dataset {:a [:a :b :b :a :c]
                                 :b (range 5)
                                 :c (range 5)})
        ds-b      (ds/->dataset {:a [:a :b :a :b :d]
                                 :b (range 5)
                                 :c (range 6 11)})
        merge-col (fn [opts col]
                    (vec ((ds-join/pd-merge ds-a ds-b opts) col)))]
    (is (= [0 1 2 3 4 nil nil nil]
           (merge-col {:on [:a :b] :how :outer} :c)))
    (is (= [6 7 nil nil nil]
           (merge-col {:on [:a :b] :how :left} :right.c)))
    (is (= [0 1 nil nil nil]
           (merge-col {:on [:a :b] :how :right} :left.c)))
    (is (= [6 7]
           (merge-col {:on [:a :b] :how :inner} :right.c)))
    (is (= [6 7 8 9 10 6 7 8 9 10 6 7 8 9 10 6 7 8 9 10 6 7 8 9 10]
           (merge-col {:how :cross} :right.c)))))
|
||||
|
||||
|
||||
(deftest double-join
  ;; Left join keeps all lhs rows; columns with no rhs match come back nil.
  ;; The join is computed once and reused: the original re-ran the identical
  ;; join for every assertion and ended with a dangling, unused
  ;; (ds-join/left-join :name a b) expression -- dead code, removed.
  (let [a      (ds/->dataset [{:name "a" :a 1.0 :b 2.0}
                              {:name "b" :a 1.0 :b 2.0}
                              {:name "c" :a 1.0 :b 2.0}])
        b      (ds/->dataset [{:name "a" :c 1.0}
                              {:name "b" :c 1.0}])
        joined (ds-join/left-join :name a b)]
    (is (= [1.0 1.0 nil] (vec (joined :c))))
    (is (= ["a" "b" nil] (vec (joined :right.name))))
    (is (= [2.0 2.0 2.0] (vec (joined :b))))
    (is (= [1.0 1.0 1.0] (vec (joined :a))))
    (is (= ["a" "b" "c"] (vec (joined :name))))))
|
||||
|
||||
(deftest eraderna-left-join
  (testing "Changing the type of int shouldn't break the join"
    (let [a  (ds/->dataset [{:y 2022}])
          a' (ds/column-cast a :y :int16)
          b  (ds/->dataset [{:y 2022 :s "2022"}
                            {:y 2023 :s "2023"}])]
      ;; int64 and int16 keys must produce the same joined :s column.
      (is (= ((ds-join/left-join :y a b) :s)
             ((ds-join/left-join :y a' b) :s))))))
|
||||
|
||||
|
||||
(deftest cross-join
  ;; A cross merge yields the full cartesian product in lhs-major order.
  (let [res (ds-join/pd-merge (ds/->dataset {:a [1 2 3] :b [4 5 6]})
                              (ds/->dataset {:c [:a :b :c] :d [:x :y :z]})
                              {:how :cross})]
    (is (= [1 1 1 2 2 2 3 3 3] (res :a)))
    (is (= [:a :b :c :a :b :c :a :b :c] (res :c)))))
|
||||
|
||||
|
||||
(deftest pd-merge-issue-302
  ;; Outer merge on disjoint keys: the rhs-only column is nil for lhs rows.
  (let [res (ds-join/pd-merge (ds/->dataset {:id ["a" "b"] :x [1 2]})
                              (ds/->dataset {:id ["c"] :y [3]})
                              {:on [:id] :how :outer})]
    (is (= [nil nil 3] (vec (:y res))))))
|
||||
|
||||
|
||||
(deftest left-join-dates
  ;; LocalDate keys join correctly; unmatched lhs rows carry no rhs entries.
  ;; Uses the ds-join alias for consistency with the rest of this namespace
  ;; (the original called tech.v3.dataset.join/left-join fully qualified).
  (let [lhs (ds/->dataset [{:a (LocalDate/of 2022 12 28) :b 3}
                           {:a (LocalDate/of 2022 12 30) :b 4}
                           {:a (LocalDate/of 2022 12 20) :b 4}])
        rhs (ds/->dataset [{:a (LocalDate/of 2022 12 20) :c 5}
                           {:a (LocalDate/of 2022 10 20) :c 6}
                           {:a (LocalDate/of 2022 11 20) :c 7}])]
    (is (= [{:a (LocalDate/of 2022 12 20)
             :b 4,
             :right.a (LocalDate/of 2022 12 20)
             :c 5}
            {:a (LocalDate/of 2022 12 28)
             :b 3}
            {:a (LocalDate/of 2022 12 30)
             :b 4}]
           (vec (ds/rows (ds-join/left-join :a lhs rhs)))))))
|
||||
|
||||
(deftest issue-361
  ;; Character-typed join keys: 8 of the 9 lhs rows find a partner; \1 does
  ;; not, leaving exactly one missing value.
  (let [lhs    (ds/->dataset {:a '(\1 \2 \3 \4 \5 \6 \7 \8 \9)})
        rhs    (ds/->dataset {:a '(\0 \9 \8 \7 \6 \5 \4 \3 \2)})
        joined (ds-join/left-join :a lhs rhs)]
    (is (= 9 (ds/row-count joined)))
    (is (= 1 (dtype/ecount (ds/missing joined))))))
|
||||
|
||||
|
||||
(deftest issue-377
  ;; nil join keys match nil on the rhs; unmatched keys yield nil.
  (let [lhs    (ds/->dataset {:a [nil 2] :b [3 4]})
        rhs    (ds/->dataset {:a [nil 4] :b [6 7]})
        joined (ds-join/left-join :a lhs rhs)]
    (is (= [6 nil] (vec (joined :right.b))))))
|
||||
|
||||
|
||||
(deftest short-types
  ;; A short-typed key must still match a long-typed key in pd-merge...
  (let [lds (ds/->dataset [{:i "foo" :y (short 2022)}])
        rds (ds/->dataset [{:i "foo" :y 2022 :s "2022"}
                           {:i "foo" :y 2023 :s "2023"}])
        jds (ds-join/pd-merge lds rds {:on [:i :y]})]
    (is (= {:i "foo" :y 2022 :s "2022"}
           (ds/row-at jds 0))))
  ;; ...and inside a tuple-valued key in left-join.
  (let [lds (ds/->dataset [{:z ["foo" (short 2022)]}])
        rds (ds/->dataset [{:z ["foo" (long 2022)] :s "2022"}
                           {:z ["foo" (long 2023)] :s "2023"}])]
    (is (= 1 (ds/row-count (ds-join/left-join :z lds rds))))))
|
||||
|
||||
|
||||
(deftest issue-381
  ;; With every row sharing the same key, a left join explodes into the full
  ;; cross product of matching rows.
  (let [make-row (constantly {:row 1})
        left     (ds/->dataset (repeatedly 10000 make-row))
        right    (ds/->dataset (repeatedly 1000 make-row))
        joined   (ds-join/left-join :row left right)]
    (is (= (* 10000 1000) (ds/row-count joined)))))
|
||||
|
||||
|
||||
(deftest pd-merge-error
  ;; An inner merge on :product keeps the non-key :customer column too.
  (let [ds1    (ds/->dataset {:customer ["A" "A" "A"]
                              :product ["A" "B" "C"]})
        ds2    (ds/->dataset {:product ["B" "C"]})
        merged (ds-join/pd-merge ds1 ds2 {:on :product :how :inner})]
    (is (= #{:product :customer}
           (set (ds/column-names merged))))))
|
||||
|
||||
|
||||
(deftest pd-merge-issue-435
  ;; Outer merges involving an empty dataset must not throw (truthy result).
  (let [full (ds/->dataset {:t [0 1] :x [:a :b]})]
    (is (ds-join/pd-merge (ds/empty-dataset) full {:on :t :how :outer}))
    (is (ds-join/pd-merge full (ds/empty-dataset) {:on :t :how :outer}))))
|
||||
@@ -0,0 +1,222 @@
|
||||
(ns tech.v3.dataset.mapseq-test
|
||||
(:require [tech.v3.dataset :as ds]
|
||||
[tech.v3.dataset.column :as ds-col]
|
||||
[tech.v3.dataset.column-filters :as cf]
|
||||
[tech.v3.dataset.math :as ds-math]
|
||||
[tech.v3.dataset.modelling :as ds-mod]
|
||||
[tech.v3.dataset.categorical :as ds-cat]
|
||||
[tech.v3.dataset.test-utils :as test-utils]
|
||||
[tech.v3.datatype :as dtype]
|
||||
[tech.v3.datatype.functional :as dtype-fn]
|
||||
[tech.v3.tensor :as dtt]
|
||||
[clojure.set :as set]
|
||||
[clojure.test :refer [deftest is testing]]))
|
||||
|
||||
|
||||
(deftest mapseq-classification-test
  (let [src-ds (test-utils/mapseq-fruit-dataset)
        ;; Drop the redundant label columns, numerically encode the
        ;; categoricals, min-max scale the rest, and mark the target.
        dataset (ds/bind-> src-ds ds
                           (ds/remove-columns [:fruit-subtype :fruit-label])
                           (ds/categorical->number cf/categorical)
                           (ds/update (cf/difference ds (cf/categorical ds))
                                      #(ds-math/transform-minmax % (ds-math/fit-minmax %)))
                           (ds-mod/set-inference-target :fruit-name))
        mapseq-ds (ds/mapseq-reader (test-utils/mapseq-fruit-dataset))
        src-keys (set (keys (first mapseq-ds)))
        result-keys (set (map ds-col/column-name (ds/columns dataset)))
        non-categorical (ds/column-names
                         (cf/difference dataset (cf/categorical dataset)))]
    ;; every column shares the same row count
    (is (= #{59}
           (set (map dtype/ecount (ds/columns dataset)))))
    ;; Column names can be keywords.
    (is (= src-keys
           (set (map ds-col/column-name (ds/columns src-ds)))))
    (is (= (set/difference src-keys #{:fruit-subtype :fruit-label})
           result-keys))
    ;; Labels map from encoded values back to the original keywords.
    (is (= (mapv :fruit-name mapseq-ds)
           (vec (first (vals (ds-mod/labels dataset))))))
    (is (= {:fruit-name :classification}
           (ds-mod/model-type dataset)))
    (is (= {:fruit-name :classification,
            :mass :regression,
            :width :regression,
            :height :regression,
            :color-score :regression}
           (ds-mod/model-type dataset (ds/column-names dataset))))
    ;; The post-transformation :fruit-name values reverse-map to the
    ;; pre-transformation values.
    (is (= (mapv :fruit-name mapseq-ds)
           (->> (ds-cat/reverse-map-categorical-xforms dataset)
                (ds/mapseq-reader)
                (mapv :fruit-name))))
    ;; Grouping the mapseq view agrees with ds/group-by-column.
    (is (= (as-> (ds/select dataset :all (range 10)) dataset
             (ds/mapseq-reader dataset)
             (group-by :fruit-name dataset))
           (as-> (ds/select dataset :all (range 10)) ds
             (ds/group-by-column ds :fruit-name)
             (map (fn [[k group-ds]]
                    [k (vec (ds/mapseq-reader group-ds))])
                  ds)
             (into {} ds))))
    ;; Forward-map an input value to its encoded value; after the
    ;; transform pipeline the column values are numeric.
    (let [apple-value (get (ds-mod/inference-target-label-map dataset) :apple)]
      (is (= #{:apple}
             (as-> dataset ds
               (ds/filter ds #(= apple-value (:fruit-name %)))
               ;; Reverse the categorical mapping so the numeric fruit
               ;; name round-trips back to the input label.
               (ds-cat/reverse-map-categorical-xforms ds)
               (ds/mapseq-reader ds)
               (map :fruit-name ds)
               (set ds)))))
    ;; Ensure the min-max range map works: each scaled column spans [-0.5 0.5].
    (is (= (vec (repeat (count non-categorical) [-0.5 0.5]))
           (mapv (fn [colname]
                   (let [{col-min :min
                          col-max :max} (-> (ds/column dataset colname)
                                            (ds-col/stats [:min :max]))]
                     [col-min col-max]))
                 non-categorical)))
    ;; Concatenation preserves the categorical transforms.
    (is (= (mapv :fruit-name (concat mapseq-ds mapseq-ds))
           (->> (-> (ds/concat dataset dataset)
                    (ds-cat/reverse-map-categorical-xforms)
                    (ds/mapseq-reader))
                (mapv :fruit-name))))
    (let [new-ds (ds/bind-> (ds/->dataset (map hash-map (repeat :mass) (range 20))) dataset
                            ;; The mean should happen in floating-point space.
                            (assoc :mass-avg
                                   (dtype-fn/fixed-rolling-window
                                    (dtype/elemwise-cast (dataset :mass) :float64)
                                    5 dtype-fn/mean)))]
      (is (= [{:mass 0, :mass-avg 0.6}
              {:mass 1, :mass-avg 1.2}
              {:mass 2, :mass-avg 2.0}
              {:mass 3, :mass-avg 3.0}
              {:mass 4, :mass-avg 4.0}
              {:mass 5, :mass-avg 5.0}
              {:mass 6, :mass-avg 6.0}
              {:mass 7, :mass-avg 7.0}
              {:mass 8, :mass-avg 8.0}
              {:mass 9, :mass-avg 9.0}]
             (-> (ds/select new-ds [:mass :mass-avg] (range 10))
                 ds/mapseq-reader)))
      (let [sorted-ds (ds/sort-by-column new-ds :mass-avg >)]
        (is (= [{:mass 19, :mass-avg 18.4}
                {:mass 18, :mass-avg 17.8}
                {:mass 17, :mass-avg 17.0}
                {:mass 16, :mass-avg 16.0}
                {:mass 15, :mass-avg 15.0}
                {:mass 14, :mass-avg 14.0}
                {:mass 13, :mass-avg 13.0}
                {:mass 12, :mass-avg 12.0}
                {:mass 11, :mass-avg 11.0}
                {:mass 10, :mass-avg 10.0}]
               (-> (ds/select sorted-ds [:mass :mass-avg] (range 10))
                   ds/mapseq-reader)))))
    (let [nth-db (ds/take-nth src-ds 5)]
      (is (= [7 12] (dtype/shape nth-db)))
      (is (= [{:mass 192.0, :width 8}
              {:mass 80.0, :width 5}
              {:mass 166.0, :width 6}
              {:mass 156.0, :width 7}
              {:mass 160.0, :width 7}
              {:mass 356.0, :width 9}
              {:mass 158.0, :width 7}
              {:mass 150.0, :width 7}
              {:mass 154.0, :width 7}
              {:mass 186.0, :width 7}]
             (->> (-> (ds/select nth-db [:mass :width] (range 10))
                      ds/mapseq-reader)
                  (map #(update % :width int))))))))
|
||||
|
||||
(deftest one-hot
  (testing "Testing one-hot into multiple column groups"
    (let [src-ds (test-utils/mapseq-fruit-dataset)
          dataset (-> src-ds
                      (ds/remove-columns [:fruit-subtype :fruit-label])
                      (ds-mod/set-inference-target :fruit-name)
                      (ds/categorical->one-hot [:fruit-name]))]
      ;; The one-hot transform records its inverse mapping.
      (is (= {:one-hot-table
              {:orange :fruit-name-orange,
               :mandarin :fruit-name-mandarin,
               :apple :fruit-name-apple,
               :lemon :fruit-name-lemon},
              :src-column :fruit-name,
              :result-datatype :int64}
             (into {} (first (ds-cat/dataset->one-hot-maps dataset)))))
      ;; :fruit-name is replaced by one indicator column per category.
      (is (= #{:mass :fruit-name-orange :fruit-name-mandarin :width
               :fruit-name-apple :color-score :fruit-name-lemon :height}
             (set (map ds-col/column-name (ds/columns dataset)))))
      ;; Labels still round-trip through the one-hot encoding.
      (is (= (->> (ds/mapseq-reader src-ds)
                  (take 20)
                  (mapv :fruit-name))
             (->> (first (vals (ds-mod/labels dataset)))
                  (take 20)
                  vec)))
      (is (= {:color-score :regression,
              :fruit-name-orange :classification,
              :fruit-name-lemon :classification,
              :fruit-name-mandarin :classification,
              :fruit-name-apple :classification,
              :height :regression,
              :width :regression,
              :mass :regression}
             (ds-mod/model-type dataset (ds/column-names dataset)))))))
|
||||
|
||||
|
||||
;; Map-valued entries produce a :persistent-map column datatype.
(deftest generalized-mapseq-ds
  (let [ds (ds/->dataset [{:a 1 :b {:a 1 :b 2}}
                          {:a 2}])]
    (is (= #{:int64 :persistent-map}
           (set (map dtype/get-datatype (vals ds)))))))
|
||||
|
||||
|
||||
;; Tensor-valued map entries produce a :tensor column datatype.
(deftest tensors-in-mapseq
  (let [make-tens #(dtt/->tensor (partition 3 (range 9)))
        ds (ds/->dataset [{:a (make-tens) :b "hello"}
                          {:a (make-tens) :b "goodbye"}])]
    (is (= #{:tensor :string}
           (set (map dtype/get-datatype (vals ds)))))))
|
||||
|
||||
|
||||
;; A nil entry in a :local-date-parsed column is recorded as missing.
(deftest datetime-missing
  (let [ds (ds/->dataset [{:d "1971-01-01"}
                          {:d "1970-01-01"}
                          {:d nil}
                          {:d "0001-01-01"}]
                         {:parser-fn {:d :local-date}})]
    (is (= 1 (dtype/ecount (ds-col/missing (ds :d)))))))
|
||||
@@ -0,0 +1,68 @@
|
||||
(ns tech.v3.dataset.math-test
|
||||
(:require [tech.v3.dataset :as ds]
|
||||
[tech.v3.dataset.math :as ds-math]
|
||||
[tech.v3.dataset.tensor :as ds-tens]
|
||||
[tech.v3.datatype :as dtype]
|
||||
[tech.v3.datatype.datetime :as dtype-dt]
|
||||
[tech.v3.datatype.functional :as dfn]
|
||||
[tech.v3.tensor :as dtt]
|
||||
[clojure.test :refer [deftest is]]))
|
||||
|
||||
|
||||
;; Loess interpolation attaches the fitted interpolator to the result
;; column's metadata.
(deftest basic-interp
  (let [interp-ds (-> (ds/->dataset "test/data/stocks.csv")
                      (ds/filter-column "symbol" "MSFT")
                      ;; interpolate-loess requires a sorted dataset
                      (ds/sort-by-column "date")
                      (ds-math/interpolate-loess "date" "price"
                                                 {:result-name "price-loess"}))]
    (is (some? (:interpolator (meta (interp-ds "price-loess")))))))
|
||||
|
||||
|
||||
(deftest fill-range-replace
  ;; Default fill: :a is interpolated; :b takes the previous value.
  (let [ds (-> (ds/->dataset {:a [1 5 10 15 20]
                              :b [2 2 nil 4 8]})
               (ds-math/fill-range-replace :a 2))]
    (is (dfn/equals
         [1.0 3.0 5.0 6.66 8.33 10.0
          11.66 13.33 15.0 16.66 18.33 20.0]
         (vec (ds :a))
         0.1))
    (is (= [2 2 2 2 2 2 2 2 4 4 4 8]
           (vec (ds :b)))))
  ;; An explicit nil fill leaves inserted rows missing.
  (let [ds (-> (ds/->dataset {:a [1 5 10 15 20]
                              :b [2 2 nil 4 8]})
               (ds-math/fill-range-replace :a 2 nil))]
    (is (= [2 nil 2 nil nil nil nil nil 4 nil nil 8]
           (vec (ds :b)))))
  ;; A constant fill value replaces inserted and missing entries.
  (let [ds (-> (ds/->dataset {:a [1 5 10 15 20]
                              :b [2 2 nil 4 8]})
               (ds-math/fill-range-replace :a 2 :value 20))]
    (is (= [2 20 2 20 20 20 20 20 4 20 20 8]
           (vec (ds :b)))))
  ;; Datetime range columns take a millisecond span as the max delta.
  (let [ds (-> (ds/->dataset {:a (dtype-dt/plus-temporal-amount
                                  (dtype-dt/local-date)
                                  [1 5 10 15 20]
                                  :days)
                              :b [2 2 nil 4 8]})
               (ds-math/fill-range-replace :a (* 2 dtype-dt/milliseconds-in-day)
                                           :value 20))]
    (is (= [2 20 2 20 20 20 20 20 4 20 20 8]
           (vec (ds :b))))))
|
||||
|
||||
|
||||
(comment
|
||||
(def test-ds (ds/->dataset {:a [7 4 6 8 8 7 5 9 7 8]
|
||||
:b [4 1 3 6 5 2 3 5 4 2]
|
||||
:c [3 8 5 1 7 9 3 8 5 2]}))
|
||||
|
||||
(def test-data (dtt/->tensor [[10 8 6 20 9]
|
||||
[11 21 23 18 4]
|
||||
[12 7 5 13 19]
|
||||
[ 3 14 15 22 17]
|
||||
[24 1 2 0 16]] :datatype :float64))
|
||||
(def test-data (dtt/transpose (dtt/->tensor [[7 4 6 8 8 7 5 9 7 8]
|
||||
[4 1 3 6 5 2 3 5 4 2]
|
||||
[3 8 5 1 7 9 3 8 5 2]] :datatype :float64)
|
||||
[1 0]))
|
||||
)
|
||||
@@ -0,0 +1,37 @@
|
||||
(ns tech.v3.dataset.metamorph-test
|
||||
(:require [tech.v3.dataset.metamorph :as ds-mm]
|
||||
[tech.v3.dataset :as ds]
|
||||
[clojure.test :as t :refer [deftest is]]))
|
||||
|
||||
;; Shared fixture: Ames housing training data, keywordized column names.
(def df
  (ds/->dataset "test/data/ames-train.csv.gz" {:key-fn keyword}))
|
||||
|
||||
|
||||
;; A metamorph op is a function of a context; calling it directly on a
;; dataset lifts the dataset into a context map.
(deftest call-with-df-1
  (is (= [1 2 3 4 5]
         (->> ((ds-mm/set-inference-target :SalePrice) df)
              :metamorph/data
              :Id
              (take 5)))))
|
||||
|
||||
;; Two metamorph ops chained through an explicit context map.
(deftest call-with-df-2
  (is (= [1 2 3 4 5]
         (->> ((ds-mm/rename-columns {:SalePrice :sale-price :Id :id})
               {:metamorph/data df})
              ((ds-mm/set-inference-target :sale-price))
              :metamorph/data
              :id
              (take 5)))))
|
||||
|
||||
;; ds-mm/brief produces per-column summaries; sorting the columns first
;; makes the first summary deterministic.
(deftest brief
  (let [df (ds/select-columns df (sort (ds/column-names df)))]
    (is (= 334.0
           (-> ((ds-mm/brief) {:metamorph/data df})
               :metamorph/data
               first
               :min)))))
|
||||
@@ -0,0 +1,45 @@
|
||||
(ns tech.v3.dataset.modelling-test
|
||||
(:require [tech.v3.dataset.modelling :as modelling]
|
||||
[tech.v3.dataset :as ds]
|
||||
[tech.v3.dataset.categorical :as ds-cat]
|
||||
[tech.v3.dataset.test-utils :as test-utils]
|
||||
[tech.v3.datatype :as dtype]
|
||||
[clojure.test :refer [deftest is]]))
|
||||
|
||||
;; 5-fold split of the 59-row fruit dataset: four 47/12 folds and one 48/11.
(deftest k-fold-sanity
  (let [dataset-seq (modelling/k-fold-datasets (test-utils/mapseq-fruit-dataset) 5 {})]
    (is (= 5 (count dataset-seq)))
    (is (= [[7 47] [7 47] [7 47] [7 47] [7 48]]
           (mapv (comp dtype/shape :train-ds) dataset-seq)))
    (is (= [[7 12] [7 12] [7 12] [7 12] [7 11]]
           (mapv (comp dtype/shape :test-ds) dataset-seq)))))
|
||||
|
||||
|
||||
;; Default train/test split of the 59-row fruit dataset: 41 train, 18 test.
(deftest train-test-split-sanity
  (let [dataset (modelling/train-test-split
                 (test-utils/mapseq-fruit-dataset) {})]
    (is (= [7 41] (dtype/shape (:train-ds dataset))))
    (is (= [7 18] (dtype/shape (:test-ds dataset))))))
|
||||
|
||||
|
||||
;; Each row's label is the probability column with the maximum value.
(deftest prob-dist->label-col
  ;; Fixed: the original wrapped an already-constructed dataset in a second
  ;; redundant ds/->dataset call.
  (let [ds (ds/->dataset {:y-0 [0.0 0.5 0.3 0.1]
                          :y-1 [0.3 0.8 0.2 0.3]})
        prob-dist-ds (modelling/probability-distributions->label-column ds :y)
        label-ds (ds-cat/reverse-map-categorical-xforms prob-dist-ds)]
    (is (= [:y-1 :y-1 :y-0 :y-1]
           (label-ds :y)))))
|
||||
|
||||
|
||||
;; Regression test for issue 267: probability-distributions->label-column
;; must throw when a probability is NaN or missing instead of silently
;; choosing a label.  Uses the ns aliases (ds/modelling) for consistency
;; with the rest of this namespace instead of fully-qualified symbols.
(deftest issue-267-prob-dist-fail-on-nan-missing
  (is (thrown? Throwable
               (-> (ds/->dataset {:y-0 [Double/NaN] :y-1 [0.3]})
                   (modelling/probability-distributions->label-column :y))))
  (is (thrown? Throwable
               (-> (ds/->dataset {:y-0 [nil] :y-1 [0.3]})
                   (modelling/probability-distributions->label-column :y)))))
|
||||
@@ -0,0 +1,29 @@
|
||||
(ns tech.v3.dataset.object-columns-test
|
||||
(:require [clojure.test :refer [deftest is]]
|
||||
[tech.v3.dataset :as ds]
|
||||
[tech.v3.datatype :as dtype]
|
||||
[tech.v3.datatype.datetime :as dtype-dt]
|
||||
[tech.v3.tensor :as dtt]))
|
||||
|
||||
|
||||
;; Persistent-map values keep the :persistent-map datatype and read back
;; unchanged through a reader.
(deftest basic-object-columns
  (let [src-ds (ds/->dataset {:a (range 10)
                              :b (repeat 10 {:a 1 :b 2})})]
    (is (= :persistent-map
           (dtype/get-datatype (src-ds :b))))
    (is (= (vec (repeat 10 {:a 1 :b 2}))
           (vec (dtype/->reader (src-ds :b)))))))
|
||||
|
||||
|
||||
;; Each input sequence infers its natural column datatype, including
;; packed durations and tensors.
(deftest involved-object-columns
  (let [src-ds (ds/->dataset
                {:dates (list "2000-01-01" "2000-02-01" "2000-03-01"
                              "2000-04-01" "2000-05-01")
                 :integers (range 5)
                 :durations (repeat 5 (dtype-dt/duration))
                 :doubles (map double (range 5))
                 :tensors (repeat 5 (dtt/->tensor (partition 2 (range 4))))})]
    (is (= #{:float64 :string :int64 :tensor :packed-duration}
           (set (map dtype/get-datatype (vals src-ds)))))))
|
||||
+528
@@ -0,0 +1,528 @@
|
||||
(ns tech.v3.dataset.parse-test
|
||||
(:require [clojure.test :refer [deftest is]]
|
||||
[tech.v3.datatype :as dtype]
|
||||
[tech.v3.datatype.functional :as dfn]
|
||||
[tech.v3.datatype.bitmap :as bitmap]
|
||||
[tech.v3.dataset :as ds]
|
||||
[tech.v3.dataset.zip :as zip]
|
||||
[tech.v3.dataset.column :as ds-col]
|
||||
[tech.v3.dataset.protocols :as ds-proto]
|
||||
[tech.v3.dataset.io.nippy]
|
||||
[tech.v3.libs.arrow :as arrow]
|
||||
[tech.v3.libs.clj-transit :as ds-transit]
|
||||
[taoensso.nippy :as nippy]
|
||||
[clojure.set :as set]
|
||||
[clojure.java.io :as io])
|
||||
(:import [com.univocity.parsers.csv CsvFormat CsvParserSettings CsvParser]
|
||||
[java.nio.charset StandardCharsets]))
|
||||
|
||||
|
||||
;; Ames housing training data used throughout the parser tests below.
(def test-file "test/data/ames-house-prices/train.csv")
|
||||
|
||||
|
||||
;; Expected [column-name missing-count] pairs for the Ames data,
;; sorted by column name.
(def missing-data
  (->> [{:column-name "LotFrontage", :missing-count 259}
        {:column-name "Alley", :missing-count 1369}
        {:column-name "MasVnrType", :missing-count 8}
        {:column-name "MasVnrArea", :missing-count 8}
        {:column-name "BsmtQual", :missing-count 37}
        {:column-name "BsmtCond", :missing-count 37}
        {:column-name "BsmtExposure", :missing-count 38}
        {:column-name "BsmtFinType1", :missing-count 37}
        {:column-name "BsmtFinType2", :missing-count 38}
        {:column-name "Electrical", :missing-count 1}
        {:column-name "FireplaceQu", :missing-count 690}
        {:column-name "GarageType", :missing-count 81}
        {:column-name "GarageYrBlt", :missing-count 81}
        {:column-name "GarageFinish", :missing-count 81}
        {:column-name "GarageQual", :missing-count 81}
        {:column-name "GarageCond", :missing-count 81}
        {:column-name "PoolQC", :missing-count 1453}
        {:column-name "Fence", :missing-count 1179}
        {:column-name "MiscFeature", :missing-count 1406}]
       (map (juxt :column-name :missing-count))
       (sort-by first)))
|
||||
|
||||
;; Expected [column-name datatype] pairs for every Ames column,
;; alphabetical by name.
(def datatype-answers
  [["1stFlrSF" :int16]
   ["2ndFlrSF" :int16]
   ["3SsnPorch" :int16]
   ["Alley" :string]
   ["BedroomAbvGr" :int16]
   ["BldgType" :string]
   ["BsmtCond" :string]
   ["BsmtExposure" :string]
   ["BsmtFinSF1" :int16]
   ["BsmtFinSF2" :int16]
   ["BsmtFinType1" :string]
   ["BsmtFinType2" :string]
   ["BsmtFullBath" :int16]
   ["BsmtHalfBath" :int16]
   ["BsmtQual" :string]
   ["BsmtUnfSF" :int16]
   ["CentralAir" :string]
   ["Condition1" :string]
   ["Condition2" :string]
   ["Electrical" :string]
   ["EnclosedPorch" :int16]
   ["ExterCond" :string]
   ["ExterQual" :string]
   ["Exterior1st" :string]
   ["Exterior2nd" :string]
   ["Fence" :string]
   ["FireplaceQu" :string]
   ["Fireplaces" :int16]
   ["Foundation" :string]
   ["FullBath" :int16]
   ["Functional" :string]
   ["GarageArea" :int16]
   ["GarageCars" :int16]
   ["GarageCond" :string]
   ["GarageFinish" :string]
   ["GarageQual" :string]
   ["GarageType" :string]
   ["GarageYrBlt" :int16]
   ["GrLivArea" :int16]
   ["HalfBath" :int16]
   ["Heating" :string]
   ["HeatingQC" :string]
   ["HouseStyle" :string]
   ["Id" :int16]
   ["KitchenAbvGr" :int16]
   ["KitchenQual" :string]
   ["LandContour" :string]
   ["LandSlope" :string]
   ["LotArea" :int32]
   ["LotConfig" :string]
   ["LotFrontage" :int16]
   ["LotShape" :string]
   ["LowQualFinSF" :int16]
   ["MSSubClass" :int16]
   ["MSZoning" :string]
   ["MasVnrArea" :int16]
   ["MasVnrType" :string]
   ["MiscFeature" :string]
   ["MiscVal" :int16]
   ["MoSold" :int16]
   ["Neighborhood" :string]
   ["OpenPorchSF" :int16]
   ["OverallCond" :int16]
   ["OverallQual" :int16]
   ["PavedDrive" :string]
   ["PoolArea" :int16]
   ["PoolQC" :string]
   ["RoofMatl" :string]
   ["RoofStyle" :string]
   ["SaleCondition" :string]
   ["SalePrice" :int32]
   ["SaleType" :string]
   ["ScreenPorch" :int16]
   ["Street" :string]
   ["TotRmsAbvGrd" :int16]
   ["TotalBsmtSF" :int16]
   ["Utilities" :string]
   ["WoodDeckSF" :int16]
   ["YearBuilt" :int16]
   ["YearRemodAdd" :int16]
   ["YrSold" :int16]])
|
||||
|
||||
|
||||
(deftest base-ames-parser-test
  (let [result (ds/->dataset test-file)
        dtypes (->> (vals result)
                    (map meta)
                    (sort-by :name)
                    (mapv (juxt :name :datatype)))]
    ;; Same set of column names as the expected answers.
    (is (= (set (map first datatype-answers))
           (set (map first dtypes))))
    ;; Collect every column whose inferred datatype disagrees and
    ;; report the whole list in the failure message.
    (let [dtype-map (into {} dtypes)
          differences (->> datatype-answers
                           (keep (fn [[colname col-dtype]]
                                   (let [detected-dtype (dtype-map colname)]
                                     (when-not (= detected-dtype col-dtype)
                                       {:name colname
                                        :expected-datatype col-dtype
                                        :result-datatype detected-dtype}))))
                           seq)]
      (is (nil? differences)
          (str differences)))
    ;; Columns with missing values line up with the expected set.
    (let [result-missing-data (->> (vals result)
                                   (map (juxt ds-col/column-name
                                              (comp dtype/ecount ds-col/missing)))
                                   (remove #(zero? (second %)))
                                   (sort-by first))]
      (is (= (set (map first missing-data))
             (set (map first result-missing-data))))))
  ;; Option passthrough: record limit plus a column whitelist.
  (let [result (ds/->dataset
                test-file
                {:n-records 100
                 :column-whitelist ["Id" "SalePrice" "YearBuilt"]})]
    (is (= 3 (count result)))
    ;; Header row accounts for one.
    (is (= 100 (ds/row-count result)))))
|
||||
|
||||
|
||||
;; Here we just test that the options correctly pass through ->dataset.
(deftest base-ames-load-test
  (let [result (ds/->dataset test-file
                             {:n-records 100
                              :column-whitelist ["Id" "SalePrice" "YearBuilt"]})]
    (is (= 3 (ds/column-count result)))
    ;; Header row accounts for one.
    (is (= 100 (ds/row-count result)))))
|
||||
|
||||
|
||||
(deftest specify-column-types
  ;; A bare keyword parser-fn parses every column as that datatype.
  (let [result (ds/->dataset
                test-file
                {:n-records 100
                 :column-whitelist ["1stFlrSF" "2ndFlrSF" "3SsnPorch"]
                 :parser-fn :float32})]
    (is (= #{:float32}
           (set (map dtype/get-datatype (vals result)))))
    (is (= 3 (ds/column-count result))))
  ;; A map of colname->datatype overrides per column; unlisted columns
  ;; keep their inferred datatype.
  (let [result (ds/->dataset
                test-file
                {:n-records 100
                 :column-whitelist ["1stFlrSF" "2ndFlrSF" "3SsnPorch"]
                 :parser-fn {"1stFlrSF" :float32
                             "2ndFlrSF" :int32}})]
    (is (= #{:float32 :int32 :int16}
           (set (map dtype/get-datatype (vals result)))))))
|
||||
|
||||
|
||||
;; A custom separator character is honored by the csv parser.
(deftest semi-colon-delimited-file
  (let [result (ds/->dataset "test/data/sample01.csv" {:separator \;})]
    (is (= 3 (ds/column-count result)))))
|
||||
|
||||
|
||||
;; Skipping the first row and all bad rows still yields every column.
(deftest tough-file
  (let [result (ds/->dataset "test/data/essential.csv"
                             {:n-initial-skip-rows 1
                              :skip-bad-rows? true})]
    (is (= 5 (ds/column-count result)))))
|
||||
|
||||
|
||||
(defn- make-essential-csv-parser
  "Build a univocity CsvParser for essential.csv: newline line separator,
  header extraction, and leading/trailing whitespace trimming."
  []
  (let [settings (doto (CsvParserSettings.)
                   (.. getFormat (setLineSeparator "\n"))
                   (.setHeaderExtractionEnabled true)
                   (.setIgnoreLeadingWhitespaces true)
                   (.setIgnoreTrailingWhitespaces true))]
    (CsvParser. settings)))
|
||||
|
||||
|
||||
;; A user-supplied :csv-parser instance is used for parsing.
(deftest custom-csv-parser
  (let [result (ds/->dataset "test/data/essential.csv"
                             {:csv-parser (make-essential-csv-parser)
                              :skip-bad-rows? true})]
    (is (= 5 (ds/column-count result)))))
|
||||
|
||||
|
||||
(deftest simple-write-test
  ;; Fixed: the scratch file "test.tsv" was never deleted; the other
  ;; write tests in this namespace clean up in a finally block.
  (try
    ;; Round-trip a small slice of the Ames data through tsv.
    (let [initial-ds (ds/->dataset
                      test-file
                      {:num-rows 20
                       :column-whitelist ["1stFlrSF" "2ndFlrSF" "3SsnPorch"]})
          _ (ds/write! initial-ds "test.tsv")
          new-ds (ds/->dataset "test.tsv")]
      (is (dfn/equals (initial-ds "1stFlrSF")
                      (new-ds "1stFlrSF")))
      (is (dfn/equals (initial-ds "2ndFlrSF")
                      (new-ds "2ndFlrSF"))))
    ;; Missing values must survive the round trip.
    (let [missing-ds (-> (ds/->dataset
                          test-file
                          {:n-records 20
                           :column-whitelist [43 44 69]})
                         (ds/update-column
                          "1stFlrSF"
                          #(ds-col/set-missing % [2 4 7 9])))
          _ (ds/write! missing-ds "test.tsv")
          new-ds (ds/->dataset "test.tsv")]
      (is (dfn/equals (missing-ds "1stFlrSF")
                      (new-ds "1stFlrSF")))
      (is (= #{2 4 7 9}
             (set (ds-col/missing (new-ds "1stFlrSF"))))))
    (finally
      (.delete (java.io.File. "test.tsv")))))
|
||||
|
||||
|
||||
(deftest date-time-format-test-1
  ;; stocks.csv dates infer as packed local dates.
  (let [stock-ds (ds/->dataset "test/data/stocks.csv")]
    (is (= :packed-local-date (dtype/get-datatype (stock-ds "date")))))
  ;; seattle-temps.csv timestamps infer as zoned date times.
  (let [temp-ds (ds/->dataset "test/data/seattle-temps.csv")]
    (is (= :zoned-date-time (dtype/get-datatype (temp-ds "date")))))
  ;; parser-fn forces the unpacked :local-date representation.
  (let [stock-ds (ds/->dataset "test/data/stocks.csv"
                               {:parser-fn {"date" :local-date}})]
    (is (= :local-date (dtype/get-datatype (stock-ds "date"))))))
|
||||
|
||||
|
||||
;; A java.io.Reader works as input when :file-type is given explicitly.
(deftest custom-reader
  (is (= 560 (ds/row-count (ds/->dataset (io/reader "test/data/stocks.csv")
                                         {:file-type :csv})))))
|
||||
|
||||
|
||||
(defn verify-relaxed-parse
  "Assert that ds's \"date\" column parsed as packed local dates while
  retaining the unparseable source strings in its :unparsed-data metadata.
  Fixed: dropped the unused `unparsed-indexes` binding and the ^List /
  ^RoaringBitmap type hints, whose classes are not imported in this ns."
  [ds]
  (let [date-col (ds "date")
        unparsed-data (:unparsed-data (meta date-col))]
    (is (= :packed-local-date (dtype/get-datatype date-col)))
    ;; Make sure unparsed data came through intact.
    (is (= #{"hello" "1212"}
           (set unparsed-data)))))
|
||||
|
||||
|
||||
(deftest bad-csv-relaxed-1
  (let [ds (ds/->dataset "test/data/stocks-bad-date.csv")]
    ;; With unparseable dates present the column falls back to :string.
    (is (= :string (dtype/get-datatype (ds "date"))))
    ;; Make sure unparsed data came through intact.
    (is (= #{"hello" "1212"}
           (set/intersection #{"hello" "1212"}
                             (set (ds-col/unique (ds "date"))))))
    ;; Re-parsing with :relaxed? recovers a packed-local-date column.
    (let [updated-ds (ds/update-column
                      ds "date" (partial ds-col/parse-column
                                         [:packed-local-date :relaxed?]))]
      (verify-relaxed-parse updated-ds))))
|
||||
|
||||
|
||||
;; Relaxed parsing can also be requested directly at load time.
(deftest bad-csv-relaxed-2
  (let [ds (ds/->dataset "test/data/stocks-bad-date.csv"
                         {:parser-fn {"date" [:packed-local-date :relaxed?]}})]
    (verify-relaxed-parse ds)))
|
||||
|
||||
|
||||
;; :key-fn applies to every parsed column name.
(deftest csv-keyword-colnames
  (let [stocks (ds/->dataset "test/data/stocks.csv" {:key-fn keyword})]
    (is (every? keyword? (ds/column-names stocks)))))
|
||||
|
||||
|
||||
;; A blank header cell is auto-named "column-N".
(deftest parse-empty-column-name
  (let [data (ds/->dataset "test/data/rcsv.csv")]
    (is (= #{"column-0" "Urban Female" "Urban Male" "Rural Female" "Rural Male"}
           (set (ds/column-names data))))))
|
||||
|
||||
|
||||
;; Dotted-quad IP addresses must parse as strings, not numbers/dates.
(deftest parse-ip-addrs-as-string
  (let [data (ds/->dataset "test/data/ip-addrs.csv")]
    (is (= :string (dtype/get-datatype (data "ip"))))))
|
||||
|
||||
|
||||
;; Binary-format fixture paths for the arrow/parquet tests below.
(def arrow-file "test/data/iris.feather")
(def parquet-file "test/data/parquet/userdata1.parquet")
|
||||
|
||||
|
||||
;;We will get back to this one. Potentially there are good ways into this
|
||||
;;via arrow.
|
||||
#_(deftest parse-parquet
|
||||
(let [ds (ds/->dataset parquet-file)]
|
||||
(is (= 13 (ds/column-count ds)))
|
||||
(is (= 1000 (ds/row-count ds)))
|
||||
(is (= #{:local-date-time :float64 :int32 :string}
|
||||
(->> (map dtype/get-datatype (vals ds))
|
||||
set)))))
|
||||
|
||||
|
||||
(deftest parse-ragged
  (let [ds (ds/->dataset "test/data/ragged.csv"
                         {:header-row? false
                          :key-fn keyword})]
    ;; Column count is taken from the widest row.
    (is (= [:column-0 :column-1 :column-2 :column-3 :column-4 :column-5
            :column-6 :column-7 :column-8 :column-9 :column-10 :column-11]
           (vec (ds/column-names ds))))
    (is (= 12 (ds/column-count ds)))
    (is (= [4 24 31 33 65 67 68 71 75 76 93 97]
           (vec ((ds/value-reader ds) 4))))
    ;; Shorter rows are padded with missing values.
    (is (= [10 33 51 66 67 84 nil nil nil nil nil nil]
           (vec ((ds/value-reader ds) 10))))))
|
||||
|
||||
|
||||
;; Very small double values must not collapse to 0.0 during parsing.
(deftest parse-small-doubles
  (let [ds (ds/->dataset "test/data/double_parse_test.csv")]
    (is (= 197 (count (remove #(= 0.0 %) (ds "pvalue")))))))
|
||||
|
||||
|
||||
(deftest string-separators
  ;; A single-character separator string is accepted...
  (let [ds (ds/->dataset "test/data/double_parse_test.csv" {:separator ","})]
    (is (= 197 (count (remove #(= 0.0 %) (ds "pvalue")))))
    ;; ...while a multi-character separator throws.
    (is (thrown? Throwable (ds/->dataset "test/data/double_parse_test.csv"
                                         {:separator ",n"})))))
|
||||
|
||||
|
||||
;; With :quote? true every field is quoted in the output, header included.
(deftest quoted-column-data
  (try
    (let [ds (ds/->dataset [{:a "onelongstring"}])]
      (ds/write! ds "quoted.csv" {:quote? true})
      (is (= "\"a\"\n\"onelongstring\"\n"
             (slurp "quoted.csv"))))
    (finally
      (.delete (java.io.File. "quoted.csv")))))
|
||||
|
||||
|
||||
(deftest text-data
  ;; Fixed: the arrow round-trip row counts were copy-paste bugs — the
  ;; original asserted (ds/row-count nippy-ds) three times and never
  ;; checked ds-copy or ds-inplace.
  (try
    (let [ds (ds/->dataset [{:a "onestring"}
                            {:a "anotherstring"}
                            {}]
                           {:parser-fn :text})
          _ (is (= :text (-> (ds :a) meta :datatype)))
          _ (ds/write! ds "text.csv")
          _ (ds/write! ds "text.nippy")
          ;; csv round trip
          csv-ds (ds/->dataset "text.csv" {:parser-fn {"a" :text}
                                           :key-fn keyword})
          _ (is (= :text (-> (csv-ds :a) meta :datatype)))
          ;;_ (is (= 3 (ds/row-count csv-ds)))
          ;; nippy round trip
          nippy-ds (ds/->dataset "text.nippy")
          _ (is (= :text (-> (nippy-ds :a) meta :datatype)))
          _ (is (= 3 (ds/row-count nippy-ds)))
          ;; arrow round trips, copying and in-place
          _ (arrow/write-dataset-to-stream! ds "text.arrow")
          ds-copy (arrow/read-stream-dataset-copying "text.arrow" {:key-fn keyword})
          _ (is (= :text (-> (ds-copy :a) meta :datatype)))
          _ (is (= 3 (ds/row-count ds-copy)))
          ds-inplace (arrow/read-stream-dataset-inplace "text.arrow")]
      (is (= :text (-> (ds-inplace "a") meta :datatype)))
      (is (= 3 (ds/row-count ds-inplace))))
    (finally
      (.delete (java.io.File. "text.csv"))
      (.delete (java.io.File. "text.nippy"))
      (.delete (java.io.File. "text.arrow")))))
|
||||
|
||||
|
||||
(deftest custom-parse-method
  (try
    (let [src-ds (ds/->dataset {:a ["1" "missing" "parse-failure" "2" "3"]})
          _ (ds/write! src-ds "custom-parse.csv")
          ;; A [datatype parse-fn] pair lets the parse fn signal missing
          ;; values and parse failures via namespaced keywords.
          ds (ds/->dataset
              "custom-parse.csv"
              {:parser-fn {"a" [:int64
                                (fn [str-val]
                                  (case str-val
                                    "missing" :tech.v3.dataset/missing
                                    "parse-failure" :tech.v3.dataset/parse-failure
                                    (Long/parseLong str-val)))]}})]
      (is (= [1 nil nil 2 3]
             (vec (ds "a"))))
      ;; Both the missing and parse-failure rows read as missing...
      (is (= #{1 2} (set (ds/missing ds))))
      ;; ...but only the parse failure is recorded as unparsed.
      (is (= #{2}
             (set (:unparsed-indexes (meta (ds "a"))))))
      (is (= ["parse-failure"]
             (vec (:unparsed-data (meta (ds "a")))))))
    (finally
      (.delete (java.io.File. "custom-parse.csv")))))
|
||||
|
||||
|
||||
;; A v5-era nippy file still loads and its dates match a fresh csv parse.
(deftest stocks-v5
  (let [v5 (ds/->dataset "test/data/stocks-v5.nippy")
        cur (ds/->dataset "test/data/stocks.csv")]
    (is (= (vec (v5 "date"))
           (vec (cur "date"))))))
|
||||
|
||||
|
||||
|
||||
;; Regression test for issue 247: an already-open gzipped input stream
;; parses to the same row count as loading the .gz path directly.
(deftest gzipped-input-stream-issue-247
  (let [ds (ds/->dataset (io/input-stream "test/data/ames-train.csv.gz")
                         {:file-type :csv
                          :gzipped? true})
        correct-ds (ds/->dataset "test/data/ames-train.csv.gz")]
    (is (= (ds/row-count correct-ds) (ds/row-count ds)))))
|
||||
|
||||
|
||||
;; Quoted fields containing commas parse as a single string value.
(deftest pokemon-csv
  (let [ds (ds/->dataset "test/data/pokemon.csv")]
    (is (= "['Overgrow', 'Chlorophyll']" (first (ds "abilities"))))))
|
||||
|
||||
;; Regression test for issue 292: the file parses to its expected column count.
(deftest issue-292
  (let [ds (ds/->dataset "test/data/issue-292.csv")]
    (is (== 3 (ds/column-count ds)))))
|
||||
|
||||
|
||||
;; A json round trip preserves stringified dates and numeric prices.
(deftest json-test
  (try
    (let [ds (-> (ds/->dataset "test/data/stocks.csv")
                 (ds/column-map "date" str ["date"]))
          _ (ds/write! ds "stocks.json")
          jds (ds/->dataset "stocks.json")]
      (is (= (vec (ds "date")) (vec (jds "date"))))
      (is (dfn/equals (ds "price") (jds "price"))))
    (finally
      (.delete (java.io.File. "stocks.json")))))
|
||||
|
||||
|
||||
;; A single column freeze/thaws through nippy and is still a column.
(deftest nippy-column
  (let [ds (ds/->dataset {:a [1 2 3] :b [4 5 6]})
        thawed (-> (ds :a) nippy/freeze nippy/thaw)]
    (is (dfn/equals (ds :a) thawed))
    (is (ds-proto/is-column? thawed))))
|
||||
|
||||
|
||||
(deftest empty-csv
  ;; Header-only file: columns but zero rows.
  (let [ds (ds/->dataset "test/data/empty-csv-header.csv")]
    (is (= 7 (ds/column-count ds))))
  ;; Entirely empty file: a valid, columnless dataset.
  (let [ds (ds/->dataset "test/data/empty-csv.csv")]
    (is (= 0 (ds/column-count ds)))
    (is (ds/dataset? ds))))
|
||||
|
||||
|
||||
(deftest comment-char
  (let [ds (ds/->dataset "test/data/csv-comment.csv")
        rows (ds/rows ds)]
    ;; commented lines are dropped from the row count
    (is (= 5 (ds/row-count ds)))
    ;; negative indexes address from the end; the last two rows are equal
    (is (= (rows -1) (rows -2)))))
|
||||
|
||||
;; Regression test for issue 304: :n-initial-skip-rows shifts which row
;; becomes the header.
(deftest issue-304
  (let [ds (ds/->dataset "test/data/issue-292.csv" {:n-initial-skip-rows 10})]
    (is (= 11 (first (ds "10"))))))
|
||||
|
||||
|
||||
;; Regression test for issue 362: each zip entry yields one dataset.
(deftest issue-362
  (let [ds-seq (zip/zipfile->dataset-seq "test/data/unknown.zip")]
    (is (= 2 (count ds-seq)))))
|
||||
|
||||
|
||||
;; Regression test for issue 388: transit round trip of integer and
;; keyword columns.
(deftest issue-388-transit-support
  (let [ds (ds/->dataset {:a [1 2 3]
                          :b [:one :two :three]})
        nds (-> ds
                ds-transit/dataset->transit-str
                ds-transit/transit-str->dataset)]
    (is (= (ds :a) (nds :a)))
    (is (= (ds :b) (nds :b)))))
|
||||
|
||||
|
||||
;; Regression test for issue 434: transit round trip including instants.
(deftest issue-434-transit-support
  (let [ds (ds/->dataset {:a [1 2 3]
                          :b [:one :two :three]
                          ;; transit encoding is milli instants
                          :c (dtype/make-container :packed-milli-instant
                                                   [(java.time.Instant/now)
                                                    (java.time.Instant/now)])})
        nds (-> ds
                ds-transit/dataset->transit-str
                ds-transit/transit-str->dataset)]
    (is (= (ds :a) (nds :a)))
    (is (= (ds :b) (nds :b)))
    (is (= (ds :c) (nds :c)))))
|
||||
|
||||
|
||||
;; Regression test for issue 414: parser-fn applies to json input too.
(deftest issue-414-json-parser-fn
  (let [parsed (ds/->dataset "test/data/local_date.json"
                             {:parser-fn {:time-period :local-date}})]
    (is (= [1 2 3] (get parsed "test")))))
|
||||
|
||||
;; Clearing a dataset-parser must also reset packed (datetime) columns.
(deftest dataset-parser-clear-packed-column
  (let [p (ds/dataset-parser)]
    (ds-proto/add-row p {:date (java.time.Instant/now)})
    (ds-proto/ds-clear p)
    (ds-proto/add-row p {:date (java.time.Instant/now)})
    ;; only the post-clear row remains
    (is (= 1 (count (@p :date))))))
|
||||
@@ -0,0 +1,13 @@
|
||||
(ns tech.v3.dataset.parser-test
|
||||
(:require [tech.v3.dataset :as ds]
|
||||
[tech.v3.dataset.protocols :as ds-proto]
|
||||
[clojure.test :refer [deftest is]]))
|
||||
|
||||
|
||||
(deftest all-missing-ds
  ;; A parser fed only empty rows yields a placeholder map - not a real
  ;; dataset - describing the row count and the all-missing status.
  (let [p  (ds/dataset-parser)
        _  (ds-proto/add-row p {})
        ds @p]
    (is (not (ds/dataset? ds)))
    ;; Bugfix: the :missing check was previously passed as the `is` message
    ;; argument and therefore never asserted; make it a real assertion.
    (is (= 1 (:tech.v3.dataset/row-count ds)))
    (is (= :all (:tech.v3.dataset/missing ds)))))
|
||||
@@ -0,0 +1,461 @@
|
||||
(ns tech.v3.dataset.reductions-test
|
||||
(:require [tech.v3.dataset.reductions :as ds-reduce]
|
||||
[tech.v3.dataset :as ds]
|
||||
[tech.v3.dataset.column :as ds-col]
|
||||
[tech.v3.datatype.functional :as dfn]
|
||||
[tech.v3.datatype :as dtype]
|
||||
[tech.v3.datatype.datetime :as dtype-dt]
|
||||
[tech.v3.datatype.argops :as argops]
|
||||
[tech.v3.datatype.statistics :as stats]
|
||||
[tech.v3.dataset.reductions.apache-data-sketch :as ds-sketch]
|
||||
[tech.v3.dataset.categorical :as dsc]
|
||||
[tech.v3.parallel.for :as pfor]
|
||||
[ham-fisted.api :as hamf]
|
||||
[ham-fisted.function :as hamf-fn]
|
||||
[ham-fisted.reduce :as hamf-rf]
|
||||
[ham-fisted.lazy-noncaching :as lznc]
|
||||
[clojure.test :refer [deftest is]]
|
||||
[clojure.core.protocols :as cl-proto])
|
||||
(:import [tech.v3.datatype UnaryPredicate FastStruct$FMapEntry]
|
||||
[java.time LocalDate YearMonth]
|
||||
[ham_fisted Consumers$IncConsumer Reductions IAMapEntry]
|
||||
[java.util ArrayList Map$Entry Arrays]))
|
||||
|
||||
|
||||
(deftest simple-reduction
  ;; Aggregating the same dataset three times should triple counts and sums
  ;; while leaving means unchanged; validate against a naive group-by.
  (let [stocks (ds/->dataset "test/data/stocks.csv" {:key-fn keyword})
        agg-ds (-> (ds-reduce/group-by-column-agg
                    :symbol
                    {:n-elems (ds-reduce/row-count)
                     :price-avg (ds-reduce/mean :price)
                     :price-sum (ds-reduce/sum :price)
                     :symbol (ds-reduce/first-value :symbol)
                     :n-dates (ds-reduce/count-distinct :date :int32)}
                    [stocks stocks stocks])
                   (ds/sort-by-column :symbol))
        _ (println agg-ds)
        ;; Reference result computed with an eager, per-group map pipeline.
        single-price (-> (->> (ds/group-by-column stocks :symbol)
                              (map (fn [[k ds]]
                                     {:symbol k
                                      :n-elems (ds/row-count ds)
                                      :price-sum (dfn/sum (ds :price))
                                      :price-avg (dfn/mean (ds :price))}))
                              (ds/->>dataset))
                         (ds/sort-by-column :symbol))]
    (is (= 5 (ds/row-count agg-ds)))
    (is (dfn/equals (agg-ds :n-elems)
                    (dfn/* 3 (single-price :n-elems))))
    (is (dfn/equals (agg-ds :price-sum)
                    (dfn/* 3 (single-price :price-sum))))
    (is (dfn/equals (agg-ds :price-avg)
                    (single-price :price-avg)))))
|
||||
|
||||
|
||||
(deftest simple-reduction-filtered
  ;; Same as simple-reduction but with an :index-filter restricting the
  ;; aggregation to rows whose price exceeds 100; compare against an
  ;; eagerly filtered dataset.
  (let [stocks (ds/->dataset "test/data/stocks.csv" {:key-fn keyword})
        agg-ds (-> (ds-reduce/group-by-column-agg
                    :symbol
                    {:n-elems (ds-reduce/row-count)
                     :price-avg (ds-reduce/mean :price)
                     :price-sum (ds-reduce/sum :price)
                     :symbol (ds-reduce/first-value :symbol)
                     :n-dates (ds-reduce/count-distinct :date :int32)}
                    {:index-filter (fn [dataset]
                                     (let [rdr (dtype/->reader (dataset :price))]
                                       (hamf-fn/long-predicate
                                        idx (> (.readDouble rdr idx) 100.0))))}
                    [stocks stocks stocks])
                   (ds/sort-by-column :symbol))
        fstocks (ds/filter-column stocks :price #(> % 100.0))
        single-price (->
                      (->> (ds/group-by-column fstocks :symbol)
                           (map (fn [[k ds]]
                                  {:symbol k
                                   :n-elems (ds/row-count ds)
                                   :price-sum (dfn/sum (ds :price))
                                   :price-avg (dfn/mean (ds :price))}))
                           (ds/->>dataset))
                      (ds/sort-by-column :symbol))]
    ;; Only 4 symbols survive the price filter.
    (is (= 4 (ds/row-count agg-ds)))
    (is (dfn/equals (agg-ds :n-elems)
                    (dfn/* 3 (single-price :n-elems))))
    (is (dfn/equals (agg-ds :price-sum)
                    (dfn/* 3 (single-price :price-sum))))
    (is (dfn/equals (agg-ds :price-avg)
                    (single-price :price-avg)))))
|
||||
|
||||
|
||||
(deftest issue-201-incorrect-result-column-count
  ;; Regression: the result column count must match the reducer map even
  ;; when several reducers read the same source column.
  (let [stocks (ds/->dataset "test/data/stocks.csv" {:key-fn keyword})
        agg-ds (ds-reduce/group-by-column-agg
                :symbol
                {:n-elems (ds-reduce/row-count)
                 :price-avg (ds-reduce/mean :price)
                 :price-avg2 (ds-reduce/mean :price)
                 :price-avg3 (ds-reduce/mean :price)
                 :price-sum (ds-reduce/sum :price)
                 :price-med (ds-reduce/prob-median :price)
                 :symbol (ds-reduce/first-value :symbol)
                 :n-dates (ds-reduce/count-distinct :date :int32)}
                [stocks stocks stocks])
        simple-agg-ds (ds-reduce/aggregate
                       {:n-elems (ds-reduce/row-count)
                        :price-avg (ds-reduce/mean :price)
                        :price-avg2 (ds-reduce/mean :price)
                        :price-avg3 (ds-reduce/mean :price)
                        :price-sum (ds-reduce/sum :price)
                        :price-med (ds-reduce/prob-median :price)
                        :symbol (ds-reduce/first-value :symbol)
                        :n-dates (ds-reduce/count-distinct :date :int32)}
                       [stocks stocks stocks])]
    (is (= 8 (ds/column-count agg-ds)))
    (is (= 8 (ds/column-count simple-agg-ds)))))
|
||||
|
||||
|
||||
(deftest data-sketches-test
  ;; Apache data-sketch reducers: HLL cardinalities are approximate, so
  ;; compare with a 0.1 relative tolerance.
  (let [stocks (ds/->dataset "test/data/stocks.csv" {:key-fn keyword})
        result (ds-reduce/aggregate
                {:n-elems (ds-reduce/row-count)
                 :n-dates (ds-reduce/count-distinct :date :int32)
                 :n-dates-hll (ds-sketch/prob-set-cardinality :date {:datatype :string})
                 :n-symbols-hll (ds-sketch/prob-set-cardinality
                                 :symbol {:datatype :string})
                 :quantiles (ds-sketch/prob-quantiles :price [0.25 0.5 0.75])
                 :cdfs (ds-sketch/prob-cdfs :price [50 100 150])
                 :pmfs (ds-sketch/prob-pmfs :price [50 100 150])}
                [stocks stocks stocks])
        {:keys [n-dates-hll n-symbols-hll]} (first (ds/mapseq-reader result))]
    ;; 123 distinct dates and 5 distinct symbols in stocks.csv.
    (is (dfn/equals [123 5]
                    [n-dates-hll
                     n-symbols-hll]
                    0.1))))
|
||||
|
||||
|
||||
(deftest reservoir-sampling-test
  ;; Smoke-test reservoir-based descriptive stats and reservoir datasets in
  ;; both whole-dataset and grouped aggregation modes.
  (let [stocks (ds/->dataset "test/data/stocks.csv" {:key-fn keyword})
        ds-seq [stocks stocks stocks]
        small-ds-seq [(-> (ds/shuffle stocks)
                          (ds/select-rows (range 50)))]
        agg-map {:n-elems (ds-reduce/row-count)
                 :price-std (ds-reduce/reservoir-desc-stat
                             :price 100 :standard-deviation)
                 :sub-ds (ds-reduce/reservoir-dataset 100)}
        straight (ds-reduce/aggregate agg-map ds-seq)
        straight-small (ds-reduce/aggregate agg-map small-ds-seq)
        grouped (ds-reduce/group-by-column-agg :symbol agg-map ds-seq)
        ;; Bugfix: previously aggregated ds-seq here, so the grouped "small"
        ;; case (fewer rows than the reservoir size) was never exercised.
        grouped-small (ds-reduce/group-by-column-agg :symbol agg-map small-ds-seq)]

    ;;Mainly ensuring that nothing throws.
    (is (every? #(or (= 3 (ds/column-count %))
                     (= 4 (ds/column-count %)))
                [straight straight-small
                 grouped grouped-small])))
  ;; Reservoir sampling must also propagate the missing set of the source.
  (let [missing-ds (ds/new-dataset [(ds-col/new-column
                                     :missing (range 1000)
                                     nil
                                     ;; every third index is marked missing
                                     (->> (range 1000)
                                          (map (fn [^long idx]
                                                 (when (== 0 (rem idx 3))
                                                   idx)))
                                          (remove nil?)))])
        agg-ds
        (ds-reduce/aggregate {:sub-ds (ds-reduce/reservoir-dataset 50)}
                             [missing-ds])
        sub-ds (first (:sub-ds agg-ds))]
    ;;Make sure we carry the missing set across
    (is (not (.isEmpty ^org.roaringbitmap.RoaringBitmap (ds/missing sub-ds))))
    (is (every? #(or (nil? %)
                     (not= 0 (rem (long %) 3)))
                (:missing sub-ds)))))
|
||||
|
||||
|
||||
(defn- create-otfrom-init-dataset
  "Build a synthetic simulation/placement dataset of n-rows rows.  Each row
  gets a random :start date 200-565 days in the past and an :end date up to
  n-expansion days after :start."
  [& [{:keys [n-simulations n-placements n-expansion n-rows]
       :or {n-simulations 100
            n-placements 50
            n-expansion 20
            n-rows 1000000}}]]
  (->> (for [idx (range n-rows)]
         (let [sd (.minusDays (dtype-dt/local-date) (+ 200 (rand-int 365)))
               ed (.plusDays sd (rand-int n-expansion))]
           {:simulation (rand-int n-simulations)
            :placement (rand-int n-placements)
            :start sd
            :end ed}))
       (ds/->>dataset)))
|
||||
|
||||
|
||||
;;Slightly less efficient than implementing an inline IReduceInit impl is to create
;;a record with a custom IReduceInit implementation.
(defrecord YMC [year-month ^long count]
  clojure.lang.IReduceInit
  ;; Reduce over the record as a sequence of map entries, honoring reduced?
  ;; short-circuiting and falling through to any extension-map entries.
  (reduce [this rfn init]
    (let [init (hamf/reduced-> rfn init
                               (clojure.lang.MapEntry/create :year-month year-month)
                               (clojure.lang.MapEntry/create :count count))]
      (if (and __extmap (not (reduced? init)))
        (reduce rfn init __extmap)
        init))))
|
||||
|
||||
|
||||
;; Factory handed to Map.computeIfAbsent: ignores the key and returns a
;; fresh IncConsumer counter for that entry.
(def inc-cons-fn (hamf-fn/function k (Consumers$IncConsumer.)))
|
||||
|
||||
(defn- tally-days-as-year-months
  "Expand one row's [start, end) date range into a reducible of
  {:year-month ym :count n-days} map-entry pairs, one per YearMonth touched."
  [{:keys [^LocalDate start ^LocalDate end]}]
  ;;Using a hash provider with equals semantics allows the hamf hashtable to
  ;;compete on equal terms with the java hashtable. In that we find that compute,
  ;;computeIfAbsent and reduce perform as fast as anything on the jvm when we are using
  ;;Object/equals and Object/hashCode for the map functionality.
  (let [tally (hamf/java-hashmap)]
    (dotimes [idx (.until start end java.time.temporal.ChronoUnit/DAYS)]
      (let [ym (YearMonth/from (.plusDays start idx))]
        ;;Compute if absent is ever so slightly faster than compute as it involves
        ;;less mutation of the original hashtable. It does, however, require the
        ;;value in the node itself to be mutable.
        (.inc ^Consumers$IncConsumer (.computeIfAbsent tally ym inc-cons-fn))))
    ;; Return a custom reducible: the outer reduce walks tally's entry set,
    ;; the inner reducible presents each entry as two map entries
    ;; (:year-month, :count) without allocating a persistent map.
    (hamf/custom-ireduce
     rfn acc
     (Reductions/iterReduce (.entrySet tally)
                            acc
                            (fn [acc ^Map$Entry kv]
                              (rfn acc
                                   (hamf/custom-ireduce
                                    rrfn aacc
                                    (-> aacc
                                        (rrfn (hamf/make-map-entry :year-month (.getKey kv)))
                                        (rrfn (hamf/make-map-entry :count (deref (.getValue kv))))))))))))
|
||||
|
||||
|
||||
(defn- otfrom-pathway
  "Rowwise benchmark pathway: expand each row into per-year-month tallies,
  then aggregate counts and quantiles per placement/year-month."
  [ds]
  (->> (ds/row-mapcat ds tally-days-as-year-months
                      ;;generate a sequence of datasets
                      {:result-type :as-seq})
       ;;sequence of datasets
       (ds-reduce/group-by-column-agg
        [:simulation :placement :year-month]
        {:count (ds-reduce/sum :count)})
       ;;single dataset - do joins and such here
       (#(let [ds %
               count (ds :count)]
           (assoc ds :count2 (dfn/sq count))))
       (ds-reduce/group-by-column-agg
        [:placement :year-month]
        {:min-count (ds-reduce/prob-quantile :count 0.0)
         :low-95-count (ds-reduce/prob-quantile :count 0.05)
         :q1-count (ds-reduce/prob-quantile :count 0.25)
         :median-count (ds-reduce/prob-quantile :count 0.50)
         :q3-count (ds-reduce/prob-quantile :count 0.75)
         :high-95-count (ds-reduce/prob-quantile :count 0.95)
         :max-count (ds-reduce/prob-quantile :count 1.0)
         :count (ds-reduce/sum :count)})))
|
||||
|
||||
|
||||
(defn- tally-days-columnwise
  "Columnwise variant of the day tally: build parallel index/year-month/count
  buffers for the whole dataset, then reconstruct the expanded dataset via
  select-rows plus assoc of the two new columns."
  [ds]
  (let [starts (dtype/->buffer (ds :start))
        ends (dtype/->buffer (ds :end))
        n-rows (.lsize starts)
        indexes (dtype/prealloc-list :int64 n-rows)
        year-months (dtype/prealloc-list :object n-rows) ;;ArrayList works fine here also.
        counts (dtype/prealloc-list :int32 n-rows)
        ;; NOTE(review): incrementor appears unused - the computeIfAbsent +
        ;; IncConsumer path below is what actually increments the tally.
        incrementor (hamf-fn/bi-function k v
                                         (if v
                                           (unchecked-inc (long v))
                                           1))
        tally (hamf/java-hashmap)]
    ;;Loop through dataset and append results columnwise.
    (dotimes [row-idx n-rows]
      ;;minimize hashtable resize operations
      (.clear tally)
      (let [^LocalDate start (starts row-idx)
            ^LocalDate end (ends row-idx)
            nd (.until start end java.time.temporal.ChronoUnit/DAYS)]
        ;; Tally one count per YearMonth touched by [start, end).
        (dotimes [day-idx nd]
          (.inc ^Consumers$IncConsumer (.computeIfAbsent tally (YearMonth/from (.plusDays start day-idx)) inc-cons-fn)))
        ;; Flush this row's tallies into the three parallel buffers.
        (.forEach tally (hamf-fn/bi-consumer
                         k v
                         (.addLong indexes row-idx)
                         (.add year-months k)
                         (.add counts (deref v))))))
    (-> (ds/select-rows ds indexes)
        ;;avoid datatype and missing scans
        (assoc :year-month #:tech.v3.dataset{:data year-months
                                             :force-datatype? true
                                             :missing (tech.v3.datatype.bitmap/->bitmap)}
               :count counts))))
|
||||
|
||||
|
||||
(defn- otfrom-columnwise-pathway
  "Columnwise benchmark pathway; same aggregation shape as otfrom-pathway
  but expands rows via pmap-ds + tally-days-columnwise."
  [ds]
  (->> (ds/pmap-ds ds tally-days-columnwise
                   ;;generate a sequence of datasets
                   {:result-type :as-seq})
       ;;sequence of datasets
       (ds-reduce/group-by-column-agg
        [:simulation :placement :year-month]
        {:count (ds-reduce/sum :count)})
       ;;single dataset - do joins and such here
       (#(let [ds %
               count (ds :count)]
           ;;return a sequence of datasets for next step
           [(assoc ds :count2 (dfn/sq count))]))
       (ds-reduce/group-by-column-agg
        [:placement :year-month]
        {:min-count (ds-reduce/prob-quantile :count 0.0)
         :low-95-count (ds-reduce/prob-quantile :count 0.05)
         :q1-count (ds-reduce/prob-quantile :count 0.25)
         :median-count (ds-reduce/prob-quantile :count 0.50)
         :q3-count (ds-reduce/prob-quantile :count 0.75)
         :high-95-count (ds-reduce/prob-quantile :count 0.95)
         :max-count (ds-reduce/prob-quantile :count 1.0)
         :count (ds-reduce/sum :count)})))
|
||||
|
||||
|
||||
(deftest otfrom-pathway-test
  ;; Both pathways must account for every day in every row's date range;
  ;; also prints rough timings for the rowwise vs columnwise variants.
  (let [ds (create-otfrom-init-dataset)
        start (ds :start)
        end (ds :end)
        total-count (->> (dtype/emap #(dtype-dt/between %1 %2 :days) :int64 start end)
                         (dfn/sum))
        ;;warmup
        _ (do (otfrom-pathway ds)
              (otfrom-columnwise-pathway ds))
        _ (println "otfrom pathway timing")
        ofds (time (otfrom-pathway ds))
        _ (println "otfrom columnwise pathway timing")
        of-cwise-ds (time (otfrom-columnwise-pathway ds))
        ofsum (dfn/sum (ofds :count))
        of-cwise-sum (dfn/sum (of-cwise-ds :count))]
    (is (= ofsum total-count))
    (is (= of-cwise-sum total-count))))
|
||||
|
||||
|
||||
(deftest issue-314
  ;; Regression: a column produced by the distinct reducer (sets of values)
  ;; must survive a subsequent column-map untouched.
  (let [dstds (->
               (ds-reduce/group-by-column-agg
                :foo
                {:foos (ds-reduce/distinct :value)}
                (ds/->dataset (into [] (map (fn [i] {:foo 'foo :value (str i)})) (range 3))))
               (ds/column-map :foos-2 (fn [values] values) [:foos]))]
    (is (= ["0" "1" "2"]
           (vec (first (dstds :foos-2)))))))
|
||||
|
||||
|
||||
(deftest issue-312
  ;; count-distinct over a nippy-loaded string column must yield a
  ;; positive count.
  (let [agg (ds-reduce/aggregate
             {:n-elems (ds-reduce/count-distinct :genre)}
             [(ds/->dataset "test/data/example-genres.nippy")])
        n   (first (agg :n-elems))]
    (is (pos? n))))
|
||||
|
||||
|
||||
(deftest group-by-agg-changes-source
  ;; Regression: running group-by-column-agg must not mutate the source
  ;; dataset's column map.  ds2 is bound only to force the aggregation to
  ;; run; the assertion inspects the original ds afterwards.
  (let [ds (-> [{:job "Professional" :sex "Male" :age "[35-40)" :salary 3991.2}
                {:job "Professional" :sex "Male" :age "[35-40)" :salary 2364.6}
                {:job "Professional" :sex "Male" :age "[35-40)" :salary 3114.7}
                {:job "Artist" :sex "Female" :age "[35-35)" :salary 2345.1}
                {:job "Artist" :sex "Female" :age "[35-35)" :salary 4562.1}
                {:job "Artist" :sex "Female" :age "[35-35)" :salary 1214.1}
                {:job "Artist" :sex "Female" :age "[35-35)" :salary 4531.1}]
               (ds/->dataset)
               (assoc "salary (binned)" ["a" "b" "c" "d" "e" "f" "g"]))
        ds2 (ds-reduce/group-by-column-agg
             [:job :sex :age]
             {:fj (ds-reduce/row-count)}
             [ds])]
    (is (= #{:job :sex :age :salary "salary (binned)"}
           (set (keys (.-colmap ds)))))

    ))
|
||||
|
||||
(deftest maximum-test
  ;; The maximum reducer must agree with sorting the column and taking the
  ;; final value.
  (let [src      (ds/->dataset {:x (repeatedly 100 rand)})
        expected (-> (ds/sort-by-column src :x) :x last)
        result   (ds-reduce/aggregate {:max-x (ds-reduce/maximum :x)} src)]
    (is (= 1 (ds/row-count result)))
    (is (= expected (first (:max-x result))))))
|
||||
|
||||
(comment
|
||||
|
||||
(do
|
||||
(defn max-int64
|
||||
[colname]
|
||||
(ds-reduce/reducer->column-reducer
|
||||
(hamf-rf/parallel-reducer (fn ^long [] Long/MIN_VALUE)
|
||||
(fn ^long [^long a ^long b] (Long/max a b))
|
||||
(fn ^long [^long a ^long b] (Long/max a b)))
|
||||
:int64
|
||||
colname))
|
||||
|
||||
(defn sum-int64
|
||||
[colname]
|
||||
(ds-reduce/reducer->column-reducer
|
||||
(hamf-rf/parallel-reducer (fn ^long [] 0)
|
||||
(fn ^long [^long a ^long b] (Long/sum a b))
|
||||
(fn ^long [^long a ^long b] (Long/sum a b)))
|
||||
:int64
|
||||
colname))
|
||||
|
||||
(defn max-float64
|
||||
[colname]
|
||||
(ds-reduce/reducer->column-reducer
|
||||
(hamf-rf/parallel-reducer (fn ^double [] 0.0)
|
||||
(fn ^double [^double a ^double b] (Double/max a b))
|
||||
(fn ^double [^double a ^double b] (Double/max a b)))
|
||||
:float64
|
||||
colname))
|
||||
|
||||
(defn sum-float64
|
||||
[colname]
|
||||
(ds-reduce/reducer->column-reducer
|
||||
(hamf-rf/parallel-reducer (fn ^double [] 0.0)
|
||||
(fn ^double [^double a ^double b] (Double/sum a b))
|
||||
(fn ^double [^double a ^double b] (Double/sum a b)))
|
||||
:float64
|
||||
colname))
|
||||
|
||||
(deftype FloatSumObj [^{:unsynchronized-mutable true
|
||||
:tag double} dval]
|
||||
java.util.function.DoubleConsumer
|
||||
(accept [this v] (set! dval (+ dval v)))
|
||||
ham_fisted.Reducible
|
||||
(reduce [this other] (set! dval (+ dval (.- dval ^FloatSumObj other))))
|
||||
clojure.lang.IDeref
|
||||
(deref [this] dval))
|
||||
|
||||
(defn sum-float64-consumer
|
||||
[colname]
|
||||
(ds-reduce/reducer->column-reducer
|
||||
(hamf-rf/double-consumer-reducer #(FloatSumObj. 0.0))
|
||||
:float64
|
||||
colname))
|
||||
|
||||
(def n-rows 500000)
|
||||
|
||||
(def ds (ds/->dataset (repeatedly n-rows
|
||||
(fn [] {:a (rand-int 50000)
|
||||
:b (rand-int 500)}))))
|
||||
|
||||
(def one-hot (dsc/fit-one-hot ds :b)))
|
||||
|
||||
|
||||
(dotimes [idx 100]
|
||||
(time
|
||||
(ds-reduce/group-by-column-agg
|
||||
:a
|
||||
(into {} (for [col (-> one-hot :one-hot-table vals)
|
||||
:when (not= col :a)]
|
||||
{col (sum-float64 col)}))
|
||||
{:parser-fn :float64}
|
||||
(dsc/transform-one-hot ds one-hot))))
|
||||
|
||||
(for [[name reducer] {:ds-reduce/sum ds-reduce/sum
|
||||
:max-int64 max-int64
|
||||
:sum-int64 sum-int64
|
||||
:max-float64 max-float64
|
||||
:sum-float64 sum-float64}]
|
||||
{name (repeatedly 3 (fn []
|
||||
))})
|
||||
|
||||
)
|
||||
@@ -0,0 +1,14 @@
|
||||
(ns tech.v3.dataset.set-test
|
||||
(:require [tech.v3.dataset :as ds]
|
||||
[tech.v3.dataset.set :as ds-set]
|
||||
[clojure.test :refer [deftest is]]))
|
||||
|
||||
|
||||
|
||||
(deftest union-intersection-test
  ;; reduce-union keeps rows with multiplicity; reduce-intersection keeps
  ;; only rows present in every input dataset (also with multiplicity).
  (let [left  (ds/->dataset [{:a 1 :b 2} {:a 1 :b 2} {:a 2 :b 3}])
        right (ds/->dataset [{:a 1 :b 2} {:a 1 :b 2} {:a 3 :b 3}])]
    (is (= [{:a 2, :b 3} {:a 3, :b 3} {:a 1, :b 2} {:a 1, :b 2}]
           (ds/rows (ds-set/reduce-union [left right]))))
    (is (= [{:a 1, :b 2} {:a 1, :b 2}]
           (ds/rows (ds-set/reduce-intersection [left right]))))))
|
||||
@@ -0,0 +1,26 @@
|
||||
(ns tech.v3.dataset.test-utils
|
||||
(:require [tech.v3.io :as io]
|
||||
[clojure.string :as s]
|
||||
[tech.v3.dataset :as ds]
|
||||
[camel-snake-kebab.core :refer [->kebab-case]]))
|
||||
|
||||
(defn load-mapseq-fruit-dataset
  "Parse the whitespace-delimited fruit data file into a dataset.  The first
  line supplies kebab-cased keyword column names; each value parses as a
  double when possible, otherwise becomes a kebab-cased keyword."
  []
  (let [fruit-ds (slurp (io/input-stream "test/data/fruit_data_with_colors.txt"))
        dataset (->> (s/split fruit-ds #"\n")
                     (mapv #(s/split % #"\s+")))
        ds-keys (->> (first dataset)
                     (mapv (comp keyword ->kebab-case)))]
    (->> (rest dataset)
         (map (fn [ds-line]
                (->> ds-line
                     (map (fn [ds-val]
                            (try
                              (Double/parseDouble ^String ds-val)
                              (catch Throwable e
                                ;; non-numeric cell: treat as categorical keyword
                                (-> (->kebab-case ds-val)
                                    keyword)))))
                     (zipmap ds-keys))))
         (ds/->dataset))))
|
||||
|
||||
;; Memoized so repeated test runs parse the file only once.
(def mapseq-fruit-dataset (memoize load-mapseq-fruit-dataset))
|
||||
@@ -0,0 +1,30 @@
|
||||
(ns tech.v3.dataset.update-columns-test
|
||||
(:require [tech.v3.dataset :as ds]
|
||||
[tech.v3.dataset.column-filters :as cf]
|
||||
[tech.v3.datatype.functional :as dfn]
|
||||
[clojure.test :refer [deftest is]]))
|
||||
|
||||
(deftest update-columns-selector-fn
  ;; update-columns accepts either a column-filter fn or an explicit seq of
  ;; column names; both forms should z-score-normalize the numeric columns
  ;; and leave the string column untouched.
  (let [ds (ds/->dataset {:a [1. 2. 3. 4.]
                          :b [5 6 7 8]
                          :c ["A" "B" "C" "D"]})
        ds' (-> ds
                (ds/update-columns cf/numeric
                                   #(dfn// (dfn/- % (dfn/mean %))
                                           (dfn/standard-deviation %)))
                )]
    (is (> 0.001 (Math/abs (reduce + (map - [-1.16189 -0.38729 0.38729 1.16189] (vec (ds' :a)))))))
    ;; Bugfix: previously checked the nonexistent column :d, which passed
    ;; vacuously on an empty seq.  :b is the other numeric column and, being
    ;; evenly spaced like :a, has identical z-scores.
    (is (> 0.001 (Math/abs (reduce + (map - [-1.16189 -0.38729 0.38729 1.16189] (vec (ds' :b)))))))
    (is (= ["A" "B" "C" "D"] (vec (ds' :c)))))

  (let [ds (ds/->dataset {:a [1. 2. 3. 4.]
                          :b [5 6 7 8]
                          :c ["A" "B" "C" "D"]})
        ds' (as-> ds $
              (ds/update-columns $ (ds/column-names (cf/numeric $))
                                 #(dfn// (dfn/- % (dfn/mean %))
                                         (dfn/standard-deviation %)))
              )]
    (is (> 0.001 (Math/abs (reduce + (map - [-1.16189 -0.38729 0.38729 1.16189] (vec (ds' :a)))))))
    ;; Bugfix: :d -> :b, as above.
    (is (> 0.001 (Math/abs (reduce + (map - [-1.16189 -0.38729 0.38729 1.16189] (vec (ds' :b)))))))
    (is (= ["A" "B" "C" "D"] (vec (ds' :c))))))
|
||||
+1895
File diff suppressed because it is too large
Load Diff
+354
@@ -0,0 +1,354 @@
|
||||
(ns tech.v3.libs.arrow-test
|
||||
(:require [tech.v3.libs.arrow :as arrow]
|
||||
[tech.v3.dataset :as ds]
|
||||
[tech.v3.dataset.column :as ds-col]
|
||||
[tech.v3.dataset.impl.sparse-column :as sparse-col]
|
||||
[tech.v3.datatype.functional :as dfn]
|
||||
[tech.v3.datatype :as dtype]
|
||||
[tech.v3.libs.parquet]
|
||||
[tech.v3.datatype.datetime :as dtype-dt]
|
||||
[tech.v3.resource :as resource]
|
||||
[clojure.test :refer [deftest is]])
|
||||
(:import [java.time LocalTime]
|
||||
[tech.v3.dataset Text]
|
||||
[java.util Map]
|
||||
[java.io ByteArrayOutputStream ByteArrayInputStream]))
|
||||
|
||||
|
||||
(tech.v3.dataset.utils/set-slf4j-log-level :info)
|
||||
|
||||
|
||||
(defn supported-datatype-ds
  "Return an n-row dataset named :testtable with one column per datatype the
  arrow pathway is expected to round-trip.  Zero-arity builds 10 rows."
  ([n]
   (-> (ds/->dataset {;; Generalized: cycle the pattern so the column length
                      ;; tracks n (was a fixed 10-element vector regardless
                      ;; of n); values are unchanged for the default n=10.
                      :boolean (vec (take n (cycle [true false true true false
                                                    false true false false true])))
                      :bytes (byte-array (range n))
                      :ubytes (dtype/make-container :uint8 (dfn/rem (range n) 256))
                      :shorts (short-array (range n))
                      :ushorts (dtype/make-container :uint16 (range n))
                      :ints (int-array (range n))
                      :uints (dtype/make-container :uint32 (range n))
                      :longs (long-array (range n))
                      :floats (float-array (range n))
                      :doubles (double-array (range n))
                      :strings (map str (range n))
                      :text (map (comp #(Text. %) str) (range n))
                      :instants (repeatedly n dtype-dt/instant)
                      :bigdec (repeatedly n #(BigDecimal/valueOf (+ 100 (rand-int 1700)) 2))
                      ;; :bigint (let [rng (java.util.Random.)]
                      ;;           (repeatedly n #(BigInteger. 256 rng )))
                      ;;external formats often don't support dash-case
                      :local_dates (repeatedly n dtype-dt/local-date)
                      :local_times (repeatedly n dtype-dt/local-time)
                      :uuids (repeatedly n #(java.util.UUID/randomUUID))})
       (vary-meta assoc :name :testtable)))
  ([]
   (supported-datatype-ds 10)))
|
||||
|
||||
|
||||
|
||||
(comment
|
||||
(arrow/dataset->stream! (supported-datatype-ds 1000) "test/data/alldtypes.arrow-ipc-lz4"
|
||||
{:compression :lz4})
|
||||
|
||||
(arrow/dataset->stream! (supported-datatype-ds 1000) "test/data/alldtypes.arrow-ipc-zstd"
|
||||
{:compression :zstd})
|
||||
|
||||
|
||||
(let [sds (supported-datatype-ds 1000)]
|
||||
(arrow/dataset-seq->stream! "test/data/alldtypes.arrow-file-zstd"
|
||||
{:compression :zstd
|
||||
:format :file
|
||||
:strings-as-text? true}
|
||||
[(ds/select-rows sds (range 500))
|
||||
;;test when you have to add more string dictionary values
|
||||
(ds/select-rows sds (range 500 1000))]))
|
||||
|
||||
(def ignored (arrow/stream->dataset-seq "test/data/alldtypes.arrow-file-zstd"))
|
||||
|
||||
(def ignored (arrow/stream->dataset "test/data/alldtypes.arrow-ipc-zstd"))
|
||||
|
||||
)
|
||||
|
||||
|
||||
(deftest base-datatype-test
  ;; Round-trip every supported datatype through an arrow stream file,
  ;; reading back with both the mmap and copying readers; the temp file is
  ;; deleted regardless of outcome.
  (try
    (resource/stack-resource-context
     (let [ds (supported-datatype-ds)
           _ (arrow/dataset->stream! ds "alldtypes.arrow")
           mmap-ds (arrow/stream->dataset "alldtypes.arrow" {:open-type :mmap
                                                             :key-fn keyword})
           copy-ds (arrow/stream->dataset "alldtypes.arrow" {:key-fn keyword})]
       (doseq [col (vals ds)]
         (let [cname ((meta col) :name)
               dt (dtype/elemwise-datatype col)
               inp-col (mmap-ds cname)
               cp-col (copy-ds cname)]
           ;; datatypes must survive both read paths...
           (is (= dt (dtype/elemwise-datatype inp-col)) (str "inplace failure " cname))
           (is (= dt (dtype/elemwise-datatype cp-col)) (str "copy failure " cname))

           ;; ...and so must the values themselves.
           (is (= (vec col) (vec inp-col)) (str "inplace failure " cname))
           (is (= (vec col) (vec cp-col)) (str "copy failure " cname))))))
    (finally
      (.delete (java.io.File. "alldtypes.arrow")))))
|
||||
|
||||
|
||||
(deftest base-sparse-datatype-test
  ;; As base-datatype-test, but the source dataset uses sparse columns;
  ;; sparseness must be preserved by both read paths.
  (try
    (resource/stack-resource-context
     (let [ds (sparse-col/->sparse-ds (supported-datatype-ds) 0.0)
           _ (arrow/dataset->stream! ds "alldtypes-sparse.arrow")
           mmap-ds (arrow/stream->dataset "alldtypes-sparse.arrow" {:open-type :mmap
                                                                    :key-fn keyword})
           copy-ds (arrow/stream->dataset "alldtypes-sparse.arrow" {:key-fn keyword})]
       (is (every? sparse-col/is-sparse? (.values ^Map ds)))
       (is (every? sparse-col/is-sparse? (.values ^Map mmap-ds)))
       (is (every? sparse-col/is-sparse? (.values ^Map copy-ds)))
       (doseq [col (vals ds)]
         (let [cname ((meta col) :name)
               dt (dtype/elemwise-datatype col)
               inp-col (mmap-ds cname)
               cp-col (copy-ds cname)]
           (is (= dt (dtype/elemwise-datatype inp-col)) (str "inplace failure " cname))
           (is (= dt (dtype/elemwise-datatype cp-col)) (str "copy failure " cname))

           (is (= (vec col) (vec inp-col)) (str "inplace failure " cname))
           (is (= (vec col) (vec cp-col)) (str "copy failure " cname))))))
    (finally
      (.delete (java.io.File. "alldtypes-sparse.arrow")))))
|
||||
|
||||
|
||||
(deftest arrow-file-types
  ;; Feather/IPC files in several on-disk variants must all load fully.
  ;;lz4 compression
  (let [all-files ["test/data/alldtypes.arrow-feather" ;lz4
                   "test/data/alldtypes.arrow-feather-compressed" ;zstd
                   "test/data/alldtypes.arrow-feather-v1" ;v1
                   ]]
    (doseq [file all-files]
      (is (= 1000 (ds/row-count (arrow/stream->dataset file)))))
    ;; lz4 with dependent frames
    (is (= 31962 (ds/row-count (arrow/stream->dataset "test/data/tweets_sentiment.feather"))))))
|
||||
|
||||
|
||||
(deftest base-ds-seq-test
  ;; Writing a sequence of datasets produces a multi-batch stream; both read
  ;; paths must recover every batch with datatypes and values intact.
  (try
    (let [ds (supported-datatype-ds)
          _ (arrow/dataset-seq->stream! "alldtypes-seq.arrow" {:strings-as-text? false} [ds ds ds])
          mmap-ds-seq (arrow/stream->dataset-seq "alldtypes-seq.arrow" {:key-fn keyword
                                                                        :open-type :mmap})
          copy-ds-seq (arrow/stream->dataset-seq "alldtypes-seq.arrow" {:key-fn keyword})]
      (is (= 3 (count mmap-ds-seq)))
      (is (= 3 (count copy-ds-seq)))
      ;; Check the final batch - exercises dictionary growth across batches.
      (let [mmap-ds (last mmap-ds-seq)
            copy-ds (last copy-ds-seq)]
        (doseq [col (vals ds)]
          (let [cname ((meta col) :name)
                dt (dtype/elemwise-datatype col)
                inp-col (mmap-ds cname)
                cp-col (copy-ds cname)]
            (is (= dt (dtype/elemwise-datatype inp-col)) (str "inplace failure " cname))
            (is (= dt (dtype/elemwise-datatype cp-col)) (str "copy failure " cname))
            (is (= (vec col) (vec inp-col)) (str "inplace failure " cname))
            (is (= (vec col) (vec cp-col)) (str "copy failure " cname))))))
    (finally
      (.delete (java.io.File. "alldtypes-seq.arrow")))))
|
||||
|
||||
|
||||
(deftest simple-stocks
  ;; Round-trip stocks.csv through our writer and compare against files
  ;; produced by pyarrow; python stores strings inline (as text).
  (try
    (let [stocks (ds/->dataset "test/data/stocks.csv")
          _ (arrow/dataset->stream! stocks "temp.stocks.arrow")
          stocks-copying (arrow/stream->dataset "temp.stocks.arrow")
          stocks-inplace (arrow/stream->dataset "temp.stocks.arrow" {:open-type :mmap})
          pystocks-copying (arrow/stream->dataset "test/data/stocks.pyarrow.stream")
          pystocks-inplace (arrow/stream->dataset "test/data/stocks.pyarrow.stream")]
      ;;This is here just to make sure that the data isn't cleaned up until it
      ;;actually can safely be cleaned up. This was a bug that caused datatype to
      ;;bump from 5.11 to 5.12
      (System/gc)
      (is (dfn/equals (stocks "price") (stocks-copying "price")))
      (is (dfn/equals (stocks "price") (stocks-inplace "price")))
      (is (dfn/equals (stocks "price") (pystocks-copying "price")))
      (is (dfn/equals (stocks "price") (pystocks-inplace "price")))

      (is (= (vec (stocks "symbol")) (vec (stocks-copying "symbol"))))
      (is (= (vec (stocks "symbol")) (vec (stocks-inplace "symbol"))))
      ;;python saves strings inline in the file - equivalent to :strings-as-text?
      ;;save option
      (is (= (vec (stocks "symbol")) (mapv str (pystocks-copying "symbol"))))
      (is (= (vec (stocks "symbol")) (mapv str (pystocks-inplace "symbol")))))
    (finally
      (.delete (java.io.File. "temp.stocks.arrow")))))
|
||||
|
||||
|
||||
(deftest ames-house-prices
  ;; Larger real-world dataset: values and, importantly, missing-value sets
  ;; must survive the round trip and match pyarrow-produced files.
  (try
    (let [ames (ds/->dataset "test/data/ames-house-prices/train.csv")
          _ (arrow/dataset->stream! ames "temp.ames.arrow")
          ames-copying (arrow/stream->dataset "temp.ames.arrow")
          ames-inplace (arrow/stream->dataset "temp.ames.arrow" {:open-type :mmap})
          pyames-copying (arrow/stream->dataset "test/data/ames.pyarrow.stream")
          pyames-inplace (arrow/stream->dataset "test/data/ames.pyarrow.stream")]
      (System/gc)
      (is (dfn/equals (ames "SalePrice") (ames-copying "SalePrice")))
      (is (dfn/equals (ames "SalePrice") (ames-inplace "SalePrice")))
      (is (= (ds-col/missing (ames "LotFrontage"))
             (ds-col/missing (ames-copying "LotFrontage"))))
      (is (= (ds-col/missing (ames "LotFrontage"))
             (ds-col/missing (ames-inplace "LotFrontage"))))
      ;; guard against a trivially-passing empty missing set
      (is (not= 0 (dtype/ecount (ds-col/missing (ames-inplace "LotFrontage")))))
      (is (dfn/equals (ames "SalePrice") (pyames-copying "SalePrice")))
      (is (dfn/equals (ames "SalePrice") (pyames-inplace "SalePrice")))
      (is (= (ds-col/missing (ames "LotFrontage"))
             (ds-col/missing (pyames-copying "LotFrontage"))))
      (is (= (ds-col/missing (ames "LotFrontage"))
             (ds-col/missing (pyames-inplace "LotFrontage")))))
    (finally
      (.delete (java.io.File. "temp.ames.arrow")))))
|
||||
|
||||
|
||||
(deftest ames-compression-test
  ;; Write the ames dataset uncompressed, zstd, lz4 and sparse+zstd; print a
  ;; small file-size comparison table and verify all variants decode to the
  ;; same data.  Temp files are always removed.
  (try
    (let [ames (ds/->dataset "test/data/ames-house-prices/train.csv")
          _ (arrow/dataset->stream! ames "ames-uncompressed.arrow")
          _ (arrow/dataset->stream! ames "ames-zstd.arrow" {:compression
                                                            {:compression-type :zstd
                                                             ;;default is 3
                                                             :level 5}})
          _ (arrow/dataset->stream! ames "ames-lz4.arrow" {:compression :lz4})
          _ (arrow/dataset->stream! (sparse-col/->sparse-ds ames)
                                    "ames-sparse-zstd.arrow" {:compression
                                                              {:compression-type :zstd
                                                               ;;default is 3
                                                               :level 5}})
          file-len (fn [path] (.length (java.io.File. (str path))))
          _ (println (ds/->dataset {:save-type [:uncompressed :zstd :sparse-zstd :lz4]
                                    :file-size [(file-len "ames-uncompressed.arrow")
                                                (file-len "ames-zstd.arrow")
                                                (file-len "ames-sparse-zstd.arrow")
                                                (file-len "ames-lz4.arrow")]}))
          uncomp (arrow/stream->dataset "ames-uncompressed.arrow")
          zstd (arrow/stream->dataset "ames-zstd.arrow")
          sparse-zstd (arrow/stream->dataset "ames-sparse-zstd.arrow")
          lz4 (arrow/stream->dataset "ames-lz4.arrow")]
      (System/gc)
      (is (dfn/equals (uncomp "SalePrice") (zstd "SalePrice")))
      (is (dfn/equals (uncomp "LotFrontage") (sparse-zstd "LotFrontage")))
      (is (dfn/equals (uncomp "SalePrice") (lz4 "SalePrice"))))
    (finally
      (.delete (java.io.File. "ames-uncompressed.arrow"))
      (.delete (java.io.File. "ames-zstd.arrow"))
      (.delete (java.io.File. "ames-sparse-zstd.arrow"))
      (.delete (java.io.File. "ames-lz4.arrow")))))
|
||||
|
||||
|
||||
(deftest date-arrow-test
  ;; With :integer-datetime-types? dates load as raw :epoch-days integers;
  ;; without it they unpack to local dates.
  (let [date-data (arrow/read-stream-dataset-copying "test/data/with_date.arrow"
                                                     {:integer-datetime-types? true})]
    (is (= [18070 18072 18063]
           (date-data "date")))
    (is (= :epoch-days (dtype/elemwise-datatype (date-data "date")))))
  (let [date-data (arrow/read-stream-dataset-copying "test/data/with_date.arrow")]
    (is (= (mapv #(java.time.LocalDate/parse %)
                 ["2019-06-23" "2019-06-25" "2019-06-16"])
           (date-data "date")))
    (is (= :packed-local-date (dtype/elemwise-datatype (date-data "date"))))))
|
||||
|
||||
;; Regression (per the test name) for a snappy-compressed parquet file that
;; previously caused a crash: converting it to an arrow stream and reading it
;; back must preserve the dataset's missing-value sets.
(deftest odd-parquet-crash
  (let [test-data (ds/->dataset "test/data/part-00000-74d3eb51-bc9c-4ba5-9d13-9e0d71eea31f.c000.snappy.parquet")]
    (try
      (arrow/write-dataset-to-stream! test-data "test.arrow")
      (let [arrow-ds (arrow/read-stream-dataset-copying "test.arrow")]
        (is (= (ds/missing test-data)
               (ds/missing arrow-ds))))
      (finally
        ;; Always remove the temp arrow file.
        (.delete (java.io.File. "test.arrow"))))))
|
||||
|
||||
|
||||
(deftest failed-R-file
  ;; An IPC stream produced by R must yield identical column names whether it
  ;; is read via the copying or the in-place code path.
  (let [copying (arrow/read-stream-dataset-copying "test/data/part-8981.ipc_stream")
        inplace (arrow/read-stream-dataset-inplace "test/data/part-8981.ipc_stream")]
    (is (= (vec (ds/column-names copying))
           (vec (ds/column-names inplace))))))
|
||||
|
||||
|
||||
(deftest large-var-char-file
  ;; Large-var-char encoded IPC data must read identically through the copying
  ;; and in-place readers - same column names and same first-column contents.
  (let [copying (arrow/read-stream-dataset-copying "test/data/largeVarChar.ipc")
        inplace (arrow/read-stream-dataset-inplace "test/data/largeVarChar.ipc")]
    (is (= (vec (ds/column-names copying))
           (vec (ds/column-names inplace))))
    (is (= (vec (first (ds/columns copying)))
           (vec (first (ds/columns inplace)))))))
|
||||
|
||||
|
||||
;; UUID extension-type support.  First read a python-written arrow file whose
;; :id column carries the uuid extension type; then round-trip a uuid column
;; through the arrow stream writer and verify both readers preserve the
;; :uuid datatype and the values.
(deftest uuid-test
  (let [py-uuid (ds/->dataset "test/data/uuid_ext.arrow" {:key-fn keyword})]
    (is (= :uuid (dtype/elemwise-datatype (py-uuid :id))))
    (is (= (mapv #(java.util.UUID/fromString %)
                 ["8be643d6-0df7-4e5e-837c-f94170c87914"
                  "24bc9cf4-e2e8-444f-bb2d-82394f33ff76"
                  "e8149e1b-aef6-4671-b1b4-3b7a21eed92a"])
           (py-uuid :id))))
  (try
    (let [uuid-ds (ds/->dataset "test/data/uuid.parquet"
                                {:parser-fn {"uuids" :uuid}})
          _ (arrow/write-dataset-to-stream! uuid-ds "test-uuid.arrow")
          copying-ds (arrow/read-stream-dataset-copying "test-uuid.arrow")
          inplace-ds (arrow/read-stream-dataset-inplace "test-uuid.arrow")]
      ;; Both read paths must keep the uuid datatype and agree on values.
      (is (= :uuid ((comp :datatype meta) (copying-ds "uuids"))))
      (is (= :uuid ((comp :datatype meta) (inplace-ds "uuids"))))
      (is (= (vec (copying-ds "uuids"))
             (vec (inplace-ds "uuids"))))
      (is (= (vec (uuid-ds "uuids"))
             (vec (copying-ds "uuids")))))
    (finally
      (.delete (java.io.File. "test-uuid.arrow")))))
|
||||
|
||||
|
||||
;; LocalTime columns become :packed-local-time and must round-trip through the
;; arrow stream format identically via the copying and in-place readers.
(deftest local-time
  (try
    (let [ds (ds/->dataset {"a" (range 10)
                            "b" (repeat 10 (java.time.LocalTime/now))})
          _ (arrow/write-dataset-to-stream! ds "test-local-time.arrow")
          copying-ds (arrow/read-stream-dataset-copying "test-local-time.arrow")
          inplace-ds (arrow/read-stream-dataset-inplace "test-local-time.arrow")]
      (is (= :packed-local-time (dtype/elemwise-datatype (copying-ds "b"))))
      (is (= :packed-local-time (dtype/elemwise-datatype (inplace-ds "b"))))
      (is (= (vec (copying-ds "b"))
             (vec (inplace-ds "b"))))
      ;;Making a primitive container will use the packed data.
      (is (= (vec (ds "b"))
             (vec (copying-ds "b")))))
    (finally
      (.delete (java.io.File. "test-local-time.arrow")))))
|
||||
|
||||
|
||||
;; Double round-trip of an lz4-compressed string column through an in-memory
;; byte stream.  The same ByteArrayOutputStream is reused (via .reset) for the
;; second write, so the ordering of the let bindings matters.
(deftest string-arrow
  (let [dataset (ds/->dataset [{"col1" "a"}] {:parser-fn :string})
        baos (ByteArrayOutputStream.)]
    (resource/stack-resource-context
     (arrow/dataset->stream! dataset baos {:compression :lz4})
     (let [written-bytes (.toByteArray baos)
           arrow-ds-rtt (arrow/stream->dataset written-bytes)
           ;; Reuse the buffer for the second (re-serialized) round trip.
           _ (.reset baos)
           _ (arrow/dataset->stream! arrow-ds-rtt baos {:compression :lz4})
           b2 (.toByteArray baos)
           final-ds (arrow/stream->dataset b2)]
       (is (= (vec (dataset "col1"))
              (vec (final-ds "col1"))))))))
|
||||
|
||||
|
||||
(deftest nullcol
  ;; A column that is entirely null must report every row index as missing.
  (let [dataset (arrow/stream->dataset "test/data/withnullcol.arrow")
        null-col (dataset "nullcol")]
    (is (= (vec (range (ds/row-count dataset)))
           (vec (ds/missing null-col))))))
|
||||
|
||||
(deftest list-datatypes-read-only
  ;; Arrow list columns read back as nested sequences of their element values.
  (let [list-ds (ds/->dataset "test/data/arrow_list.arrow")
        expected [["dog" "car"]
                  ["dog" "flower"]
                  ["car" "flower"]]]
    (is (= expected
           (mapv vec (list-ds "class-name"))))))
|
||||
|
||||
(deftest empty-array-dataset
  ;; A stream with no record batches yields nil rather than an empty dataset.
  (is (nil? (arrow/stream->dataset "test/data/empty.arrow"))))
|
||||
+28
@@ -0,0 +1,28 @@
|
||||
(ns tech.v3.libs.csv-test
|
||||
(:require [clojure.test :refer [deftest is testing]]
|
||||
[tech.v3.dataset :as ds]
|
||||
[tech.v3.dataset.io.csv :as csv-parse]))
|
||||
|
||||
;; CSV fixture whose header row contains repeated column names.
(def duplicate-headers-file "test/data/duplicate-headers.csv")
|
||||
|
||||
;; Header de-duplication for CSV parsing: with :ensure-unique-column-names?
;; duplicate headers get unique names; without it parsing throws; a custom
;; :unique-column-name-fn controls the renaming scheme.
(deftest ensure-unique-headers-test
  (testing "that all headers are forced to be unique"
    ;; NOTE(review): these asserts were previously written as (is expr 7),
    ;; which treats 7 as the *failure message* and only checks that expr is
    ;; truthy - so the counts were never actually verified.  They now assert
    ;; equality with the intended count of 7 (TODO confirm against fixture).
    (let [ds (ds/->dataset duplicate-headers-file
                           {:ensure-unique-column-names? true})]
      (is (= 7 (ds/column-count ds)))
      (is (= 7 (count (set (ds/column-names ds))))))
    (let [ds (csv-parse/csv->dataset duplicate-headers-file
                                     {:ensure-unique-column-names? true})]
      (is (= 7 (ds/column-count ds)))
      (is (= 7 (count (set (ds/column-names ds)))))))

  (testing "that exception is thrown on duplicate headers"
    (is (thrown? RuntimeException (ds/->dataset duplicate-headers-file))))

  (testing "that custom postfix-fn works correctly"
    (let [ds (ds/->dataset duplicate-headers-file
                           {:ensure-unique-column-names? true
                            :unique-column-name-fn (fn [col-idx colname]
                                                     (str colname "::" col-idx))})]
      (is (some? (ds/column ds "column::2")))
      (is (some? (ds/column ds "column::4")))
      (is (some? (ds/column ds "column-1::6"))))))
|
||||
@@ -0,0 +1,99 @@
|
||||
(ns tech.v3.libs.fastexcel-test
|
||||
(:require [tech.v3.libs.fastexcel :as xlsx-parse]
|
||||
[tech.v3.dataset :as ds]
|
||||
[tech.v3.datatype :as dtype]
|
||||
[clojure.test :refer [deftest is testing]]))
|
||||
|
||||
;; Shared spreadsheet fixtures used by the tests in this namespace.
(def xls-file "test/data/file_example_XLS_1000.xls")
(def xlsx-file "test/data/file_example_XLSX_1000.xlsx")
(def sparse-file "test/data/sparsefile.xlsx")
(def stocks-file "test/data/stocks.xlsx")
;; Stocks workbook containing an unparseable value in the date column.
(def stocks-bad-date-file "test/data/stocks-bad-date.xlsx")
;; Workbook whose header row contains repeated column names.
(def duplicate-headers-file "test/data/duplicate-headers.xlsx")
|
||||
|
||||
|
||||
|
||||
(deftest happy-path-parse-test
  ;; The first sheet of the example workbook parses with the expected column
  ;; names, datatypes, and dimensions.
  (let [sheet (first (xlsx-parse/workbook->datasets xlsx-file))
        expected-names #{"column-0" "Age" "Country" "First Name"
                         "Gender" "Date" "Last Name" "Id"}]
    (is (= expected-names (set (ds/column-names sheet))))
    (is (= #{:float64 :string}
           (set (map dtype/get-datatype (ds/columns sheet)))))
    (is (= 1000 (ds/row-count sheet)))
    (is (= 8 (ds/column-count sheet)))))
|
||||
|
||||
|
||||
|
||||
(deftest sparse-file-parse-test
  ;; A mostly-empty sheet still parses as 8x8; the sampled columns report all
  ;; eight rows missing, and only the populated cells survive drop-missing.
  (let [sheet (first (xlsx-parse/workbook->datasets sparse-file))]
    (is (= 8 (ds/row-count sheet)))
    (is (= 8 (ds/column-count sheet)))
    (is (every? #(= (set (range 8)) %)
                (map (comp set ds/missing sheet) ["column-0" "a" "column-6"])))
    (is (= [1.0 1.0 1.0 "a" 2.0 23.0]
           (vec (mapcat (comp dtype/->reader ds/drop-missing)
                        (ds/columns sheet)))))))
|
||||
|
||||
(deftest datetime-test
  ;; An explicit parser-fn entry forces the date column to packed-local-date.
  (let [sheet (first (xlsx-parse/workbook->datasets
                      stocks-file
                      {:parser-fn {"date" :packed-local-date}}))]
    (is (= :packed-local-date (dtype/get-datatype (sheet "date"))))))
|
||||
|
||||
|
||||
(deftest bad-datetime-test
  ;; When date values cannot be parsed the column falls back to strings for
  ;; every one of its 29 rows.
  (let [sheet (first (xlsx-parse/workbook->datasets stocks-bad-date-file))
        date-col (sheet "date")]
    (is (= :string (dtype/get-datatype date-col)))
    (is (= {java.lang.String 29}
           (frequencies (map type date-col))))))
|
||||
|
||||
|
||||
(deftest skip-rows-test
  ;; Skipping 4 preamble rows exposes the real header row of this report.
  (let [sheet (ds/->dataset "test/data/holdings-daily-us-en-mdy.xlsx"
                            {:n-initial-skip-rows 4
                             :parser-fn {"Identifier" :string
                                         "Weight" :float64}})
        expected-names ["Name"
                        "Ticker"
                        "Identifier"
                        "SEDOL"
                        "Weight"
                        "Sector"
                        "Shares Held"
                        "Local Currency"
                        "column-8"]]
    ;;column-8 had no data
    (is (= #{:float64 :string :boolean}
           (set (map dtype/get-datatype (vals sheet)))))
    (is (= expected-names (vec (ds/column-names sheet))))))
|
||||
|
||||
;; Header de-duplication for xlsx parsing via both the generic ds/->dataset
;; entry point and the fastexcel workbook->datasets path.
(deftest ensure-unique-headers-test
  (testing "that all headers are forced to be unique"
    ;; NOTE(review): these asserts were previously written as (is expr 7),
    ;; which treats 7 as the *failure message* and only checks that expr is
    ;; truthy - so the counts were never actually verified.  They now assert
    ;; equality with the intended count of 7 (TODO confirm against fixture).
    (let [ds (ds/->dataset duplicate-headers-file
                           {:ensure-unique-column-names? true})]
      (is (= 7 (ds/column-count ds)))
      (is (= 7 (count (set (ds/column-names ds))))))
    (let [ds (first (xlsx-parse/workbook->datasets duplicate-headers-file
                                                   {:ensure-unique-column-names? true}))]
      (is (= 7 (ds/column-count ds)))
      (is (= 7 (count (set (ds/column-names ds)))))))

  (testing "that exception is thrown on duplicate headers"
    (is (thrown? RuntimeException (ds/->dataset duplicate-headers-file)))
    (is (thrown? RuntimeException (xlsx-parse/workbook->datasets duplicate-headers-file))))

  (testing "that custom postfix-fn works correctly"
    (let [ds (ds/->dataset duplicate-headers-file
                           {:ensure-unique-column-names? true
                            :unique-column-name-fn (fn [col-idx colname]
                                                     (str colname "::" col-idx))})]
      (is (some? (ds/column ds "column::2")))
      (is (some? (ds/column ds "column::4")))
      (is (some? (ds/column ds "column-1::6"))))))
|
||||
|
||||
|
||||
(deftest number-colname
  ;; Numeric header cells stay numeric - the first column name is 0.0.
  (let [sheet (ds/->dataset "test/data/number_column.xlsx")]
    (is (= 0.0 (first (ds/column-names sheet))))))
|
||||
+124
@@ -0,0 +1,124 @@
|
||||
(ns tech.v3.libs.parquet-test
|
||||
(:require [tech.v3.dataset :as ds]
|
||||
[tech.v3.datatype :as dtype]
|
||||
[tech.v3.datatype.functional :as dfn]
|
||||
[tech.v3.libs.parquet :as parquet]
|
||||
[tech.v3.dataset.utils :as ds-utils]
|
||||
[tech.v3.dataset.column :as ds-col]
|
||||
[tech.v3.datatype.datetime :as dtype-dt]
|
||||
[clojure.test :refer [deftest is]]))
|
||||
|
||||
;; Quiet slf4j logging to :info for the whole test namespace (parquet/hadoop
;; readers log verbosely by default).
(ds-utils/set-slf4j-log-level :info)
|
||||
|
||||
|
||||
;; Round-trip the stocks CSV through parquet; symbol, price, and date columns
;; must all survive.  The temp parquet file is always deleted.
(deftest stocks-test
  (try
    (let [stocks (ds/->dataset "test/data/stocks.csv")
          _ (ds/write! stocks "stocks.parquet")
          stocks-p (ds/->dataset "stocks.parquet")]
      ;; mapv str normalizes whatever string-like element type the parquet
      ;; reader returns before comparing.
      (is (= (vec (stocks "symbol"))
             (mapv str (stocks-p "symbol"))))
      (is (dfn/equals (stocks "price")
                      (stocks-p "price")))
      (is (= (vec (stocks "date"))
             (vec (stocks-p "date")))))
    (finally
      (.delete (java.io.File. "stocks.parquet")))))
|
||||
|
||||
|
||||
;; Round-trip userdata1 through parquet and then through nippy, verifying the
;; timestamp and string columns survive both serialization formats.
(deftest userdata1-test
  (try
    (let [testd (ds/->dataset "test/data/parquet/userdata1.parquet")
          _ (ds/write! testd "userdata1.parquet")
          newd (ds/->dataset "userdata1.parquet")
          _ (ds/write! newd "userdata1.nippy")
          nippy-d (ds/->dataset "userdata1.nippy")]
      (is (= (vec (testd "registration_dttm"))
             (vec (newd "registration_dttm"))))
      (is (= (vec (testd "comments"))
             (vec (newd "comments"))))
      (is (= (vec (testd "comments"))
             (vec (nippy-d "comments")))))

    (finally
      (.delete (java.io.File. "userdata1.parquet"))
      (.delete (java.io.File. "userdata1.nippy")))))
|
||||
|
||||
|
||||
(deftest whitelist-test
  ;; :column-whitelist restricts parsing to exactly the named columns.
  (let [whitelisted (ds/->dataset "test/data/parquet/userdata1.parquet"
                                  {:column-whitelist ["first_name" "last_name" "gender"]})]
    (is (= 3 (ds/column-count whitelisted)))))
|
||||
|
||||
|
||||
;; Round-trip the ames dataset through parquet; missing sets, string columns,
;; and numeric columns must all be preserved.
(deftest ames-ds
  (try
    (let [ames (ds/->dataset "test/data/ames-house-prices/train.csv")
          _ (ds/write! ames "ames.parquet")
          newd (ds/->dataset "ames.parquet")]
      (is (= (ds/missing (ames "LotFrontage"))
             (ds/missing (newd "LotFrontage"))))
      (is (= (vec (ames "CentralAir"))
             (vec (newd "CentralAir"))))
      (is (dfn/equals (ames "SalePrice") (newd "SalePrice"))))
    (finally
      (.delete (java.io.File. "ames.parquet")))))
|
||||
|
||||
|
||||
;; UUID columns keep the :uuid datatype across a parquet write/read cycle when
;; the reader is given the same :uuid parser-fn.
(deftest uuid-test
  (try
    (let [uuid-ds (ds/->dataset "test/data/uuid.parquet"
                                {:parser-fn {"uuids" :uuid}})
          _ (ds/write! uuid-ds "test-uuid.parquet")
          new-ds (ds/->dataset "test-uuid.parquet"
                               {:parser-fn {"uuids" :uuid}})]
      (is (= :uuid ((comp :datatype meta) (uuid-ds "uuids"))))
      (is (= :uuid ((comp :datatype meta) (new-ds "uuids")))))
    (finally
      (.delete (java.io.File. "test-uuid.parquet")))))
|
||||
|
||||
|
||||
;; Missing entries in an unsigned-byte column must survive a parquet round trip.
(deftest missing-uint8-data
  ;;Use a large enough value (245 > 127) that the system is forced to use uint8
  ;;columns else it will default to int8 columns based on the column data min/max
  (let [ds (ds/->dataset {:a (dtype/make-container :uint8 [10 20 245])})
        ;; set-missing receives [1 5] but the column only has 3 rows, and the
        ;; assertions below expect missing == [1]; presumably out-of-range
        ;; indices are dropped - TODO confirm against ds-col/set-missing.
        ds (ds/update-column ds :a #(ds-col/set-missing % [1 5]))]
    (try
      (parquet/ds->parquet ds "test.parquet")
      (let [nds (ds/->dataset "test.parquet" {:key-fn keyword})]
        (is (= 3 (ds/row-count nds)))
        (is (= [1] (vec (dtype/->reader (ds/missing ds)))))
        (is (= :uint8 (dtype/elemwise-datatype (ds :a))))
        (is (= :uint8 (dtype/elemwise-datatype (nds :a))))
        (is (= [1] (vec (dtype/->reader (ds/missing nds))))))
      (finally
        (.delete (java.io.File. "test.parquet"))))))
|
||||
|
||||
|
||||
(deftest nested-parquet
  ;; Nested map columns flatten to dotted key_value paths, with nil where a
  ;; row does not contribute a value.
  (let [nested (ds/->dataset "test/data/nested.parquet")]
    (is (= [1 nil 2 nil 3 nil nil]
           (vec (nested "id"))))
    (is (= ["a" "b" "a" "b" "a" "b" "c"]
           (vec (nested "val.key_value.key"))))
    (is (= ["va" "vb" nil nil "vb" nil nil]
           (vec (nested "val2.key_value.key"))))))
|
||||
|
||||
|
||||
;; LocalTime columns must round-trip through parquet unchanged.
(deftest local-time
  (try
    (let [ds (ds/->dataset {:a (range 10)
                            :b (repeat 10 (java.time.LocalTime/now))})
          _ (parquet/ds->parquet ds "test.parquet")
          pds (ds/->dataset "test.parquet" {:key-fn keyword})]
      (is (= (vec (ds :b))
             (vec (pds :b)))))
    (finally
      (.delete (java.io.File. "test.parquet")))))
|
||||
|
||||
|
||||
(deftest decimaltable
  ;; Parquet decimal values read back as doubles matching the source data.
  (let [decimal-ds (ds/->dataset "test/data/decimaltable.parquet")]
    (is (dfn/equals [3.420 1.246] (decimal-ds "decimals")))))
|
||||
|
||||
|
||||
;; Regression for issue 401 - this file must parse with all four columns.
;; (The deftest name keeps its original "paruet" typo so any external
;; references to the test var still resolve.)
(deftest issue-401-paruet-missing-column
  (let [parsed (ds/->dataset "test/data/2024-03-03.parquet")]
    (is (= 4 (ds/column-count parsed)))))
|
||||
+115
@@ -0,0 +1,115 @@
|
||||
(ns tech.v3.libs.poi-test
|
||||
(:require [tech.v3.libs.poi :as xlsx-parse]
|
||||
[tech.v3.dataset :as ds]
|
||||
[tech.v3.dataset.column :as ds-col]
|
||||
[tech.v3.datatype.functional :as dfn]
|
||||
[tech.v3.datatype :as dtype]
|
||||
[clojure.test :refer [deftest is testing]]))
|
||||
|
||||
|
||||
;; Shared spreadsheet fixtures for the poi-based parser tests.
(def xls-file "test/data/file_example_XLS_1000.xls")
(def xlsx-file "test/data/file_example_XLSX_1000.xlsx")
(def sparse-file "test/data/sparsefile.xlsx")
(def stocks-file "test/data/stocks.xlsx")
;; Legacy xls workbook whose header row contains repeated column names.
(def duplicate-headers-file "test/data/duplicate-headers.xls")
|
||||
|
||||
|
||||
(deftest happy-path-parse-test
  ;; Parse the same workbook twice and verify both parses agree on column
  ;; names, datatypes, dimensions, and numeric content.
  (let [first-parse (first (xlsx-parse/workbook->datasets xlsx-file))
        second-parse (first (xlsx-parse/workbook->datasets xlsx-file))
        expected-names #{"column-0" "Age" "Country" "First Name"
                         "Gender" "Date" "Last Name" "Id"}]
    (is (= expected-names (set (ds/column-names first-parse))))
    (is (= expected-names (set (ds/column-names second-parse))))
    (is (= #{:float64 :string}
           (set (map dtype/get-datatype (ds/columns first-parse)))))
    (is (= 1000 (ds/row-count first-parse)))
    (is (= 1000 (ds/row-count second-parse)))
    (is (= 8 (ds/column-count first-parse)))
    (is (= 8 (ds/column-count second-parse)))
    (is (dfn/equals (first-parse "Age") (second-parse "Age")))
    (is (dfn/equals (first-parse "Id") (second-parse "Id")))))
|
||||
|
||||
|
||||
(deftest sparse-file-parse-test
  ;; Sparse sheet parses as 8x8; the sampled columns report all eight rows
  ;; missing, and only the populated cells survive drop-missing.
  (let [sheet (first (xlsx-parse/workbook->datasets sparse-file))]
    (is (= 8 (ds/row-count sheet)))
    (is (= 8 (ds/column-count sheet)))
    (is (every? #(= (set (range 8)) %)
                (map (comp set ds-col/missing sheet) ["column-0" "a" "column-6"])))
    (is (= [1.0 1.0 1.0 "a" 2.0 23.0]
           (vec (mapcat (comp dtype/->reader ds/drop-missing)
                        (ds/columns sheet)))))))
|
||||
|
||||
|
||||
(deftest datetime-test
  ;; An explicit parser-fn entry forces the date column to packed-local-date.
  (let [sheet (first (xlsx-parse/workbook->datasets
                      stocks-file
                      {:parser-fn {"date" :packed-local-date}}))]
    (is (= :packed-local-date (dtype/get-datatype (sheet "date"))))))
|
||||
|
||||
|
||||
(deftest custom-parser-test
  ;; A [datatype format-string] parser-fn entry parses the Date column with
  ;; the supplied dd/MM/yyyy pattern into (unpacked) local dates.
  (let [sheet (first (xlsx-parse/workbook->datasets
                      xls-file
                      {:parser-fn {"Date" [:local-date "dd/MM/yyyy"]}}))]
    (is (= :local-date (dtype/get-datatype (sheet "Date"))))))
|
||||
|
||||
|
||||
(deftest integer-field-test
  ;; An explicit :int64 parser-fn coerces the (otherwise float64) Id column.
  (let [sheet (first (xlsx-parse/workbook->datasets
                      xls-file
                      {:parser-fn {"Id" :int64}}))]
    (is (= :int64 (dtype/get-datatype (sheet "Id"))))))
|
||||
|
||||
|
||||
(deftest xls-keyword-colnames
  ;; :key-fn keyword is applied to header cells.  The first column header is
  ;; an integer, so keyword returns nil for it; the remaining headers become
  ;; keywords (some containing spaces, which is legal if not ideal).
  (let [sheet (first (xlsx-parse/workbook->datasets
                      xls-file
                      {:key-fn keyword}))]
    (is (every? keyword? (rest (ds/column-names sheet))))))
|
||||
|
||||
|
||||
(deftest key-fn-number-columns
  ;; In the xlsx path every header keywordizes to a non-nil name.
  (let [sheet (first (xlsx-parse/workbook->datasets xlsx-file {:key-fn keyword}))
        names (ds/column-names sheet)]
    (is (= 0 (count (filter nil? names))))
    (is (= #{:column-0 :Age :Country (keyword "First Name") :Gender :Date
             (keyword "Last Name") (keyword "Id")}
           (set names)))))
|
||||
|
||||
|
||||
(deftest auto-infer-dates
  ;; Date cells are auto-detected without an explicit parser-fn.
  (let [sheet (first (xlsx-parse/workbook->datasets "test/data/stocks-with-dates.xlsx"))]
    (is (= #{:string :packed-local-date :float64}
           (set (map (comp :datatype meta) (vals sheet)))))))
|
||||
|
||||
|
||||
;; Header de-duplication for legacy xls parsing via both ds/->dataset and the
;; poi workbook->datasets path.
(deftest ensure-unique-headers-test
  (testing "that all headers are forced to be unique"
    ;; NOTE(review): these asserts were previously written as (is expr 7),
    ;; which treats 7 as the *failure message* and only checks that expr is
    ;; truthy - so the counts were never actually verified.  They now assert
    ;; equality with the intended count of 7 (TODO confirm against fixture).
    (let [ds (ds/->dataset duplicate-headers-file
                           {:ensure-unique-column-names? true})]
      (is (= 7 (ds/column-count ds)))
      (is (= 7 (count (set (ds/column-names ds))))))
    (let [ds (first (xlsx-parse/workbook->datasets duplicate-headers-file
                                                   {:ensure-unique-column-names? true}))]
      (is (= 7 (ds/column-count ds)))
      (is (= 7 (count (set (ds/column-names ds)))))))

  (testing "that exception is thrown on duplicate headers"
    (is (thrown? RuntimeException (ds/->dataset duplicate-headers-file)))
    (is (thrown? RuntimeException (xlsx-parse/workbook->datasets duplicate-headers-file))))

  (testing "that custom postfix-fn works correctly"
    (let [ds (ds/->dataset duplicate-headers-file
                           {:ensure-unique-column-names? true
                            :unique-column-name-fn (fn [col-idx colname]
                                                     (str colname "::" col-idx))})]
      (is (some? (ds/column ds "column::2")))
      (is (some? (ds/column ds "column::4")))
      (is (some? (ds/column ds "column-1::6"))))))
|
||||
Reference in New Issue
Block a user