1896 lines
71 KiB
Clojure
Vendored
1896 lines
71 KiB
Clojure
Vendored
(ns tech.v3.dataset-test
|
|
(:require [tech.v3.datatype :as dtype]
|
|
[tech.v3.datatype.functional :as dfn]
|
|
[tech.v3.datatype.datetime :as dtype-dt]
|
|
[tech.v3.datatype.struct :as dt-struct]
|
|
[tech.v3.datatype.argops :as argops]
|
|
[tech.v3.datatype.bitmap :as bitmap]
|
|
[tech.v3.tensor :as dtt]
|
|
[tech.v3.dataset :as ds]
|
|
[tech.v3.dataset-api :as ds-api]
|
|
[tech.v3.dataset.protocols :as ds-proto]
|
|
[tech.v3.dataset.base :as ds-base]
|
|
[tech.v3.dataset.column :as ds-col]
|
|
[tech.v3.dataset.tensor :as ds-tens]
|
|
[tech.v3.dataset.string-table :as str-table]
|
|
[tech.v3.dataset.join :as ds-join]
|
|
[tech.v3.datatype.rolling :as rolling]
|
|
[tech.v3.dataset.test-utils :as test-utils]
|
|
[tech.v3.dataset.rolling :as ds-roll]
|
|
[tech.v3.dataset.column-filters :as cf]
|
|
[tech.v3.dataset.impl.column :as col-impl]
|
|
[tech.v3.dataset.print :as ds-print]
|
|
;;Loading multimethods required to load the files
|
|
[tech.v3.libs.poi]
|
|
[tech.v3.libs.fastexcel]
|
|
[tech.v3.io :as tech-io]
|
|
[taoensso.nippy :as nippy]
|
|
[clojure.test :refer [deftest is]]
|
|
[ham-fisted.api :as hamf]
|
|
[ham-fisted.lazy-noncaching :as lznc])
|
|
(:import [java.util List HashSet UUID Random BitSet Map]
|
|
[java.util.concurrent ConcurrentHashMap]
|
|
[java.time LocalDate]
|
|
[java.io File ByteArrayInputStream]
|
|
[tech.v3 TMD]
|
|
[ham_fisted IFnDef$OD IMutList]))
|
|
|
|
|
|
(deftest datatype-parser
  ;; Default parsing: the assertions below pin "bool" to :boolean while the
  ;; other truthy-looking columns ("char", "word", "boolstr", "boolean") are
  ;; expected to remain :string.
  (let [ds (ds/->dataset "test/data/datatype_parser.csv")]
    (is (= :int16 (dtype/get-datatype (ds/column ds "id"))))
    (is (= [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] (ds/column ds "id")))
    (is (= :string (dtype/get-datatype (ds/column ds "char"))))
    (is (= ["t", "f", "y", "n", "T", "F", "Y", "N", "A", "z"]
           (ds/column ds "char")))
    (is (= :string (dtype/get-datatype (ds/column ds "word"))))
    (is (= ["true", "False", "YES", "NO", "positive", "negative", "yep", "not", "pos", "neg"]
           (ds/column ds "word")))
    (is (= :boolean (dtype/get-datatype (ds/column ds "bool"))))
    (is (= [true, true, false, false, true, false, true, false, false, false]
           (ds/column ds "bool")))
    (is (= :string (dtype/get-datatype (ds/column ds "boolstr"))))
    (is (= ["true", "true", "false", "false", "true", "false", "true", "false", "False", "false"]
           (ds/column ds "boolstr")))
    (is (= :string (dtype/get-datatype (ds/column ds "boolean"))))
    (is (= ["t", "y", "n", "f", "true", "false", "positive", "negative", "negative", "negative"]
           (ds/column ds "boolean"))))
  ;; Explicit :parser-fn for "boolean" and "boolstr": both are expected to
  ;; become :boolean; the remaining columns are expected to be unchanged.
  (let [ds (ds/->dataset "test/data/datatype_parser.csv"
                         {:parser-fn {"boolean" :boolean
                                      "boolstr" :boolean}})]
    (is (= :int16 (dtype/get-datatype (ds/column ds "id"))))
    (is (= [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] (ds/column ds "id")))
    (is (= :string (dtype/get-datatype (ds/column ds "char"))))
    (is (= ["t", "f", "y", "n", "T", "F", "Y", "N", "A", "z"]
           (ds/column ds "char")))
    (is (= :string (dtype/get-datatype (ds/column ds "word"))))
    (is (= ["true", "False", "YES", "NO", "positive", "negative", "yep", "not", "pos", "neg"]
           (ds/column ds "word")))
    (is (= :boolean (dtype/get-datatype (ds/column ds "bool"))))
    ;; Check the values of "bool" itself (previously this read "boolean",
    ;; leaving the "bool" values unchecked in this configuration).
    (is (= [true, true, false, false, true, false, true, false, false, false]
           (ds/column ds "bool")))
    (is (= :boolean (dtype/get-datatype (ds/column ds "boolstr"))))
    (is (= [true, true, false, false, true, false, true, false, false, false]
           (ds/column ds "boolstr")))
    (is (= :boolean (dtype/get-datatype (ds/column ds "boolean"))))
    (is (= [true, true, false, false, true, false, true, false, false, false]
           (ds/column ds "boolean")))))
|
|
|
|
(deftest iterable
  ;; Column names read off the dataset's vals must match ds/column-names.
  (let [fruits (ds/->dataset (test-utils/mapseq-fruit-dataset))]
    (is (= (ds/column-names fruits)
           (->> (vals fruits)
                (map ds-col/column-name))))))
|
|
|
|
|
|
(deftest string-column-add-or-update
  ;; Rewrites :fruit-name as a :string column with a "-fn" suffix and checks
  ;; the first ten values.
  (let [suffix-fn (fn [v] (str (name v) "-fn"))
        fruits (ds/update-column (ds/->dataset (test-utils/mapseq-fruit-dataset))
                                 :fruit-name
                                 (fn [col] (dtype/emap suffix-fn :string col)))]
    (is (= ["apple-fn" "apple-fn" "apple-fn" "mandarin-fn" "mandarin-fn"
            "mandarin-fn" "mandarin-fn" "mandarin-fn" "apple-fn" "apple-fn"]
           (vec (take 10 (fruits :fruit-name)))))))
|
|
|
|
|
|
(deftest name-values-seq->dataset-test
  ;; double-array column alongside a vector of strings
  (is (= [{:a 0.0, :b "a"} {:a 1.0, :b "b"} {:a 2.0, :b "c"}
          {:a 3.0, :b "d"} {:a 4.0, :b "e"}]
         (ds/mapseq-reader
          (ds/->dataset {:a (double-array (range 5))
                         :b ["a" "b" "c" "d" "e"]}))))

  ;; :b is one element shorter than :a; index 4 is expected to be missing
  (is (= #{4}
         (set (ds/missing
               (ds/->dataset {:a (double-array (range 5))
                              :b ["a" "b" "c" "d"]})))))

  ;; long-array column alongside a vector of strings
  (is (= [{:a 0, :b "a"} {:a 1, :b "b"} {:a 2, :b "c"}
          {:a 3, :b "d"} {:a 4, :b "e"}]
         (ds/mapseq-reader
          (ds/->dataset {:a (long-array (range 5))
                         :b ["a" "b" "c" "d" "e"]})))))
|
|
|
|
|
|
(deftest unique-by-test
  (let [fruits (test-utils/mapseq-fruit-dataset)]
    ;; uniqueness keyed on :fruit-name
    (is (= [7 4]
           (-> fruits (ds/unique-by :fruit-name) dtype/shape)))
    (is (= [7 4]
           (-> fruits (ds/unique-by-column :fruit-name) dtype/shape)))
    (is (= #{:apple :orange :lemon :mandarin}
           (set (ds/column (ds/unique-by-column fruits :fruit-name)
                           :fruit-name))))

    ;; uniqueness keyed on :width
    (is (= [7 24]
           (-> fruits (ds/unique-by :width) dtype/shape)))
    (is (= [7 24]
           (-> fruits (ds/unique-by-column :width) dtype/shape)))
    (is (dfn/equals [5.8 5.9 6.0 6.1 6.2 6.3 6.5
                     6.7 6.8 6.9 7.0 7.1 7.2 7.3
                     7.4 7.5 7.6 7.7 7.8 8.0 8.4
                     9.0 9.2 9.6]
                    (vec (sort (ds/column (ds/unique-by-column fruits :width)
                                          :width)))))))
|
|
|
|
|
|
(deftest ds-concat-nil-pun
  ;; ds/concat is expected to ignore nil arguments and return nil when every
  ;; argument is nil.
  (let [first-ten (ds/select (ds/->dataset (test-utils/mapseq-fruit-dataset))
                             :all
                             (range 10))
        nil-first (ds/concat nil first-ten)
        nil-after (ds/concat first-ten nil nil)
        all-nil (ds/concat nil nil nil)]
    (is (= (vec (first-ten :fruit-name))
           (vec (nil-first :fruit-name))))
    (is (= (vec (first-ten :fruit-name))
           (vec (nil-after :fruit-name))))
    (is (nil? all-nil))))
|
|
|
|
|
|
(deftest ds-concat-copying-nil-pun
  ;; Mirrors ds-concat-nil-pun for ds/concat-copying: nil arguments are
  ;; expected to be ignored, and an all-nil call is expected to yield nil.
  (let [ds (-> (ds/->dataset (test-utils/mapseq-fruit-dataset))
               (ds/select :all (range 10)))
        d1 (ds/concat-copying nil ds)
        d2 (ds/concat-copying ds nil nil)
        ;; Exercise concat-copying here as well (this previously called
        ;; ds/concat, so the copying variant's all-nil case was untested).
        nothing (ds/concat-copying nil nil nil)]
    (is (= (vec (ds :fruit-name))
           (vec (d1 :fruit-name))))
    (is (= (vec (ds :fruit-name))
           (vec (d2 :fruit-name))))
    (is (nil? nothing))))
|
|
|
|
|
|
(deftest ds-concat-missing
  ;; Missing indexes of each input must carry over (offset by row count) when
  ;; a dataset is concatenated with itself.
  (let [mark-missing (fn [col] (ds-col/set-missing col [3 6]))
        base (-> (test-utils/mapseq-fruit-dataset)
                 (ds/->dataset)
                 (ds/select [:fruit-name] (range 10))
                 (ds/update-column :fruit-name mark-missing))
        doubled (ds/concat base base)]
    (is (= #{3 6 13 16}
           (set (ds-col/missing (doubled :fruit-name)))))
    (is (= [:apple :apple :apple nil :mandarin
            :mandarin nil :mandarin :apple :apple
            :apple :apple :apple nil :mandarin
            :mandarin nil :mandarin :apple :apple]
           (vec (doubled :fruit-name))))))
|
|
|
|
|
|
(deftest concat-copying-missing
  ;; Same scenario as ds-concat-missing, using ds/concat-copying.
  (let [mark-missing (fn [col] (ds-col/set-missing col [3 6]))
        base (-> (test-utils/mapseq-fruit-dataset)
                 (ds/->dataset)
                 (ds/select [:fruit-name] (range 10))
                 (ds/update-column :fruit-name mark-missing))
        doubled (ds/concat-copying base base)]
    (is (= #{3 6 13 16}
           (set (ds-col/missing (doubled :fruit-name)))))
    (is (= [:apple :apple :apple nil :mandarin
            :mandarin nil :mandarin :apple :apple
            :apple :apple :apple nil :mandarin
            :mandarin nil :mandarin :apple :apple]
           (vec (doubled :fruit-name))))))
|
|
|
|
|
|
(deftest update-column-datatype-detect
  ;; A lazy seq produced by `map` must be accepted as column data by both
  ;; update-column and add-or-update-column.
  (let [times-ten (fn [v] (* 10 v))
        fruits (ds/select (ds/->dataset (test-utils/mapseq-fruit-dataset))
                          :all
                          (range 10))
        updated (ds/update-column fruits :width
                                  (fn [col] (map times-ten col)))
        add-or-updated (ds/add-or-update-column
                        fruits :width (map times-ten (fruits :width)))
        width-answer (mapv times-ten (fruits :width))]

    (is (dfn/equals width-answer
                    (updated :width)))
    (is (dfn/equals width-answer
                    (add-or-updated :width)))))
|
|
|
|
|
|
(deftest filter-fail-regression
  ;; A four-element sub-buffer starting at offset 4 of :fruit-name.
  (let [fruits (ds/->dataset (test-utils/mapseq-fruit-dataset))]
    (is (= [:mandarin :mandarin :mandarin :mandarin]
           (-> (fruits :fruit-name)
               (dtype/sub-buffer 4 4)
               vec)))))
|
|
|
|
|
|
|
|
(deftest simple-select-test
  ;; Selecting rows 5..9 of a single column.
  (let [fruits (ds/->dataset (test-utils/mapseq-fruit-dataset))
        picked (-> (fruits :fruit-name)
                   (ds-col/select (range 5 10)))]
    (is (= [:mandarin :mandarin :mandarin :apple :apple]
           (vec picked)))
    (is (= 5 (dtype/ecount picked)))))
|
|
|
|
|
|
(deftest generic-sort-numbers
  ;; Sorting with `>` must agree with reversing the default sort.
  (let [load-fruits (fn [] (ds/->dataset (test-utils/mapseq-fruit-dataset)))
        descending (ds/sort-by-column (load-fruits) :mass >)
        ascending (ds/sort-by-column (load-fruits) :mass)]

    (is (= (vec (take 10 (descending :mass)))
           (->> (ascending :mass)
                reverse
                (take 10)
                vec)))))
|
|
|
|
|
|
(deftest selection-map
  (let [fruits (ds/->dataset (test-utils/mapseq-fruit-dataset))
        ;; old-name -> :<old-name>-selected for the first three columns
        colname-map (into {}
                          (map (fn [cname]
                                 [cname (keyword (str (name cname) "-selected"))]))
                          (take 3 (ds/column-names fruits)))

        ;;Enforce the order in the map.
        ordered-ds (ds/select-columns fruits colname-map)
        ;;ensure normal ordering rules apply, make a dataset with random column
        ;;name order
        shuffled-ds (ds/select-columns
                     (ds/select-columns fruits (shuffle (ds/column-names fruits)))
                     colname-map)
        shuffled-unordered (ds/unordered-select
                            (ds/select-columns fruits (reverse (ds/column-names fruits)))
                            colname-map
                            :all)
        colname-vals (vec (vals colname-map))]
    (is (= colname-vals
           (vec (ds/column-names ordered-ds))))
    (is (= colname-vals
           (vec (ds/column-names shuffled-ds))))
    (is (not= colname-vals
              (vec (ds/column-names shuffled-unordered))))))
|
|
|
|
|
|
(deftest boolean-double-arrays
  ;; A boolean column converted via to-double-array is expected to yield
  ;; 1.0 / 0.0 values.
  (let [bools (ds/->dataset [{:a true} {:a true} {:a false}])]
    (is (= [1.0 1.0 0.0]
           (-> (bools :a)
               ds-col/to-double-array
               vec)))))
|
|
|
|
|
|
(deftest remove-rows
  ;; Removing the first five rows must equal dropping five values.
  (let [fruits (ds/->dataset (test-utils/mapseq-fruit-dataset))
        trimmed (ds/remove-rows fruits (range 5))]
    (is (= (->> (fruits :fruit-name) (drop 5) vec)
           (vec (trimmed :fruit-name))))))
|
|
|
|
|
|
(deftest long-double-promotion
  ;; Columns mixing integer and floating point values are expected to end up
  ;; as :float64.
  (is (= #{:float64}
         (set (map dtype/get-datatype
                   (vals (ds/->dataset [{:a 1 :b (float 2.2)}
                                        {:a 1.2 :b 2}])))))))
|
|
|
|
|
|
(deftest set-missing-range
  ;; set-missing is handed the unbounded (range); the resulting missing set
  ;; is expected to cover exactly the dataset's row indexes.
  (let [mark-all (fn [col] (ds-col/set-missing col (range)))
        fruits (ds/update-column (ds/->dataset (test-utils/mapseq-fruit-dataset))
                                 :fruit-name
                                 mark-all)]
    (is (= (vec (range (ds/row-count fruits)))
           (-> (fruits :fruit-name)
               ds-col/missing
               dtype/->reader
               vec)))))
|
|
|
|
|
|
(deftest columnwise-concat
  ;; Columns :c :a :b are stacked into :column/:value; :d is carried along.
  (let [stacked (ds/columnwise-concat
                 (ds/->dataset [{:a 1 :b 2 :c 3 :d 1}
                                {:a 4 :b 5 :c 6 :d 2}])
                 [:c :a :b])]
    (is (= [:c :c :a :a :b :b]
           (vec (stacked :column))))
    (is (= [3 6 1 4 2 5]
           (vec (stacked :value))))
    (is (= [1 2 1 2 1 2]
           (vec (stacked :d))))))
|
|
|
|
|
|
(deftest default-names
  ;; A dataset loaded from a path is expected to be named after that path.
  (is (= "test/data/stocks.csv"
         (-> "test/data/stocks.csv" ds/->dataset ds/dataset-name)))
  (is (= "test/data/stocks.xlsx"
         (-> "test/data/stocks.xlsx" ds/->dataset ds/dataset-name))))
|
|
|
|
|
|
(deftest unroll
  ;; plain unroll of a column mixing vectors and a scalar
  (let [unrolled (ds/unroll-column
                  (ds/->dataset [{:a 1 :b [2 3]}
                                 {:a 2 :b [4 5]}
                                 {:a 3 :b :a}])
                  :b)]
    (is (= [1 1 2 2 3]
           (vec (unrolled :a))))
    (is (= [2 3 4 5 :a]
           (vec (unrolled :b)))))
  ;; larger input with an explicit :keyword datatype
  (let [rows (flatten (repeat 20
                              [{:a 1 :b [:a :b]}
                               {:a 2 :b [:c :d]}
                               {:a 3 :b :a}]))
        unrolled (ds/unroll-column (ds/->dataset rows) :b {:datatype :keyword})]
    (is (= (flatten (repeat 20 [1 1 2 2 3]))
           (vec (unrolled :a))))
    (is (= (flatten (repeat 20 [:a :b :c :d :a]))
           (vec (unrolled :b)))))
  ;; :indexes? true adds an :indexes column
  (let [unrolled (ds/unroll-column
                  (ds/->dataset [{:a 1 :b [2 3]}
                                 {:a 2 :b [4 5]}
                                 {:a 3 :b :a}])
                  :b
                  {:indexes? true})]
    (is (= [1 1 2 2 3]
           (vec (unrolled :a))))
    (is (= [2 3 4 5 :a]
           (vec (unrolled :b))))
    (is (= [0 1 0 1 0]
           (vec (unrolled :indexes)))))
  ;; :indexes? given a column name; one entry is a primitive int array
  (let [unrolled (ds/unroll-column
                  (ds/->dataset [{:a 1 :b (int-array [2 3])}
                                 {:a 2 :b [4 5]}
                                 {:a 3 :b :a}])
                  :b
                  {:indexes? :unroll-indexes})]
    (is (= [1 1 2 2 3]
           (vec (unrolled :a))))
    (is (= [2 3 4 5 :a]
           (vec (unrolled :b))))
    (is (= [0 1 0 1 0]
           (vec (unrolled :unroll-indexes))))))
|
|
|
|
|
|
(deftest empty-bitmap
  ;; Selecting rows by the dataset's missing set: none missing, then one.
  (let [complete (ds/->dataset [{:a 1 :b 1} {:a 2 :b 2}])]
    (is (= 0 (->> (ds/missing complete)
                  (ds/select-rows complete)
                  ds/row-count))))
  (let [partial-ds (ds/->dataset [{:a 1 :b 1} {:b 2}])]
    (is (= 1 (->> (ds/missing partial-ds)
                  (ds/select-rows partial-ds)
                  ds/row-count)))))
|
|
|
|
|
|
(deftest concat-columns-widening
  ;; ds/concat over datasets with differing numeric column types, in both
  ;; argument orders.
  (let [dtypes-of (fn [d] (set (map dtype/get-datatype (vals d))))
        left (ds/->dataset [{:a (int 1) :b (float 1)}])
        right (ds/->dataset [{:a (byte 2) :b 2}])
        left-right (ds/concat left right)
        right-left (ds/concat right left)]
    (is (= #{:int64 :float64}
           (dtypes-of left-right)))
    (is (= #{:int64 :float64}
           (dtypes-of right-left))))
  ;; same, with a missing :a value in the first dataset
  (let [dtypes-of (fn [d] (set (map dtype/get-datatype (vals d))))
        left (ds/->dataset [{:a (int 1) :b (float 1)}
                            {:b (float 2)}])
        right (ds/->dataset [{:a (byte 2) :b 2}])
        left-right (ds/concat left right)
        right-left (ds/concat right left)]
    (is (= #{:int64 :float64}
           (dtypes-of left-right)))
    (is (= #{:int64 :float64}
           (dtypes-of right-left)))
    (is (= [1 nil 2]
           (vec (left-right :a))))))
|
|
|
|
|
|
(deftest concat-copying-columns-widening
  ;; ds/concat-copying over datasets with differing numeric column types, in
  ;; both argument orders.  The first case previously called ds/concat, so it
  ;; duplicated concat-columns-widening instead of testing the copying path.
  (let [ds (ds/->dataset [{:a (int 1) :b (float 1)}])
        ds2 (ds/->dataset [{:a (byte 2) :b 2}])
        cds1 (ds/concat-copying ds ds2)
        cds2 (ds/concat-copying ds2 ds)]
    (is (= #{:int64 :float64}
           (set (map dtype/get-datatype (vals cds1)))))
    (is (= #{:int64 :float64}
           (set (map dtype/get-datatype (vals cds2))))))
  ;; Same inputs, with a missing :a value in the first dataset.
  (let [ds (ds/->dataset [{:a (int 1) :b (float 1)}
                          {:b (float 2)}])
        ds2 (ds/->dataset [{:a (byte 2) :b 2}])
        cds1 (ds/concat-copying ds ds2)
        cds2 (ds/concat-copying ds2 ds)]
    (is (= #{:int64 :float64}
           (set (map dtype/get-datatype (vals cds1)))))
    (is (= #{:int64 :float64}
           (set (map dtype/get-datatype (vals cds2)))))
    (is (= [1 nil 2]
           (vec (cds1 :a))))))
|
|
|
|
|
|
(deftest concat-columns-various-datatypes
  ;; packed-local-date columns must keep their datatype through ds/concat
  (let [stocks (ds/->dataset "test/data/stocks.csv")
        head-rows (ds/select-rows stocks (range 10))
        next-rows (ds/select-rows stocks (range 10 20))
        joined (ds/concat head-rows next-rows)]
    (is (= :packed-local-date
           (dtype/get-datatype (joined "date")))))
  ;; string columns must keep their datatype through ds/concat
  (let [single (ds/->dataset [{:a "a" :b 0}])
        joined (ds/concat single single)]
    (is (= :string (dtype/get-datatype (joined :a))))))
|
|
|
|
|
|
(deftest concat-copying-columns-various-datatypes
  ;; packed-local-date columns are expected to keep their datatype through
  ;; ds/concat-copying.  This case previously called ds/concat, so the copying
  ;; path was not exercised for date columns.
  (let [stocks (ds/->dataset "test/data/stocks.csv")
        ds1 (ds/select-rows stocks (range 10))
        ds2 (ds/select-rows stocks (range 10 20))
        res (ds/concat-copying ds1 ds2)]
    (is (= :packed-local-date
           (dtype/get-datatype (res "date")))))
  ;; string columns are expected to keep their datatype as well.
  (let [ds (ds/->dataset [{:a "a" :b 0}])
        res (ds/concat-copying ds ds)]
    (is (= :string (dtype/get-datatype (res :a))))))
|
|
|
|
|
|
(deftest set-datatype-lose-missing
  ;; Changing a column's datatype must not drop its missing set.
  (let [as-int32 (fn [col] (dtype/set-datatype col :int32))
        converted (ds/update-column (ds/->dataset [{:a 1 :b 1} {:b 2}])
                                    :a
                                    as-int32)]
    (is (== 1 (-> (converted :a) ds-col/missing dtype/ecount)))
    (is (= :int32 (dtype/get-datatype (converted :a))))
    (is (= [1 nil]
           (vec (converted :a))))))
|
|
|
|
|
|
(deftest set-datatype-with-new-column
  ;; Rebuilds column :a as :int32 via new-column, passing the original
  ;; missing set through explicitly.
  (let [rebuild (fn [col]
                  (ds-col/new-column (ds-col/column-name col)
                                     (dtype/emap int :int32 col)
                                     {}
                                     (ds-col/missing col)))
        converted (ds/update-column (ds/->dataset [{:a 1 :b 1} {:b 2}])
                                    :a
                                    rebuild)]
    (is (== 1 (-> (converted :a) ds-col/missing dtype/ecount)))
    (is (= :int32 (dtype/get-datatype (converted :a))))
    (is (= [1 nil]
           (vec (converted :a))))))
|
|
|
|
|
|
(deftest typed-column-map
  ;; emap with a primitive-hinted fn and a nil datatype is expected to
  ;; produce a :float64 column.
  (let [negate-below-two (fn ^double [^double in]
                           (if (< in 2.0) (- in) in))
        mapped (ds/update-column (ds/->dataset [{:a 1.0} {:a 2.0}])
                                 :a
                                 (fn [col] (dtype/emap negate-below-two nil col)))]
    (is (= :float64 (dtype/get-datatype (mapped :a))))
    (is (= [-1.0 2.0]
           (vec (mapped :a))))))
|
|
|
|
|
|
(deftest typed-column-map-missing
  ;; map fn returns nil when either input is nil; no datatype given
  (let [summed (ds/bind-> (ds/->dataset [{:a 1} {:b 2.0} {:a 2 :b 3.0}]) d
                 (assoc :a (ds-col/column-map
                            (fn [x y]
                              (when (and x y)
                                (+ (double x) (double y))))
                            nil
                            (d :a) (d :b))))]
    (is (= :float64 (dtype/get-datatype (summed :a))))
    (is (= [false false true]
           (vec (dfn/finite? (summed :a)))))
    (is (= #{0 1}
           (set (ds/missing (summed :a))))))

  ;; map fn returns NaN when either input is nil; explicit :float64
  (let [summed (ds/bind-> (ds/->dataset [{:a 1} {:b 2.0} {:a 2 :b 3.0}]) d
                 (assoc :a (ds-col/column-map
                            (fn [x y]
                              (if (and x y)
                                (+ (double x) (double y))
                                Double/NaN))
                            :float64
                            (d :a) (d :b))))]
    (is (= :float64 (dtype/get-datatype (summed :a))))
    (is (= [false false true]
           (vec (dfn/finite? (summed :a))))))
  ;; Never remove these tests.  Actual users are relying on this behavior to simplify
  ;; their processing chains.
  (let [summed (ds/bind-> (ds/->dataset [{:a 1} {:b 2.0} {:a 2 :b 3.0}]) d
                 (assoc :a (ds-col/column-map
                            (fn [^double x ^double y]
                              (+ (double x) (double y)))
                            {:missing-fn ds-col/union-missing-sets
                             :datatype :float64}
                            (d :a) (d :b))))]
    (is (= :float64 (dtype/get-datatype (summed :a))))
    (is (= [false false true]
           (vec (dfn/finite? (summed :a))))))
  ;; macro form; the body refers to columns by symbols a and b
  (let [summed (ds/column-map-m (ds/->dataset [{:a 1} {:b 2.0} {:a 2 :b 3.0}])
                                :a [:a :b]
                                (when (and a b)
                                  (+ (double a) (double b))))]
    (is (= :float64 (dtype/get-datatype (summed :a))))
    (is (= [false false true]
           (vec (dfn/finite? (summed :a)))))
    (is (= #{0 1}
           (set (ds/missing (summed :a))))))
  ;; macro form with a dotted column name; the body uses the symbol a-a
  (let [summed (ds/column-map-m (ds/->dataset [{:a.a 1} {:b 2.0} {:a.a 2 :b 3.0}])
                                :a [:a.a :b]
                                (when (and a-a b)
                                  (+ (double a-a) (double b))))]
    (is (= :float64 (dtype/get-datatype (summed :a))))
    (is (= [false false true]
           (vec (dfn/finite? (summed :a)))))
    (is (= #{0 1}
           (set (ds/missing (summed :a)))))))
|
|
|
|
|
|
(deftest mean-object-column
  ;; A lazy seq of longs added to an empty dataset is expected to be stored
  ;; as :int64 and be usable with dfn/mean.
  (let [doubled (map (fn [n] (* 2 n)) (range 9))
        evens (ds/add-or-update-column (ds/->dataset []) :a doubled)]
    (is (= :int64 (dtype/get-datatype (evens :a))))
    (is (= 8.0 (dfn/mean (evens :a))))))
|
|
|
|
|
|
(deftest column-cast-test
  (let [stocks (ds/->dataset "test/data/stocks.csv" {:key-fn keyword})
        round-price (fn [v] (Math/round (double v)))
        price-dtype (dtype/get-datatype (stocks :price))
        ;; price -> :string -> original datatype round trip
        _ (is (dfn/equals (stocks :price)
                          (ds/column
                           (ds/column-cast (ds/column-cast stocks :price :string)
                                           :price price-dtype)
                           :price)))
        date-dtype (dtype/get-datatype (stocks :date))
        ;; date -> :string -> original datatype round trip, compared in ms
        _ (is (dfn/equals (dtype-dt/datetime->milliseconds (stocks :date))
                          (dtype-dt/datetime->milliseconds
                           (ds/column
                            (ds/column-cast (ds/column-cast stocks :date :string)
                                            :date date-dtype)
                            :date))))]
    ;;Custom cast fn
    (is (= [40 36 43 28 25]
           (vec (take 5 (ds/column
                         (ds/column-cast stocks :price [:int32 round-price])
                         :price)))))
    ;; without :track-parse-errors no :unparsed-indexes metadata is expected
    (is (nil? (:unparsed-indexes
               (meta (ds/column
                      (ds/column-cast stocks :price [:int32 round-price])
                      :price)))))
    ;; with :track-parse-errors the metadata key is expected to be present
    (is (not
         (nil? (:unparsed-indexes
                (meta (ds/column
                       (ds/column-cast stocks :price [:int32 round-price]
                                       {:track-parse-errors true})
                       :price))))))))
|
|
|
|
|
|
(deftest column-clone-double-read
  ;; The reader throws if any index is read twice; cloning it and assoc'ing
  ;; the clone into the dataset must therefore read each index at most once.
  (let [stocks (ds/->dataset "test/data/stocks.csv"
                             {:key-fn keyword})
        seen (HashSet.)
        guarded-reader (dtype/make-reader
                        :boolean
                        (ds/row-count stocks)
                        (do
                          (locking seen
                            (when (.contains seen idx)
                              (throw (Exception. "Double read!!")))
                            (.add seen idx))
                          true))
        with-flags (assoc stocks :price-2 (dtype/clone guarded-reader))]
    (is (= [true true true true true]
           (->> (with-flags :price-2)
                (take 5)
                vec)))))
|
|
|
|
|
|
(deftest stats-with-missing
  ;; The mean of columns containing missing values must still be positive.
  (let [sparse (ds/->dataset {:a [nil nil nil 1 2 nil 3
                                  4 nil nil nil 11 nil]
                              :b [nil 2 2 2 2 3 nil 3 nil
                                  3 nil 4 nil]})]
    (is (> (-> (sparse :a) (ds-col/stats #{:mean}) :mean) 0.0))
    (is (> (-> (sparse :b) (ds-col/stats #{:mean}) :mean) 0.0))))
|
|
|
|
|
|
(deftest uuids-test
  (let [ids (repeatedly 5 #(UUID/randomUUID))
        rows (map-indexed (fn [i id] {:a id :b id :c i}) ids)
        id-ds (ds/->dataset rows)]
    (is (= :uuid (dtype/get-datatype (id-ds :a))))
    (is (= :uuid (dtype/get-datatype (id-ds :b))))
    (is (= (vec ids)
           (vec (id-ds :a))))
    ;; CSV round trip through a uniquely named temp file, always deleted
    (let [csv-path (str (UUID/randomUUID) ".csv")
          _ (ds/write! id-ds csv-path)
          reloaded (try
                     (ds/->dataset csv-path {:key-fn keyword})
                     (finally
                       (.delete (File. csv-path))))]
      (is (= (vec (id-ds :a))
             (vec (reloaded :a)))))))
|
|
|
|
|
|
(deftest filter-empty
  ;; Filtering out every row must keep the columns and still print.
  (let [source (ds/->dataset {:V1 (take 9 (cycle [1 2]))
                              :V2 (range 1 10)
                              :V3 (take 9 (cycle [0.5 1.0 1.5]))
                              :V4 (take 9 (cycle [\A \B \C]))})
        emptied (ds/filter source (constantly false))]
    (is (= 0 (ds/row-count emptied)))
    (is (= (ds/column-count source)
           (ds/column-count emptied)))
    (is (string? (.toString ^Object emptied)))))
|
|
|
|
|
|
(deftest nil-mapseq-values
  ;; Both an explicit nil and an absent key are expected to count as missing.
  (let [sparse (ds/->dataset [{:a nil} {:a 1} {}])]
    (is (= #{0 2}
           (set (ds/missing sparse))))
    (is (= [nil 1 nil]
           (-> (sparse :a) dtype/->reader vec)))))
|
|
|
|
|
|
(deftest select-row
  ;; select-rows accepts both a scalar index and a vector of indexes.
  (let [table (ds/->dataset {:V1 (take 9 (cycle [1 2]))
                             :V2 (range 1 10)
                             :V3 (take 9 (cycle [0.5 1.0 1.5]))
                             :V4 (take 9 (cycle [\A \B \C]))})
        first-row (fn [selected] (vec (first (ds/value-reader selected))))]

    (is (= [2 6 1.5 \C]
           (first-row (ds/select-rows table 5))))

    (is (= [2 6 1.5 \C]
           (first-row (ds/select-rows table [5]))))))
|
|
|
|
(deftest select-by-index
  ;; Positive and negative indexes, scalar or vector, must select the same
  ;; columns and rows.
  (let [table (ds/->dataset {:V1 (take 9 (cycle [1 2]))
                             :V2 (range 1 10)
                             :V3 (take 9 (cycle [0.5 1.0 1.5]))
                             :V4 (take 9 (cycle [\A \B \C]))})
        first-row (fn [selected] (vec (first (ds/value-reader selected))))
        v4-of (fn [selected] (vec (selected :V4)))]

    (is (= [1 \A]
           (first-row (ds/select-by-index table [0 3] [0 8]))
           (first-row (ds/select-by-index table [-4 -1] [-9 -1]))))

    (is (= [\C]
           (first-row (ds/select-by-index table 3 8))
           (first-row (ds/select-by-index table -1 -1))
           (first-row (ds/select-by-index table [3] [8]))
           (first-row (ds/select-by-index table [-1] [-1]))))

    (is (= [\A \B \C \A \B \C \A \B \C]
           (v4-of (ds/select-columns-by-index table 3))
           (v4-of (ds/select-columns-by-index table [3]))
           (v4-of (ds/select-columns-by-index table -1))
           (v4-of (ds/select-columns-by-index table [-1]))))

    (is (= [2 6 1.5 \C]
           (first-row (ds/select-rows table -4))
           (first-row (ds/select-rows table [-4]))))))
|
|
|
|
(deftest columns-named-false
  ;; `false` as a column name from construction
  (let [falsy (ds/->dataset [{false 1} {false 2}])]
    (is (= [1 2]
           (vec (falsy false)))))
  ;; `false` as a rename target
  (let [plain (ds/->dataset [{:a 1} {:a 2}])]
    (is (= [1 2]
           (vec (ds/column (ds/rename-columns plain {:a false})
                           false)))))
  ;; `false` as a select-columns rename target
  (let [plain (ds/->dataset [{:a 1} {:a 2}])]
    (is (= [1 2]
           (vec (ds/column (ds/select-columns plain {:a false})
                           false))))))
|
|
|
|
(deftest positional-column-rename
  ;; rename-columns with a sequence renames by position; a sequence of the
  ;; wrong length or a set must throw.
  (let [csv-stream (ByteArrayInputStream.
                    (.getBytes "id,a,ab\n0,aa,bb\n1,cc,dd"))
        parsed (ds/->dataset csv-stream {:file-type :csv})
        too-few [:a1 :a2]
        full-names [:id :a1 :a2]]
    (is (= full-names
           (ds/column-names (ds/rename-columns parsed full-names))))
    (is (thrown? Throwable
                 (ds/rename-columns parsed too-few)))
    (is (thrown? Throwable
                 (ds/rename-columns parsed (set full-names))))))
|
|
|
|
(deftest column-sequences-use-nil-missing
  ;; Missing entries must surface as nil when a column is turned into a vec.
  (let [sparse (ds/->dataset [{:a 1} {:b 2}])]
    (is (= [1 nil]
           (vec (sparse :a))))
    (is (= [nil 2]
           (vec (sparse :b))))))
|
|
|
|
|
|
(deftest ->dataset-nvs-parse-test
  ;; A map of column name -> values builds a dataset column by column.
  (let [parsed (ds/->dataset {:a [1 2 3]
                              :b [4 5 6]})]
    (is (= [1 2 3] (vec (parsed :a))))
    (is (= [4 5 6] (vec (parsed :b))))))
|
|
|
|
|
|
(deftest apply-works-with-columns-and-vectors
  ;; Both a column and its reader must be invokable through `apply`.
  (let [table (ds/->dataset {:a [1 2 3]
                             :b [4 5 6]})
        col-a (table :a)
        reader-a (dtype/->reader col-a)]
    (is (= 2 (apply col-a [1])))
    (is (= 2 (apply reader-a [1])))))
|
|
|
|
|
|
(deftest vector-of-test
  ;; Primitive `vector-of` inputs must keep their element type, and a clone
  ;; of the dataset must be convertible to arrays.
  (let [prims (ds/->dataset {:a (vector-of :float 1 2 3 4)
                             :b (vector-of :short 1 2 3 4)})]
    (is (= #{:float32 :int16}
           (set (map dtype/get-datatype (vals prims)))))
    (let [cloned (dtype/clone prims)]
      (is (every? some?
                  (map dtype/->array (vals cloned)))))))
|
|
|
|
|
|
(deftest serialize-datetime
  ;; Round-trips the stocks dataset through a gzipped TSV and compares row
  ;; count, column count and the set of column datatypes.
  (let [fname "test.tsv.gz"
        ds (ds/->dataset "test/data/stocks.csv")]
    (try
      (ds/write! ds fname)
      (let [save-ds (ds/->dataset fname)]
        (is (= (ds/row-count ds) (ds/row-count save-ds)))
        (is (= (ds/column-count ds) (ds/column-count save-ds)))
        ;; Compare the datatypes of the columns themselves, obtained with
        ;; `vals` as the other tests in this file do.
        (is (= (set (map dtype/get-datatype (vals ds)))
               (set (map dtype/get-datatype (vals save-ds))))))
      (finally
        ;; Remove the temp file even when writing or re-reading throws, so a
        ;; failed run does not leave test.tsv.gz behind.
        (let [fdata (java.io.File. fname)]
          (when (.exists fdata)
            (.delete fdata)))))))
|
|
|
|
|
|
(deftest custom-packed-local-date-parser
  ;; Parses "date" with an explicit format string; all 560 rows must load.
  (let [date-parser [:packed-local-date "MMM d yyyy"]
        stocks (ds/->dataset "test/data/stocks.csv"
                             {:parser-fn {"date" date-parser}})]
    (is (= 560 (ds/row-count stocks)))))
|
|
|
|
|
|
(deftest stocks-to-from-nippy
  ;; Nippy round trip through a uniquely named file that is always removed.
  (let [nippy-path (str (java.util.UUID/randomUUID) ".nippy")]
    (try
      (let [original (ds/->dataset "test/data/stocks.csv")
            _ (tech-io/put-nippy! nippy-path original)
            restored (tech-io/get-nippy nippy-path)]
        (is (= (ds/row-count original) (ds/row-count restored)))
        (is (= (ds/column-count original) (ds/column-count restored)))
        (is (= (vec (original "date"))
               (vec (restored "date"))))
        (is (= (mapv meta (vals original))
               (mapv meta (vals restored)))))
      (finally
        (let [leftover (java.io.File. nippy-path)]
          (when (.exists leftover)
            (.delete leftover)))))))
|
|
|
|
|
|
(deftest empty-dataset-hasheq
  ;; hashCode of an empty dataset is expected to be 0.
  (let [empty-ds (ds/->dataset [])]
    (is (== 0 (.hashCode empty-ds)))))
|
|
|
|
(deftest dataset-equality
  (let [ds0 (ds/->dataset {:foo "foo" :bar "bar"}) ;;equal to 3
        ds1 (ds/->dataset {:foo "foo" :bar "bar" :baz "baz"})
        ds2 (ds/->dataset {:foo "foo" :bar "beer"})
        ds3 (ds/->dataset {:foo "foo" :bar "bar"}) ;;equal to 0
        datasets [ds0 ds1 ds2 ds3]
        hashmaps (mapv #(into {} %) datasets)
        ;; copy the entries into a mutable java.util.HashMap
        ->hash-map (fn [entries]
                     (reduce (fn [^java.util.Map acc [k v]]
                               (doto acc (.put k v)))
                             (java.util.HashMap.)
                             entries))
        mutmaps (mapv ->hash-map datasets)
        idxs (range (count datasets))
        ;; set of [i j] index pairs where dataset i equals (nth others j)
        equal-pairs (fn [others]
                      (set (for [i idxs
                                 j idxs
                                 :when (= (nth datasets i) (nth others j))]
                             [i j])))
        dsresults (equal-pairs datasets)
        hashresults (equal-pairs hashmaps)
        mapresults (equal-pairs mutmaps)
        expected #{[0 0] [1 1] [2 2] [3 3] [3 0] [0 3]}]
    (is (= dsresults expected)
        "Datasets should obey map equivalence when compared to datasets.")
    (is (= hashresults expected)
        "Datasets should obey map equivalence when compared to IPersistentMap.")
    (is (= mapresults expected)
        "Datasets should obey map equivalence when compared to java.util.Map
like HashMap.")))
|
|
|
|
(deftest columns-are-persistent-vectors
  (let [top-rows (ds/head (ds/->dataset "test/data/stocks.csv"))
        symbols (vec (top-rows "symbol"))]
    ;;We use a clever impl of APersistentVector for the columns
    (is (= symbols (top-rows "symbol")))))
|
|
|
|
|
|
(deftest replace-missing-test
  ;; Each assertion applies one replace-missing strategy and reads back :a.
  (let [sparse (ds/->dataset {:a [nil nil nil 1.0 2 nil nil nil
                                  nil nil 4 nil 11 nil nil]
                              :b [2 2 2 nil nil nil nil nil
                                  nil 13 nil 3 4 5 5]})
        filled-a (fn [& strategy-args]
                   (vec ((apply ds/replace-missing sparse strategy-args) :a)))]
    (is (= [nil nil nil 1.0 2.0 2.0 2.0 2.0 2.0 2.0 4.0 4.0 11.0 11.0 11.0]
           (filled-a :down)))
    (is (= [555.0 555.0 555.0 1.0 2.0 2.0 2.0 2.0 2.0 2.0 4.0 4.0 11.0 11.0 11.0]
           (filled-a :all :down 555)))
    (is (= [1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 2.0 2.0 4.0 4.0 11.0 11.0 11.0]
           (filled-a :all :downup)))
    (is (= [1.0 1.0 1.0 1.0 2.0 4.0 4.0 4.0 4.0 4.0 4.0 11.0 11.0 nil nil]
           (filled-a :up)))
    (is (= [1.0 1.0 1.0 1.0 2.0 4.0 4.0 4.0 4.0 4.0 4.0 11.0 11.0 11.0 11.0]
           (filled-a :updown)))
    (is (= [1.0 1.0 1.0 1.0 2.0 4.0 4.0 4.0 4.0 4.0 4.0 11.0 11.0 555.0 555.0]
           (filled-a :all :up 555)))
    (is (= [1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 4.0 4.0 4.0 4.0 11.0 11.0 11.0]
           (filled-a :mid)))
    (is (= [5.0 5.0 5.0 1.0 2.0 5.0 5.0 5.0 5.0 5.0 4.0 5.0 11.0 5.0 5.0]
           (filled-a :all :value 5.0)))))
|
|
|
|
(deftest replace-missing-string-table
  ;; The missing entry of a string column is replaced with "two".
  (is (= ["one" "two" "three"]
         (ds/column
          (ds/replace-missing-value (ds/->dataset {:a ["one" nil "three"]})
                                    "two")
          :a))))
|
|
|
|
|
|
(deftest replace-missing-all-values-missing
  ;; With every value missing, replacing by dfn/mean is expected to leave
  ;; both entries missing.
  (let [all-missing (ds/->dataset {:a [nil nil]})]
    (is (= 2 (dtype/ecount
              (ds/missing
               (ds/replace-missing all-missing [:a] :value dfn/mean)))))))
|
|
|
|
|
|
(deftest replace-missing-selector-fn
  ;; Column-filter fns choose which columns each replacement applies to.
  (let [sparse (ds/->dataset {:a [nil nil 2 4]
                              :b [nil nil 4 6]
                              :c [nil nil "A" "B"]})
        numeric-filled (ds/replace-missing sparse cf/numeric :value dfn/mean)
        filled (ds/replace-missing numeric-filled cf/categorical :value "C")]
    (is (= [3 3 2 4] (vec (filled :a))))
    (is (= [5 5 4 6] (vec (filled :b))))
    (is (= ["C" "C" "A" "B"] (vec (filled :c))))))
|
|
|
|
|
|
(deftest replace-missing-ldt
  ;; :lerp over LocalDateTime values: three missing entries between the two
  ;; endpoints are filled with evenly spaced instants.
  (let [start (java.time.LocalDateTime/of 2020 1 1 1 1 1)
        end (java.time.LocalDateTime/of 2020 10 1 1 1 1)
        gappy (ds/->dataset {:dt [start nil nil nil end]})]
    (is (= (seq ((ds/replace-missing gappy :lerp) :dt))
           [start
            (java.time.LocalDateTime/of 2020 3 9 13 1 1)
            (java.time.LocalDateTime/of 2020 5 17 1 1 1)
            (java.time.LocalDateTime/of 2020 7 24 13 1 1)
            end]))))
|
|
|
|
|
|
(deftest replace-missing-abb
  ;; After the :abb strategy no missing values may remain.
  (let [sparse (ds/->dataset {:a [nil nil nil 1.0 2 nil nil nil
                                  nil nil 4 nil 11 nil nil]
                              :b [2 2 2 nil nil nil nil nil
                                  nil 13 nil 3 4 5 5]})
        filled (ds/replace-missing sparse :abb)]
    (is (= 0 (-> filled ds/missing dtype/ecount)))))
|
|
|
|
|
|
(deftest dataset-column-nippy
  ;; A dataset holding datasets in a column must survive freeze/thaw.
  (let [inner [(ds/->dataset [{:a 1}])
               (ds/->dataset [{:b 2}])]
        nested (ds/->dataset {:a [1 2]
                              :datasets inner})
        thawed (nippy/thaw (nippy/freeze nested))]
    (is (= (map meta (vals nested))
           (map meta (vals thawed))))
    (is (= nested thawed))))
|
|
|
|
|
|
(deftest unique-by-nil-regression
  ;; No assertions: the test only requires that this call chain completes
  ;; without throwing on an all-nil column.
  (let [nil-col (ds-col/new-column :abc [nil nil])
        with-col (ds/add-column (ds/->dataset []) nil-col)]
    (ds/unique-by-column with-col :abc)))
|
|
|
|
|
|
(deftest missing-values-and-tensors
  ;; One missing value per column is expected to show up as three NaNs in
  ;; the :float64 tensor.
  (let [sparse (ds/->dataset {:a [1 nil 2]
                              :b [1.0 nil 2.0]
                              :c [5 nil 6]})]
    (is (= 3
           (count
            (filter #(Double/isNaN %)
                    (dtype/->reader
                     (ds-tens/dataset->tensor sparse :float64))))))))
|
|
|
|
|
|
(deftest bind->-test
  ;; bare symbol step
  (is (= 42 (ds/bind-> 41 x inc)))
  ;; step that also refers to the bound symbol
  (is (= 82 (ds/bind-> 41 x (+ x))))
  ;; step that ignores the bound symbol
  (is (= 31 (ds/bind-> 41 x (- 10))))

  ;; multi-step pipeline where later steps read columns added by earlier ones
  (is (dfn/equals
       [39.81 3.709 7.418]
       (ds/bind-> (ds/->dataset "test/data/stocks.csv") stocks
         (assoc :logprice2 (dfn/log1p (stocks "price")))
         (assoc :logp3 (dfn/* 2 (stocks :logprice2)))
         (ds/select-columns ["price" :logprice2 :logp3])
         (ds-tens/dataset->tensor)
         (first)))))
|
|
|
|
|
|
(deftest parse-nils
  ;; An all-nil column built from a column map (ds-a) and from a sequence of
  ;; row maps (ds-b) must agree on row count, and each must report two
  ;; missing entries.
  (let [ds-a (ds/->dataset {:a [nil nil]})
        ds-b (ds/->dataset [{:a nil} {:a nil}])]
    (is (= (ds/row-count ds-a)
           (ds/row-count ds-b)))
    ;; These are two separate assertions: previously the ds-b comparison was
    ;; passed as the message argument of `is` and was never checked.
    (is (= 2 (dtype/ecount (ds/missing ds-a))))
    (is (= 2 (dtype/ecount (ds/missing ds-b))))))
|
|
|
|
|
|
(deftest parser-fn-failing-on-csv-entries
  ;; A [:string fn] parser keeps only the first five characters of :date.
  (let [first-five (fn [s] (subs s 0 5))
        stocks (ds/->dataset "test/data/stocks.csv"
                             {:key-fn keyword
                              :parser-fn {:date [:string first-five]}})]
    (is (= "Jan 1"
           (first (stocks :date))))))
|
|
|
|
(deftest one-hot-failing
  ;; One-hot column names must follow the key type of the source column.
  (let [str-rows [{"a" 1 "b" "AA"}
                  {"a" 2 "b" "AA"}
                  {"a" 3 "b" "BB"}
                  {"a" 4 "b" "BB"}]
        kwd-rows [{:a 1 :b "AA"}
                  {:a 2 :b "AA"}
                  {:a 3 :b "BB"}
                  {:a 4 :b "BB"}]
        str-ds (ds/categorical->one-hot (ds/->dataset str-rows) ["b"])
        kwd-ds (ds/categorical->one-hot (ds/->dataset kwd-rows) [:b])]
    (is (= #{"a" "b-AA" "b-BB"} (set (ds/column-names str-ds))))
    (is (= #{:a :b-AA :b-BB} (set (ds/column-names kwd-ds))))))
|
|
|
|
|
|
;; After selecting the first four rows of a five-row dataset, row index 4 must
;; no longer be reachable through the selection.
(deftest select-memory
  (let [five-rows (ds/->dataset (mapv (fn [n] {:a n}) (range 5)))
        first-four (ds/select-rows five-rows (range 4))]
    (is (= (vec (range 4)) (vec (first-four :a))))
    (is (thrown? Throwable (vec (:a (ds/select-rows first-four 4)))))))
|
|
|
|
|
|
;; Sorting a column with an explicit `compare` comparator must give the same
;; order as clojure.core/sort on the raw values.
(deftest custom-sort-by-column
  (let [unsorted [5 4 3 2 8 7 6]
        sorted-ds (ds/sort-by-column (ds/->dataset {:a unsorted}) :a compare)]
    (is (= (vec (sort unsorted))
           (vec (sorted-ds :a))))))
|
|
|
|
|
|
;; new-column called with name, data, nil and the index list [1 2 3]: the
;; expected result shows those three indexes reading back as nil.
(deftest set-missing-new-column
  (let [ones (repeat 10 1)
        missing-indexes [1 2 3]
        col (ds-col/new-column "abc" ones nil missing-indexes)]
    (is (= [1 nil nil nil 1 1 1 1 1 1] (vec col)))))
|
|
|
|
|
|
;; Regression check without assertions: the test passes as long as a left join
;; keyed on a LocalDate column does not throw.
(deftest join-on-date
  (let [left (ds/->dataset {:a [(LocalDate/of 2001 1 1)]
                            :b [11]})
        right (ds/->dataset {:a [(LocalDate/of 2001 1 1)]
                             :c [22]})]
    (ds-join/left-join :a left right)))
|
|
|
|
|
|
;; Two samples drawn with the same :seed must contain the same symbols in the
;; same order.
(deftest sample-repeatable-seed
  (let [stocks (ds/->dataset "test/data/stocks.csv")
        seeded-symbols (fn [] (vec (get (ds/sample stocks 5 {:seed 20}) "symbol")))]
    (is (= (seeded-symbols)
           (seeded-symbols)))))
|
|
|
|
|
|
;; The one-argument arity of ds/sample must return as many rows as an explicit
;; request for five rows.
(deftest sample-arities
  (let [stocks (ds/->dataset "test/data/stocks.csv")
        symbol-count (fn [sampled] (dtype/ecount (get sampled "symbol")))]
    (is (= (symbol-count (ds/sample stocks))
           (symbol-count (ds/sample stocks 5))))))
|
|
|
|
|
|
;; .addAll on a freshly made string table must append every string in order.
(deftest string-table-addall
  (let [words ["one" "two" "three"]
        table (doto (str-table/make-string-table 0)
                (.addAll words))]
    (is (= (vec table)
           words))))
|
|
|
|
|
|
;; concat-copying on columns whose values are vectors must keep every value,
;; first dataset first.
(deftest concat-copying-object-fail
  (let [first-ds (ds/->dataset {:a [["A" 1] ["B" 1]]})
        second-ds (ds/->dataset {:a [["A" 2] ["B" 2]]})
        combined (ds/concat-copying first-ds second-ds)]
    (is (= [["A" 1] ["B" 1] ["A" 2] ["B" 2]]
           (vec (combined :a))))))
|
|
|
|
|
|
;; descriptive-stats on a dataset concatenated with itself must report the
;; same column means as the original two rows (1.5 for "A", 2.5 for "B").
(deftest concat-inplace-desc-stats
  (let [two-rows (ds/->dataset [{"A" 1 "B" 2} {"A" 2 "B" 3}])]
    (is (dfn/equals [1.5 2.5]
                    (:mean (ds/descriptive-stats (ds/concat two-rows two-rows)))))))
|
|
|
|
|
|
;; Replacing missing values with a function (dfn/mean) must fill the gaps with
;; that function's result over the column; the mean of [2 2] is 2.
;; The stray `[]` that followed the test name was removed: deftest takes no
;; argument vector, so it was only a no-op expression.
(deftest replace-missing-regression-181
  (let [ds (ds/->dataset {:a [nil nil 2 2]})]
    (is (= [2 2 2 2]
           (-> (ds/replace-missing ds :all :value dfn/mean)
               :a
               vec)))))
|
|
|
|
|
|
;; Exercises ds/replace-missing with the :midpoint, :nearest and :mid
;; strategies on integer, float, string and local-date columns with gaps.
(deftest replace-missing-regression-184
  (let [base-date (LocalDate/parse "2020-12-11")
        ;; Builds a :local-date container from `dates` and adds 10*i days to
        ;; the entry at index i.
        staggered-dates (fn [dates]
                          (dtype-dt/plus-temporal-amount
                           (dtype/make-container :local-date dates)
                           (dfn/* 10 (range (count dates)))
                           :days))
        gaps-8 (ds/->dataset
                {:a [nil 2 nil nil 4 nil 6 nil]
                 :b [3.0 nil nil 6.0 nil 9.0 nil 12.0]
                 :c [nil "A" nil nil "B" nil "C" nil]
                 :d ["A" nil nil "B" nil "C" nil "D"]
                 :e (staggered-dates
                     [nil base-date nil nil base-date nil base-date nil])})
        midpoint (ds/replace-missing gaps-8 :midpoint)]
    (is (= [2.0 2.0 3.0 3.0 4.0 5.0 6.0 6.0] (vec (midpoint :a))))
    (is (= [3.0 4.5 4.5 6.0 7.5 9.0 10.5 12.0] (vec (midpoint :b))))
    (is (= [nil "A" "A" "A" "B" "B" "C" "C"] (vec (midpoint :c))))
    (is (= ["A" "A" "A" "B" "B" "C" "C" "D"] (vec (midpoint :d))))
    (is (= ["2020-12-21" "2020-12-21" "2021-01-05" "2021-01-05" "2021-01-20"
            "2021-01-30" "2021-02-09" "2021-02-09"]
           (mapv str (:e midpoint))))
    ;; Nine-row variant: only columns :a and :b are asserted on.
    (let [gaps-9 (ds/->dataset
                  {:a [nil 2 nil nil nil 4 nil 6 nil]
                   :b [3.0 nil nil nil 6.0 nil 9.0 nil 12.0]
                   :c [nil "A" nil nil "B" nil nil "C" nil]
                   :d ["A" nil nil "B" nil nil "C" nil "D"]
                   :e (staggered-dates
                       [nil base-date nil nil nil base-date nil
                        base-date nil])})
          nearest (ds/replace-missing gaps-9 :nearest)
          mid (ds/replace-missing gaps-9 :mid)]
      (is (= [2 2 2 2 4 4 4 6 6] (vec (nearest :a))))
      (is (= [2 2 2 2 4 4 4 6 6] (vec (mid :a))))
      (is (= [3.0 3.0 3.0 6.0 6.0 6.0 9.0 9.0 12.0] (vec (nearest :b)))))))
|
|
|
|
|
|
;; to-double-array must work both for a column built from a Clojure vector and
;; for one built from a primitive int array.
(deftest column-to-double-regression-187
  (let [from-vector (ds-col/new-column :col1 [1 2 3])]
    (is (dfn/equals [1 2 3]
                    (ds-col/to-double-array from-vector))))
  (let [from-int-array (ds-col/new-column :col1 (int-array [1 2 3]))]
    (is (dfn/equals (ds-col/to-double-array from-int-array) [1 2 3]))))
|
|
|
|
|
|
;; A column named with the boolean `false` is written to CSV and must read
;; back as the string "false". The temporary file is always deleted.
(deftest boolean-csv-column-names
  (let [out-path "test/out.csv"]
    (try
      (ds/write! (ds/->dataset {false [1]}) out-path)
      (is (= ["false"] (ds/column-names (ds/->dataset out-path))))
      (finally
        (.delete (File. out-path))))))
|
|
|
|
|
|
;; to-double-array must return a primitive double[]; the entry missing from
;; column :b (second row has no :b) is expected to come back as NaN.
(deftest to-double-array-returns-double-array
  (let [data (ds/->dataset [{:a 1.0 :b 2.0}
                            {:a 3.0}])
        double-array-class (Class/forName "[D")]
    (is (instance? double-array-class (ds-col/to-double-array (data :a))))
    (is (every? identity
                (dfn/eq [2.0 ##NaN]
                        (ds-col/to-double-array (data :b)))))))
|
|
|
|
|
|
;; Regression check without assertions: writing a dataset whose :name metadata
;; is nil must not throw. The output file is always deleted.
(deftest write-with-nil-name
  (let [out-path "test/data/nil-name.csv"
        unnamed (vary-meta (ds/->dataset [{:a 1.0 :b 2.0}
                                          {:a 3.0}])
                           assoc :name nil)]
    (try
      (ds/write! unnamed out-path)
      (finally
        (.delete (File. out-path))))))
|
|
|
|
|
|
;; Scalar column values ("hey", 1) and an infinite sequence are expected to be
;; fitted to the four rows of the only finite column.
(deftest create-dataset-scalars
  (let [data (ds/->dataset {:a [1 2 3 4]
                            :b "hey"
                            :c (range)
                            :d 1})
        column-datatype (fn [col] (:datatype (meta col)))]
    (is (= ["hey" "hey" "hey" "hey"]
           (vec (data :b))))
    (is (= [:int64 :string :int64 :int64]
           (mapv column-datatype (vals data))))))
|
|
|
|
|
|
;; Columns given as lists plus an infinite `cycle`: the row count is expected
;; to be 4, the length of the finite columns.
(deftest create-dataset-seq
  (let [years '(2020 2021 2020 2021)
        settings '("A" "A" "B" "B")
        data (ds/->dataset {:calendar-year years
                            :setting settings
                            :bigdata (cycle [1 2 3 4])})]
    (is (= 4 (ds/row-count data)))))
|
|
|
|
|
|
;; Selecting nil columns or nil rows must give a dataset with zero rows;
;; selecting nil rows must keep the full set of columns.
(deftest empty-dataset-on-select-nothing
  (let [stocks (ds/->dataset "test/data/stocks.csv")]
    (is (= 0 (ds/row-count (ds/select-columns stocks nil))))
    (is (= 0 (ds/row-count (ds/select-rows stocks nil))))
    (is (= (ds/column-count stocks)
           (ds/column-count (ds/select-rows stocks nil))))))
|
|
|
|
|
|
;; Casting a string column that contains "NaN" to [:float64 :relaxed?] must
;; still give numeric values for the first three entries.
(deftest column-cast-test-cce-fail
  (let [strings (ds/->dataset {:col1 [1 2 3 "NaN"]} {:parser-fn :string})]
    (is (= [1.0 2.0 3.0]
           (vec (take 3 (ds/column
                         (ds/column-cast strings :col1 [:float64 :relaxed?])
                         :col1)))))))
|
|
|
|
|
|
;; ds/brief on a completely empty dataset must not throw.
;; The previous form `(is '() (ds/brief ds))` asserted the constant '() (always
;; truthy) and only evaluated ds/brief as the assertion message; the assertion
;; below states the actual intent and passes/fails under the same conditions.
(deftest desc-stats-ok
  (let [ds (ds/->dataset [])]
    (is (any? (ds/brief ds)))))
|
|
|
|
|
|
;; ds/brief on a dataset with one empty and one non-empty column must not throw.
;; The previous form `(is '() (ds/brief ds))` asserted the constant '() (always
;; truthy) and only evaluated ds/brief as the assertion message; the assertion
;; below states the actual intent and passes/fails under the same conditions.
(deftest desc-stats-also-ok
  (let [ds (ds/->dataset {"col1" [] "col2" [1]})]
    (is (any? (ds/brief ds)))))
|
|
|
|
|
|
;; ds/brief on a dataset whose only column is empty must not throw.
;; The previous form `(is '() (ds/brief ds))` asserted the constant '() (always
;; truthy) and only evaluated ds/brief as the assertion message; the assertion
;; below states the actual intent and passes/fails under the same conditions.
(deftest desc-stats-oob
  (let [ds (ds/->dataset {"col1" []})]
    (is (any? (ds/brief ds)))))
|
|
|
|
|
|
;; ds/column-map over a column with a missing entry, using three different
;; option combinations that differ in how much of the result is scanned.
(deftest column-map-regression-1
  (let [testds (ds/->dataset [{:a 1.0 :b 2.0} {:a 3.0 :b 5.0} {:a 4.0 :b nil}])
        expected [3.0 6.0 nil]
        nil-safe-inc (fn [v] (when v (inc v)))]
    ;; No options: the result is scanned for both datatype and missing set.
    (is (= expected
           (:b2 (ds/column-map testds :b2 nil-safe-inc [:b]))))
    ;; Datatype supplied: the result is scanned for the missing set only and
    ;; is used in place.
    (is (= expected
           (:b2 (ds/column-map testds :b2 nil-safe-inc
                               {:datatype :float64} [:b]))))
    ;; Datatype and missing-fn supplied: nothing is scanned at all.
    (is (= expected
           (:b2 (ds/column-map testds :b2 (fn [v] (inc v))
                               {:datatype :float64
                                :missing-fn ds-col/union-missing-sets} [:b]))))
    ;; Disabled: the missing entry used to reach `inc` and cause an NPE.
    ;; The data is now cast to Double/NaN, so nothing is thrown any more.
    #_(is (thrown? Throwable
                   (ds/column-map testds :b2 #(inc %)
                                  {:datatype :float64}
                                  [:b])))))
|
|
|
|
|
|
;; Dropping one column must leave the remaining columns in their original
;; order, including the string-named "c" between keyword-named columns.
(deftest remove-columns-issue-242
  (is (= [:a "c" :d :e]
         (vec (ds/column-names
               (ds/drop-columns
                (ds/->dataset {:a [1] :b [2] "c" [3]
                               :d [4] :e [5]})
                [:b]))))))
|
|
|
|
|
|
;; Casting a string column to :packed-local-date must read back LocalDate
;; instances for parsed entries and nil for the missing entry.
(deftest column-cast-packed-date
  (let [raw (ds/->dataset [{:a 0 :b "2020-03-05"} {:a 1 :b nil}])
        casted (ds/column-cast raw :b :packed-local-date)
        dates (casted :b)]
    (is (instance? java.time.LocalDate (dates 0)))
    (is (nil? (dates 1)))))
|
|
|
|
|
|
;; A concatenated dataset with uneven columns must keep its :x values and its
;; missing set through a dataset->data / data->dataset round trip.
(deftest dataset->data-regression-249
  (let [uneven (ds/->dataset {:x ["1"]
                              :y ["2" "3"]})
        even (ds/->dataset {:x ["4"]
                            :y ["5"]})
        src-ds (ds/concat uneven even)
        rehydrated (ds/data->dataset (ds/dataset->data src-ds))]
    (is (= (vec (src-ds :x))
           (vec (rehydrated :x))))
    (is (= (ds/missing src-ds)
           (ds/missing rehydrated)))))
|
|
|
|
|
|
;; A column whose single value is itself a vector must survive a nippy
;; freeze/thaw round trip.
(deftest dataset->data-regression-250
  (let [src-ds (ds/->dataset {:x [1]
                              :y [[3 4]]})
        frozen (nippy/freeze src-ds)
        thawed (nippy/thaw frozen)]
    (is (= (vec (src-ds :y))
           (vec (thawed :y))))))
|
|
|
|
|
|
;; Individual columns must survive a nippy freeze/thaw round trip and be usable
;; to build a new dataset afterwards.
;; Compared with the previous version, the unused `price` binding was dropped
;; and the column locals were renamed so they no longer shadow
;; clojure.core/symbol.
(deftest freeze-thaw-column
  (let [stocks (ds/->dataset "test/data/stocks.csv" {:key-fn keyword})
        date-col (get stocks :date)
        symbol-col (get stocks :symbol)
        thawed-date (nippy/thaw (nippy/freeze date-col))
        thawed-symbol (nippy/thaw (nippy/freeze symbol-col))
        nds (ds/new-dataset [thawed-date thawed-symbol])]
    (is (= (vec date-col)
           (nds :date)))
    (is (= (vec symbol-col)
           (nds :symbol)))))
|
|
|
|
|
|
;; Invoking a column with index -1 must return the same value as invoking it
;; with the last valid index.
(deftest negative-index-on-columns-gets-last
  (let [stocks (ds/->dataset "test/data/stocks.csv")
        last-idx (dec (ds/row-count stocks))
        symbol-col (stocks "symbol")]
    (is (= (symbol-col last-idx) (symbol-col -1)))))
|
|
|
|
|
|
;; Requiring identical columns was a bad idea. Concatenating datasets, just
;; like concatenating sequences of maps, should not require the same columns
;; across all datasets; such a requirement leads to extremely error-prone code.
(deftest concat-doesnt-require-same-columns
  (let [with-a-and-c (ds/->dataset {:a (range 10)
                                    :c (repeat 10 (dtype-dt/local-date))})
        with-b (ds/->dataset {:b (range 10)})
        combined (ds/concat-copying with-a-and-c with-b)
        missing-count (fn [colname]
                        (dtype/ecount (ds/missing (combined colname))))]
    (is (= 20 (ds/row-count combined)))
    ;; Each of :a and :b is absent from one of the two ten-row inputs.
    (is (= 10 (missing-count :a)))
    (is (= 10 (missing-count :b)))))
|
|
|
|
|
|
;; It is far too confusing for users to have to navigate pack/unpack code in
;; any normal situation, so a value read from the "date" column must be usable
;; directly in a filter predicate.
(deftest filter-sort-columns-uses-unpacked-datatypes
  (let [stocks (ds/->dataset "test/data/stocks.csv")
        test-val (second (stocks "date"))
        same-date? (fn [v] (= v test-val))]
    (is (not= 0 (ds/row-count (ds/filter-column stocks "date" same-date?))))
    ;; Make sure sorting still works.
    (is (= (ds/row-count stocks)
           (ds/row-count (ds/sort-by-column stocks "date"))))))
|
|
|
|
|
|
;; Adding 1 to an integer column with a missing entry must give a column that
;; still has exactly one missing entry, at the same position.
(deftest binary-ops-on-integer-missing-results-in-nan
  (let [src-ds (ds/->dataset {:a [1 2 nil 4]})
        plus-one (dfn/+ (:a src-ds) 1)
        dst-ds (assoc src-ds :b plus-one)]
    (is (= 1 (dtype/ecount (ds/missing (dst-ds :b)))))
    (is (= [2.0 3.0 nil 5.0]
           (vec (dst-ds :b))))))
|
|
|
|
|
|
;; The :nan-strategy option of sort-by-column: :first and :last place the
;; missing entries at the respective end, :exception is expected to throw.
(deftest sort-works-with-nan
  (let [with-gaps (ds/->dataset {:a [1 nil 2 nil nil 4]})
        sort-with (fn [strategy]
                    (ds/sort-by-column with-gaps :a nil {:nan-strategy strategy}))
        ds-first (sort-with :first)
        ds-last (sort-with :last)]
    (is (= [nil nil nil 1 2 4] (vec (ds-first :a))))
    (is (= [1 2 4 nil nil nil] (vec (ds-last :a))))
    (is (thrown? Exception (sort-with :exception)))))
|
|
|
|
|
|
;; Concatenating a :local-date column with a :packed-local-date column must
;; give one of those two datatypes, for both the in-place and copying variants.
(deftest concat-packed-date-with-date-results-in-local-date-or-packed-local-date
  (let [unpacked (ds/->dataset (repeat 10 {:a (dtype-dt/local-date)})
                               {:parser-fn {:a :local-date}})
        packed (ds/->dataset {:a (repeat 10 (dtype-dt/local-date))}
                             {:parser-fn {:a :packed-local-date}})
        res-inp (ds/concat-inplace unpacked packed)
        res-cp (ds/concat-copying unpacked packed)
        date-dtype? #{:local-date :packed-local-date}]
    (is (date-dtype? (dtype/elemwise-datatype (res-inp :a))))
    (is (date-dtype? (dtype/elemwise-datatype (res-cp :a))))))
|
|
|
|
|
|
;; row-map with the wrong key type (:price instead of "price") is expected to
;; throw; with the right key it must produce the squared price column.
(deftest row-map-test
  (let [stocks (ds/->dataset "test/data/stocks.csv")
        ;; Returns a row function that squares the value found under `k`.
        squared-under (fn [k]
                        (fn [row] (hash-map :price2 (* (row k) (row k)))))]
    (is (thrown? Exception (ds/row-map stocks (squared-under :price))))
    (is (dfn/equals (dfn/sq (stocks "price"))
                    (ds/column (ds/row-map stocks (squared-under "price"))
                               :price2)))))
|
|
|
|
|
|
;; Merging a 10-row packed-date dataset into a 20-row dataset must give a :a
;; column that can still be rendered with .toString.
(deftest extend-packed-date-with-empty
  (let [twenty-rows (ds/->dataset {:b (range 20)})
        ten-dates (ds/->dataset (repeat 10 {:a (dtype-dt/local-date)})
                                {:parser-fn {:a :packed-local-date}})
        merged (merge twenty-rows ten-dates)]
    (is (some? (.toString (merged :a))))))
|
|
|
|
|
|
;; For a date column with one value and one nil, the min, mean and max from
;; descriptive-stats must all equal that single value.
(deftest desc-stats-date-col
  (let [src-ds (ds/->dataset
                {:date-time-with-nil ["Jul 1, 2011" nil]}
                {:parser-fn :local-date})
        stats (ds/descriptive-stats src-ds)
        only-date ((src-ds :date-time-with-nil) 0)
        stat-at-0 (fn [stat-key] ((get stats stat-key) 0))]
    (is (every? #(= % only-date)
                [(stat-at-0 :min) (stat-at-0 :mean) (stat-at-0 :max)]))))
|
|
|
|
|
|
;; nth on a ten-element column: out-of-range indexes (10 and -11) throw
;; without a default and return the default when given; -10 is the first item.
(deftest nth-col-neg-indexes
  (let [ten-items ((ds/->dataset {:a (range 10)}) :a)]
    (is (thrown? Throwable (nth ten-items 10)))
    (is (= :a (nth ten-items 10 :a)))
    (is (thrown? Throwable (nth ten-items -11)))
    (is (= :a (nth ten-items -11 :a)))
    (is (= 0 (nth ten-items -10 :a)))))
|
|
|
|
|
|
;; A window-2 rolling mean must give the same result for a dataset column
;; holding NaN, a column built with nil, and a column over a double array.
(deftest column-rolling-regression
  (let [expected [##NaN 2.0 2.5 3.5]
        matches-expected? (fn [data]
                            (every? identity
                                    (dfn/eq expected
                                            (rolling/fixed-rolling-window
                                             data 2 dfn/mean))))]
    (is (matches-expected? ((ds/->dataset {:a [##NaN 2 3 4]}) :a)))
    (is (matches-expected? (ds-col/new-column [nil 2 3 4])))
    (is (matches-expected? (ds-col/new-column (double-array [##NaN 2 3 4]))))))
|
|
|
|
|
|
;; Applying any of the concat variants to no datasets at all must return nil.
(deftest concat-nil-is-nil
  (is (nil? (apply ds/concat nil)))
  (is (nil? (apply ds/concat-copying nil)))
  (is (nil? (apply ds/concat-inplace nil))))
|
|
|
|
|
|
;; User metadata attached to a column must still be present after
;; replace-missing-value and after replace-missing.
(deftest replace-missing-whacks-metadata-274
  (let [tagged (ds/update-column (ds/->dataset {:a [0 nil 1 nil 2]})
                                 :a
                                 (fn [a-col] (with-meta a-col {:a :b})))
        dsm (ds/replace-missing-value tagged [:a] 10)
        dsmm (ds/replace-missing tagged [:a] :down)
        user-meta (fn [dataset] (select-keys (meta (dataset :a)) [:a]))]
    (is (= {:a :b} (user-meta tagged)))
    (is (= {:a :b} (user-meta dsm)))
    (is (= {:a :b} (user-meta dsmm)))))
|
|
|
|
|
|
;; ds/induction calls the supplied function once per row and adds the returned
;; map's keys as columns. The expected vectors below are the sums seen by the
;; function at each step.
(deftest induction-test
  (let [step-fn (fn [so-far]
                  {:sum-of-previous-row (dfn/sum (ds/rowvec-at so-far -1))
                   :sum-a (dfn/sum (so-far :a))
                   :sum-b (dfn/sum (so-far :b))})
        induct-ds (ds/induction (ds/->dataset {:a [0 1 2 3] :b [1 2 3 4]})
                                step-fn)]
    (is (= [0.0 1.0 3.0 6.0]
           (induct-ds :sum-b)))

    (is (= [0.0 0.0 1.0 3.0]
           (induct-ds :sum-a)))

    (is (= [0.0 1.0 5.0 14.0]
           (induct-ds :sum-of-previous-row)))))
|
|
|
|
|
|
;; Each row expands into (row :data) output rows, so the result must have as
;; many rows as the sum of the :data column.
(deftest row-mapcat
  (let [source (ds/->dataset {:rid (range 10)
                              :data (repeatedly 10 #(rand-int 3))})
        expand-row (fn [row]
                     (for [idx (range (row :data))]
                       {:idx idx}))
        expanded (ds/row-mapcat source expand-row)
        expected-rows (long (dfn/sum (source :data)))]
    (is (= expected-rows (ds/row-count expanded)))))
|
|
|
|
|
|
;; Defines a struct with one field per numeric datatype, fills every field
;; column of a 10-element struct array with 0..9 and checks that a dataset
;; built from those columns reports matching datatypes and values.
(deftest array-of-structs-all-dtypes
  (let [field-specs (mapv (fn [[field-name datatype]]
                            {:name field-name :datatype datatype})
                          [[:i8 :int8] [:u8 :uint8]
                           [:i16 :int16] [:u16 :uint16]
                           [:i32 :int32] [:u32 :uint32]
                           [:i64 :int64] [:u64 :uint64]
                           [:f32 :float32] [:f64 :float64]])
        sdef (dt-struct/define-datatype! :alldtypes field-specs)
        structs (dt-struct/new-array-of-structs :alldtypes 10)
        cmap (dt-struct/column-map structs)
        _ (doseq [field-col (vals cmap)]
            (dtype/copy! (range 10) field-col))
        ds (ds/->dataset cmap)
        layout (sdef :data-layout)]
    (doseq [field layout]
      (let [field-name (:name field)
            col (ds/column ds field-name)
            cmeta (meta col)]
        ;; The field description is used as the failure message.
        (is (= (:datatype cmeta) (:datatype field)) (str field))
        (is (= (vec (cmap field-name))
               (vec col))
            (str field))))))
|
|
|
|
|
|
;; Filling the gaps of a date column with a fixed date value must leave no
;; missing entries and five identical dates.
(deftest replace-missing-packed-local-date
  (let [today (dtype-dt/local-date)
        with-gaps (ds/->dataset {:a [today nil nil today nil]})
        filled (ds/replace-missing with-gaps :all :value today)]
    (is (== 0 (dtype/ecount (ds/missing filled))))
    (is (= (vec (repeat 5 today))
           (vec (filled :a))))))
|
|
|
|
|
|
;; Variable-size rolling windows keyed on the double column :a, checking the
;; mean of :b over the first five rows for several window sizes and positions.
(deftest variable-rolling-window-doubles
  (let [ds (ds/->dataset {:a (double-array (range 100))
                          :b (range 100)})
        ;; Rolls `ds` with a variable window over :a (plus any extra window
        ;; options) and returns the head of the result.
        rolled-head (fn [extra-opts]
                      (ds/head (ds-roll/rolling
                                ds
                                (merge {:window-type :variable
                                        :column-name :a}
                                       extra-opts)
                                {:b-mean (ds-roll/mean :b)})))
        small-win (rolled-head {:window-size 10})
        big-win (rolled-head {:window-size 20})]
    (is (dfn/equals [4.5 5.5 6.5 7.5 8.5] (vec (small-win :b-mean))))
    (is (dfn/equals [0.0 0.5 1.0 1.5 2.0]
                    (vec (ds/column
                          (rolled-head {:window-size 10
                                        :relative-window-position :left})
                          :b-mean))))
    (is (dfn/equals [2.0 2.5 3.0 3.5 4.0]
                    (vec (ds/column
                          (rolled-head {:window-size 10
                                        :relative-window-position :center})
                          :b-mean))))
    (is (dfn/equals [9.5 10.5 11.5 12.5 13.5] (vec (big-win :b-mean))))))
|
|
|
|
|
|
|
|
;; A rolling reducer that receives two columns' windows at once; the result
;; column must have the requested :float64 datatype.
(deftest rolling-multi-column-reducer
  (let [source (ds/->dataset {:a (range 100)
                              :b (range 100)})
        sum-both (fn [a b]
                   (+ (dfn/sum a) (dfn/sum b)))
        fin-ds (ds-roll/rolling source 10 {:c {:column-name [:a :b]
                                               :reducer sum-both
                                               :datatype :float64}})]
    (is (= :float64 (dtype/elemwise-datatype (fin-ds :c))))
    (is (= [20.0 30.0 42.0 56.0 72.0]
           (vec (take 5 (fin-ds :c)))))))
|
|
|
|
|
|
;; Unrolling a column of vectors must flatten it to one value per row,
;; in order.
(deftest unroll-single-column
  (is (= (vec (range 9))
         (vec (ds/column
               (ds/unroll-column
                (ds/->dataset {:a [[0 1 2 3] [4 5] [6 7 8]]})
                :a)
               :a)))))
|
|
|
|
|
|
;; A dataset can be built from a sequence of java.util.HashMap rows.
(deftest construct-with-hashmap
  (let [row (java.util.HashMap.)
        _ (.put row :a 1)
        _ (.put row :b 2)
        from-hashmaps (ds/->dataset [row row row])]
    (is (= [1 1 1]
           (vec (from-hashmaps :a))))))
|
|
|
|
|
|
;; filter-column with `identity` on a double, an integer and a keyword column,
;; each with a missing/NaN entry in the middle row. The expected values show
;; that 0.0 and 0 are also dropped for :a and :b, while :c keeps both of its
;; non-nil rows.
(deftest double-nan-missing
  (let [ds (ds/->dataset {:a [0.0 Double/NaN 2.0]
                          :b [0 nil 2]
                          :c [:a nil :b]})
        a-after-filtering (fn [colname]
                            (vec (ds/column
                                  (ds/filter-column ds colname identity)
                                  :a)))]
    (is (= [2.0] (a-after-filtering :a)))
    (is (= [2.0] (a-after-filtering :b)))
    (is (= [0.0 2.0] (a-after-filtering :c)))))
|
|
|
|
|
|
;; Concatenating two datasets that have had all rows dropped, and that have
;; different columns, must return a non-nil result.
(deftest issue-315
  (let [emptied (fn [row] (ds/drop-rows (ds/->dataset [row]) [0]))]
    (is (some? (ds/concat (emptied {:a 1 :b 2})
                          (emptied {:a 1 :c3 2}))))))
|
|
|
|
|
|
;; :key-fn must be applied to column names for every input form: a sequence of
;; maps, a map of columns, and a CSV input stream.
(deftest issue-259
  (let [dashed-keyword #(keyword (clojure.string/replace % " " "-"))
        lower-keyword #(keyword (.toLowerCase %))
        column-name-set (fn [dataset]
                          (set (map (comp :name meta) (vals dataset))))]
    (let [from-rows (ds/->dataset [{"a o" 1 "b o" 2} {"a o" 5 "b o" 3}]
                                  {:key-fn dashed-keyword})]
      (is (= #{:b-o :a-o} (column-name-set from-rows))))
    (let [from-columns (ds/->dataset {"a o" [1 5] "b o" [2 3]}
                                     {:key-fn dashed-keyword})]
      (is (= #{:b-o :a-o} (column-name-set from-columns))))
    (let [from-row (ds/->dataset [{"Foo" 1 "Bar" 2}]
                                 {:key-fn lower-keyword})]
      (is (= #{:foo :bar} (column-name-set from-row))))
    (let [from-csv (ds/->dataset
                    (ByteArrayInputStream. (.getBytes "Foo,Bar\n1,2"))
                    {:key-fn lower-keyword
                     :file-type :csv})]
      (is (= #{:foo :bar} (column-name-set from-csv))))))
|
|
|
|
|
|
;; categorical->number with an explicit lookup table: a table containing a
;; non-integer value (2.2) is expected to throw; an all-integer table must map
;; the symbols onto exactly 1..5.
(deftest discrete-categorical-issue-322
  (let [stocks (ds/->dataset "test/data/stocks.csv")
        integer-table {"AAPL" 1
                       "MSFT" 2
                       "AMZN" 3
                       "IBM" 4
                       "GOOG" 5}
        fractional-table {"AAPL" 1
                          "MSFT" 2.2
                          "AMZN" 3
                          "IBM" 4
                          "GOOG" 5}]
    (is (thrown? Exception
                 (ds/categorical->number stocks ["symbol"] fractional-table)))
    (is (= (set (range 1 6))
           (set (map long
                     (ds/column
                      (ds/categorical->number stocks ["symbol"] integer-table)
                      "symbol")))))))
|
|
|
|
|
|
;; User metadata on a column must survive a column->data / data->column
;; round trip.
(deftest column-meta-roundtrip
  (is (= :v
         (:k (meta (ds-base/data->column
                    (ds-base/column->data
                     (ds-col/new-column :a [0] {:k :v}))))))))
|
|
|
|
;; ds/print-all must set the same metadata as ds-print/print-range with :all,
;; and printing a 1000-row dataset that way must emit more than 1000 chars.
(deftest print-all-test
  (let [thousand-rows (ds/->dataset (for [i (range 1000)] {:a i}))
        printed-length (fn [dataset]
                         (count (with-out-str (println dataset))))]
    (is (= (meta (ds/print-all thousand-rows))
           (meta (ds-print/print-range thousand-rows :all))))
    (is (> (printed-length (ds/print-all thousand-rows))
           1000))))
|
|
|
|
;; Copying a short column with missing entries to arrays: the raw short array
;; shows -32768 at missing positions, while the :float64 array and the column
;; itself report those positions as non-finite. The mean rounds to 25.
;; The stray `[]` that followed the test name was removed: deftest takes no
;; argument vector, so it was only a no-op expression.
(deftest column-copy-test
  (let [short-col (:a (ds/->dataset (interleave
                                     (repeat 10 {:a (short 25)})
                                     (repeat 10 {:a nil}))))]
    (is (= (vec (apply concat (repeat 10 [25 -32768])))
           (vec (dtype/->array short-col))))
    (is (= (vec (apply concat (repeat 10 [true false])))
           (vec (dfn/finite? (dtype/->array :float64 short-col)))))
    (is (= (vec (apply concat (repeat 10 [true false])))
           (vec (dfn/finite? short-col))))
    (is (= 25
           (Math/round (dfn/mean short-col))))))
|
|
|
|
;; select-columns accepts column-filter functions: cf/categorical must pick the
;; string column and cf/numeric the two number columns.
(deftest select-columns-test
  (let [mixed (ds/->dataset {:A [1 2 3]
                             :B [4 5 6]
                             :C ["A" "B" "C"]})]
    (is (= (ds/select-columns mixed [:C])
           (ds/select-columns mixed cf/categorical)))
    (is (= (ds/select-columns mixed cf/numeric)
           (ds/select-columns mixed [:A :B])))))
|
|
|
|
;; drop-columns and remove-columns must agree with each other and with the
;; complementary select-columns call when given a column-filter function.
(deftest drop-columns-test
  (let [mixed (ds/->dataset {:A [1 2 3]
                             :B [4 5 6]
                             :C ["A" "B" "C"]})]
    (is (= (ds/drop-columns mixed cf/categorical)
           (ds/remove-columns mixed cf/categorical)
           (ds/select-columns mixed [:A :B])))
    (is (= (ds/drop-columns mixed cf/numeric)
           (ds/remove-columns mixed cf/numeric)
           (ds/select-columns mixed [:C])))))
|
|
|
|
;; ds-col/select must accept both a list of indexes and the result of an
;; elementwise comparison.
(deftest column-select-test
  (let [zero-to-five (ds-col/new-column :test [0 1 2 3 4 5])
        below-three (dfn/< zero-to-five 3)]
    (is (= [0 1 2]
           (ds-col/select zero-to-five [0 1 2])))
    (is (= [0 1 2]
           (ds-col/select zero-to-five below-three)))))
|
|
|
|
;; ds/select with :all columns must accept both explicit row indexes and the
;; result of an elementwise comparison as its row selector.
(deftest dataset-column-select-test
  (let [five-rows (ds/->dataset {:A [1 2 3 4 5]
                                 :B [2 3 4 5 6]})
        first-and-last (ds/->dataset {:A [1 5]
                                      :B [2 6]})
        first-two (ds/->dataset {:A [1 2]
                                 :B [2 3]})]
    (is (= first-and-last
           (ds/select five-rows :all [0 4])))
    (is (= first-two
           (ds/select five-rows :all (dfn/< (:A five-rows) 3))))))
|
|
|
|
|
|
;; descriptive-stats on the stocks data must produce something that can be
;; rendered with .toString.
(deftest basic-desc-stats
  (let [stocks (ds/->dataset "test/data/stocks.csv")
        stats (ds/descriptive-stats stocks)]
    (is (some? (.toString ^Object stats)))))
|
|
|
|
|
|
;; Prepending or appending five empty slots to a one-element date column must
;; give nils on the respective side of the original date.
(deftest extend-prepend-packed-column
  (let [the-date (LocalDate/of 2022 12 28)
        acol ((ds/->dataset {:a [(LocalDate/of 2022 12 28)]}) :a)
        prepended (col-impl/prepend-column-with-empty acol 5)
        extended (col-impl/extend-column-with-empty acol 5)
        five-nils (vec (repeat 5 nil))]
    (is (= (conj five-nils the-date)
           (vec prepended)))
    (is (= (into [the-date] five-nils)
           (vec extended)))))
|
|
|
|
;; Regression check without assertions: filtering a column that is itself named
;; :datatype (and holds keyword values) must not throw.
(deftest filter-regression-342
  (let [rows (repeat 1000 {:datatype :float64 :b 2})
        object-dtype? (fn [v] (= v :object))]
    (ds/filter-column (ds/->dataset rows) :datatype object-dtype?)))
|
|
|
|
|
|
;; ds/head and ds/tail on a 1000-row dataset.
;; The original body was an empty `let`, so neither function was ever called;
;; the assertions below exercise both.
;; NOTE(review): the expected values assume head/tail return the first/last n
;; rows in their original order — confirm against the tech.v3.dataset docs.
(deftest head-tail-regression-343
  (let [ds (ds/->dataset {:a (repeat 1000 :a)
                          :b (range 1000)})
        first-five (ds/head ds 5)
        last-five (ds/tail ds 5)]
    (is (= 5 (ds/row-count first-five)))
    (is (= 5 (ds/row-count last-five)))
    (is (= (vec (range 5)) (vec (first-five :b))))
    (is (= (vec (range 995 1000)) (vec (last-five :b))))))
|
|
|
|
|
|
;; A column mixing a number with booleans must be given the :object datatype.
(deftest mixed-boolean-values
  (let [mixed-col ((ds/->dataset {:a [1 true false]}) :a)]
    (is (= :object (:datatype (meta mixed-col))))))
|
|
|
|
|
|
;; Feeds rows into two dataset parsers (three columns and one column), then
;; repeatedly derefs and indexes them. The first four loops are untimed runs
;; ahead of the timed ones; timings are printed, and the only assertion is on
;; the last row.
(deftest fast-parser-ds-creation
  (let [source (ds/->dataset {:a (range 2000) :b (range 2000) :c (range 2000)})
        a-parser (ds-api/dataset-parser {:dataset-name "just/a/column"})
        parser (ds-api/dataset-parser {:dataset-name "all/three/columns"})]
    (ds-proto/add-rows parser (ds/rows source))
    (ds-proto/add-rows a-parser (ds/rows (ds/select-columns source [:a])))
    ;; Untimed runs.
    (dotimes [_ 4000] @parser)
    (dotimes [_ 4000] @a-parser)
    (dotimes [_ 4000] (nth parser -1))
    (dotimes [_ 10] (vec parser))
    ;; Timed runs.
    (println "3 column creation")
    (time (dotimes [_ 1000] @parser))
    (println "1 column creation")
    (time (dotimes [_ 1000] @a-parser))
    (println "row-at time")
    (println (nth parser -1))
    (time
     (dotimes [_ 1000] (nth parser -1)))
    (time (vec parser))
    (is (= {:a 1999 :b 1999 :c 1999} (nth parser -1)))))
|
|
|
|
|
|
;; Naming a column twice in select-columns must not duplicate it in the result.
(deftest select-columns-repeat-columns
  (let [two-cols (ds/->dataset {:a [1 2] :b [3 4]})
        selected (ds/select-columns two-cols [:a :b :a])]
    (is (= [:a :b] (vec (ds/column-names selected))))))
|
|
|
|
|
|
;; ds-col/column-map applied over all of a dataset's columns via `apply`.
;; This used to throw; the test only checks that a result is produced.
(deftest vararg-column-map
  (let [ds (ds/->dataset {:foo (range 0 5)
                          :bar (repeatedly #(rand-int 100))
                          :baz (repeatedly #(rand-int 100))})
        mod-7-label (fn [foo bar baz]
                      (if (zero? (mod (+ foo bar baz) 7)) "mod 7" "not mod 7"))]
    (is (some?
         (ds/add-or-update-column
          ds :quz
          (apply ds-col/column-map mod-7-label nil (ds/columns ds)))))))
|
|
|
|
;; Selecting row 0 from a dataset without rows must throw
;; IndexOutOfBoundsException, with or without columns present.
(deftest ioobe-issue-360
  (let [empty-column-ds (ds/->dataset {:a []})
        no-columns-ds (ds/->dataset [])]
    (is (thrown? IndexOutOfBoundsException (ds/select-rows empty-column-ds [0])))
    (is (thrown? IndexOutOfBoundsException (ds/select-rows no-columns-ds [0])))))
|
|
|
|
|
|
;; The row count must be 3 when the columns are a pmap result, a list paired
;; with an infinite cycle, a vector paired with an infinite cycle, or a list
;; paired with a scalar.
(deftest failed-pmap-column-issue-367
  (let [rows-of (fn [column-map] (ds/row-count (ds/->dataset column-map)))]
    (is (== (rows-of {:a (tech.v3.parallel.for/pmap identity [1 2 3])}) 3))
    (is (== (rows-of {:a (list 1 2 3)
                      :b (cycle [1 2 3 4])})
            3))
    (is (== (rows-of {:a [1 2 3]
                      :b (cycle [1 2 3 4])})
            3))
    (is (== (rows-of {:a (list 1 2 3)
                      :b 2})
            3))))
|
|
|
|
;; Grouping an int-array column that repeats 0..9 three times: looking the
;; group up with an int key must find all three row indexes.
(deftest group-by-column->index-issue-372
  (let [thrice-0-to-9 (int-array (concat (range 10) (range 10) (range 10)))
        groups (ds/group-by-column->indexes (ds/->dataset {:a thrice-0-to-9})
                                            :a)]
    (is (= 3 (count (get groups (int 0)))))))
|
|
|
|
;; ds/select must treat a boolean row selector as a mask, whether it is a typed
;; boolean list or a plain vector of booleans.
(deftest select-bool-issue-387
  (let [ds (ds/->dataset {:a (range 10)})
        mask [true false true false true false true false true false]
        expected [0 2 4 6 8]
        selected-a (fn [row-selector] (:a (ds/select ds :all row-selector)))]
    (is (= expected
           (selected-a (dtype/make-list :boolean mask))))
    (is (= expected
           (selected-a mask)))))
|
|
|
|
;; With :disable-na-as-missing? true the string "NA" must be kept as a value,
;; for both column-map and row-sequence inputs.
(deftest disable-na-as-missing
  (let [expected-column ["foo" "NA"]
        keep-na {:disable-na-as-missing? true}
        from-columns (ds/->dataset {:a expected-column} keep-na)
        from-rows (ds/->dataset (for [v expected-column] {:a v}) keep-na)]
    (is (= expected-column (:a from-columns)))
    (is (= expected-column (:a from-rows)))))
|
|
|
|
;; With a fixed :string parser, "NA" and "na" are kept when
;; :disable-na-as-missing? is true and become nil when it is false.
(deftest fixed-type-disable-na-as-missing
  (let [rows [{:a "no"} {:a "NA"} {:a "na"}]
        parse-with (fn [disable?]
                     (ds/->dataset rows {:parser-fn :string
                                         :disable-na-as-missing? disable?}))
        na-kept (parse-with true)
        na-missing (parse-with false)]
    (is (= ["no" "NA" "na"] (:a na-kept)))
    (is (= ["no" nil nil] (:a na-missing)))))
|
|
|
|
;; Rows 10..19 are nulled out through a parallel row-map. The column's missing
;; set must be 10..19, and a 5-element sub-buffer starting at offset 10 must
;; report its own missing set as 0..4.
(deftest sub-buffer-col-incorrect-missing
  (let [null-from-ten (fn [m] {:a (if (>= (:a m) 10)
                                    nil (:a m))})
        ds (ds/row-map (ds/->dataset {:a (range 20)})
                       null-from-ten
                       {:parallelism 2
                        :min-n 1})
        col (ds :a)
        subcol (dtype/sub-buffer col 10 5)
        missing-indexes (fn [c] (bitmap/->random-access (ds/missing c)))]
    (is (= (range 10 20)
           (missing-indexes col)))
    (is (= (range 5)
           (missing-indexes subcol)))))
|
|
|
|
|
|
;; A transducer-based reduction (into with take) over an Instant column must
;; yield the requested three elements.
(deftest issue-413-reduction-on-instant-column
  (let [with-instants (ds/->dataset {:x (range 5)
                                     :y (repeatedly 5 #(java.time.Instant/now))})]
    (is (= 3 (count (into [] (take 3) (:y with-instants)))))))
|
|
|
|
|
|
;; The :print-index-range metadata set by print-all must be retained after
;; sorting and after filtering.
(deftest issue-432-issue-371
  (let [sds (ds/print-all (ds/->dataset {:x (repeatedly 50 rand)}))
        print-range (fn [dataset] (:print-index-range (meta dataset)))]
    (is (= :all (print-range (ds/sort-by-column sds :x))))
    (is (= :all (print-range (ds/filter-column sds :x pos?))))))
|
|
|
|
|
|
;; Passing the keyword :a as filter-column's third argument: the expected
;; result keeps exactly the rows whose value is :a.
(deftest issue-447-filter-column-by-keyword
  (let [keywords (ds/->dataset {:a [:a :b :a :c :a :d :a :e :a :f]})]
    (is (= [:a :a :a :a :a]
           (vec (ds/column (ds/filter-column keywords :a :a) :a))))))
|
|
|
|
;; After converting a two-valued categorical column to :float64 numbers,
;; `distinct` must still see exactly two values.
(deftest issue-450-incorrect-distinct
  (let [two-valued (ds/->dataset {:y [:a :b :b :a :a :a :b :b]})]
    (is (= 2
           (count (distinct
                   (:y (ds/categorical->number two-valued [:y] [] :float64))))))))
|
|
|
|
;; Cloning a row-filtered dataset must preserve the filtered :date values.
(deftest clone-causes-filter-fail
  (let [cutoff-text "2009-06-01"
        recent-amzn? (fn [row]
                       (and
                        (.isAfter ^LocalDate (get row :date) (LocalDate/parse cutoff-text))
                        (= (get row :symbol) "AMZN")))
        filtered (ds/filter (ds/->dataset "test/data/stocks.csv" {:key-fn keyword})
                            recent-amzn?)]
    (is (= (vec (:date filtered))
           (vec (:date (dtype/clone filtered)))))))
|
|
|
|
;; replace-missing with a fixed value must fill a column in which every entry
;; is nil.
(deftest replace-missing-empty-column-issue-458
  (let [people (ds/->dataset (mapv (fn [person] {:name person :age nil})
                                   ["fred" "ethel" "sally"]))]
    (is (= [100 100 100]
           (vec (ds/column (ds/replace-missing people [:age] :value 100)
                           :age))))))
|
|
|
|
(defn rolling-off-edge-fn
  "Asserts that a window-10 rolling mean over twenty 1s followed by twenty 0s
  gives pairwise different datasets for the :left, :center and :right relative
  window positions."
  []
  (let [step-values (concat (repeat 20 1)
                            (repeat 20 0))
        rolled-at (fn [relative-window-position]
                    (ds/print-all
                     (ds-roll/rolling (ds/->dataset {:x step-values})
                                      10
                                      {:mean-x (ds-roll/mean :x)}
                                      {:relative-window-position relative-window-position})))]
    (is (not= (rolled-at :left)
              (rolled-at :center)))
    (is (not= (rolled-at :center)
              (rolled-at :right)))
    (is (not= (rolled-at :left)
              (rolled-at :right)))))
|
|
|
|
;; Test entry point; the assertions are made inside rolling-off-edge-fn.
(deftest rolling-off-edge
  (rolling-off-edge-fn))
|
|
|
|
(defn stacked-rolling-fn
  "Computes a left-positioned window-10 rolling mean of a constant :y column
  on two datasets (twenty 2s, forty 1s), concatenates the results and asserts
  that every rolling mean is exactly 1.0 or 2.0."
  []
  (let [rolled-constant (fn [row-count y-value t-value]
                          (ds-roll/rolling
                           (ds/->dataset {:y (repeat row-count y-value)
                                          :x (range)
                                          :t t-value})
                           10
                           {:mean-y (ds-roll/mean :y)}
                           {:relative-window-position :left}))
        ds0 (rolled-constant 20 2 0)
        ds1 (rolled-constant 40 1 1)
        stacked (ds/print-all (ds/concat ds0 ds1))]
    ;; HH: 2025-09-08 - My condolences if this fails on your architecture
    (is (every? #{1.0 2.0} (:mean-y stacked)))))
|
|
|
|
;; Test entry point; the assertion is made inside stacked-rolling-fn.
(deftest stacked-rolling
  (stacked-rolling-fn))
|
|
|
|
;; REPL scratchpad: criterium benchmarks comparing plain Clojure sequence code
;; with the dataset API. Nothing in this form is evaluated when the file loads.
(comment
  (require '[criterium.core :as crit])

  ;; 100000 random row maps and the dataset built from them.
  (def data (vec (repeatedly 100000 (fn [] {:a (rand-int 20) :b (rand) :c (rand)}))))
  (def ds (ds/->dataset data))

  ;; Group by :a -- clojure.core vs. dataset.
  (crit/quick-bench (group-by :a data))
  (crit/quick-bench (ds/group-by-column ds :a {:map-fn hamf/mut-long-hashtable-map}))

  ;; Filter on :a, then sum :b * :c -- transducer over row maps.
  (crit/quick-bench (transduce (comp (filter #(> (:a %) 10))
                                     (map #(* (:b %) (:c %))))
                               + 0.0 data))

  (require '[tech.v3.datatype.functional :as dfn])

  ;; Dataset pipeline: filter on :a, add :b and :c, sum the result.
  (crit/quick-bench (as-> ds ds
                      (ds/filter-column ds :a #(> % 10))
                      (dfn/+ (ds :b) (ds :c))
                      (dfn/sum-fast ds)))

  (require '[ham-fisted.api :as hamf])

  ;; Same dataset pipeline with a hamf/long-predicate as the filter.
  (crit/quick-bench (as-> ds ds
                      (ds/filter-column ds :a (hamf/long-predicate a (> a 10)))
                      (dfn/+ (ds :b) (ds :c))
                      (dfn/sum-fast ds)))

  ;; Transducer version with explicit long/double coercions.
  (crit/quick-bench (transduce (comp (filter #(> (long (:a %)) 10))
                                     (map #(* (double (:b %)) (double (:c %)))))
                               + 0.0 data))
  )
|