1712 lines
58 KiB
Clojure
Vendored
1712 lines
58 KiB
Clojure
Vendored
(ns tablecloth.playground
|
|
(:require [tablecloth.api :as tc]
|
|
[tablecloth.api.utils :refer [rank]]
|
|
[tech.v3.dataset :as ds]
|
|
[tech.v3.dataset.column :as c]
|
|
[tech.v3.dataset.join :as j]
|
|
[tech.v3.datatype.functional :as dfn]
|
|
[clojure.string :as str]
|
|
[tech.v3.datatype.argops :as aop]
|
|
[tech.v3.dataset.column :as col]
|
|
[tech.v3.io :as tio]
|
|
[tech.v3.dataset.io :as ds-io]
|
|
[tech.v3.dataset.zip :as zip]
|
|
|
|
[clojure.test :as t])
|
|
(:import [org.roaringbitmap RoaringBitmap]))
|
|
|
|
(ds/concat
|
|
(ds/new-dataset [(c/new-column :a [])])
|
|
(ds/new-dataset [(c/new-column :a [1 2 3])]))
|
|
|
|
|
|
(-> (tc/dataset {} {:column-names [:a :b]}))
|
|
|
|
(defn cartesian-product
|
|
[xxs]
|
|
(if (seq xxs)
|
|
(for [n (cartesian-product (rest xxs))
|
|
x (first xxs)]
|
|
(conj n x))
|
|
'(nil)))
|
|
|
|
(defn expand-grid
|
|
[in]
|
|
(-> (map (partial zipmap (keys in))
|
|
(cartesian-product (vals in)))
|
|
(tc/dataset)))
|
|
|
|
(def input {:height (range 60 81 5)
|
|
:weight (range 100 301 50)
|
|
:sex [:Male :Female]})
|
|
|
|
(expand-grid input)
|
|
;; => _unnamed [50 3]:
|
|
;; | :sex | :height | :weight |
|
|
;; |-------|--------:|--------:|
|
|
;; | :Male | 60 | 100 |
|
|
;; | :Male | 65 | 100 |
|
|
;; | :Male | 70 | 100 |
|
|
;; | :Male | 75 | 100 |
|
|
;; | :Male | 80 | 100 |
|
|
;; | :Male | 60 | 150 |
|
|
;; | :Male | 65 | 150 |
|
|
;; | :Male | 70 | 150 |
|
|
;; | :Male | 75 | 150 |
|
|
;; | :Male | 80 | 150 |
|
|
;; | :Male | 60 | 200 |
|
|
;; | :Male | 65 | 200 |
|
|
;; | :Male | 70 | 200 |
|
|
;; | :Male | 75 | 200 |
|
|
;; | :Male | 80 | 200 |
|
|
;; | :Male | 60 | 250 |
|
|
;; | :Male | 65 | 250 |
|
|
;; | :Male | 70 | 250 |
|
|
;; | :Male | 75 | 250 |
|
|
;; | :Male | 80 | 250 |
|
|
;; | :Male | 60 | 300 |
|
|
;; | :Male | 65 | 300 |
|
|
;; | :Male | 70 | 300 |
|
|
;; | :Male | 75 | 300 |
|
|
;; | :Male | 80 | 300 |
|
|
|
|
(def ds (-> {:tags [["app" "mobile" "travel"] ["business"] ["tool" "automation" "macos"]
|
|
["mobile" "macos"] ["travel" "app" "macos"]]}
|
|
(tc/dataset)
|
|
(tc/add-column :id (range)))) ;; we need to add artificial column
|
|
|
|
ds
|
|
;; => _unnamed [5 2]:
|
|
;; | :tags | :id |
|
|
;; |-------------------------------|----:|
|
|
;; | ["app" "mobile" "travel"] | 0 |
|
|
;; | ["business"] | 1 |
|
|
;; | ["tool" "automation" "macos"] | 2 |
|
|
;; | ["mobile" "macos"] | 3 |
|
|
;; | ["travel" "app" "macos"] | 4 |
|
|
|
|
|
|
(def unrolled-ds (-> (tc/unroll ds [:tags])
|
|
(tc/add-column :exists? true)))
|
|
|
|
unrolled-ds
|
|
;; => _unnamed [12 3]:
|
|
;; | :id | :tags | :exists? |
|
|
;; |----:|------------|----------|
|
|
;; | 0 | app | true |
|
|
;; | 0 | mobile | true |
|
|
;; | 0 | travel | true |
|
|
;; | 1 | business | true |
|
|
;; | 2 | tool | true |
|
|
;; | 2 | automation | true |
|
|
;; | 2 | macos | true |
|
|
;; | 3 | mobile | true |
|
|
;; | 3 | macos | true |
|
|
;; | 4 | travel | true |
|
|
;; | 4 | app | true |
|
|
;; | 4 | macos | true |
|
|
|
|
(tc/pivot->wider unrolled-ds :tags :exists? {:drop-missing? false})
|
|
;; => _unnamed [5 8]:
|
|
;; | :id | app | mobile | travel | business | tool | automation | macos |
|
|
;; |----:|------|--------|--------|----------|------|------------|-------|
|
|
;; | 2 | | | | | true | true | true |
|
|
;; | 3 | | true | | | | | true |
|
|
;; | 4 | true | | true | | | | true |
|
|
;; | 1 | | | | true | | | |
|
|
;; | 0 | true | true | true | | | | |
|
|
|
|
|
|
(-> (tc/pivot->wider unrolled-ds :tags :exists? {:drop-missing? false})
|
|
(tc/replace-missing :all :value false))
|
|
;; => _unnamed [5 8]:
|
|
;; | :id | app | mobile | travel | business | tool | automation | macos |
|
|
;; |----:|-------|--------|--------|----------|-------|------------|-------|
|
|
;; | 2 | false | false | false | false | true | true | true |
|
|
;; | 3 | false | true | false | false | false | false | true |
|
|
;; | 4 | true | false | true | false | false | false | true |
|
|
;; | 1 | false | false | false | true | false | false | false |
|
|
;; | 0 | true | true | true | false | false | false | false |
|
|
|
|
|
|
(-> (tech.v3.dataset/unroll-column ds :tags)
|
|
(tech.v3.dataset/categorical->one-hot [:tags]))
|
|
|
|
(def data (tech.v3.dataset/->dataset {:a [[1 2 3] [4 5] [6 7 8]]}))
|
|
|
|
data
|
|
;; => _unnamed [3 1]:
|
|
;; | :a |
|
|
;; |---------|
|
|
;; | [1 2 3] |
|
|
;; | [4 5] |
|
|
;; | [6 7 8] |
|
|
|
|
(tech.v3.dataset/unroll-column data :a)
|
|
;; exception
|
|
|
|
(def data2 (tech.v3.dataset/add-or-update-column data :b [-1 -2 -3]))
|
|
|
|
data2
|
|
;; => _unnamed [3 2]:
|
|
;; | :a | :b |
|
|
;; |---------|---:|
|
|
;; | [1 2 3] | -1 |
|
|
;; | [4 5] | -2 |
|
|
;; | [6 7 8] | -3 |
|
|
|
|
(tech.v3.dataset/unroll-column data2 :a)
|
|
;; => _unnamed [8 2]:
|
|
;; | :b | :a |
|
|
;; |---:|---:|
|
|
;; | -1 | 1 |
|
|
;; | -1 | 2 |
|
|
;; | -1 | 3 |
|
|
;; | -2 | 4 |
|
|
;; | -2 | 5 |
|
|
;; | -3 | 6 |
|
|
;; | -3 | 7 |
|
|
;; | -3 | 8 |
|
|
|
|
(require '[tech.v3.dataset.join :as j])
|
|
(def a (tc/dataset {:a [1 2 3] :b [4 5 6]}))
|
|
(def b (tc/dataset {:c [:a :b :c] :d [:x :y :z]}))
|
|
|
|
(j/pd-merge a b {:how :cross})
|
|
;; => cross-join [9 4]:
|
|
;; | :a | :b | :c | :d |
|
|
;; |---:|---:|----|----|
|
|
;; | 1 | 4 | :a | :x |
|
|
;; | 2 | 5 | :b | :y |
|
|
;; | 3 | 6 | :c | :z |
|
|
;; | 1 | 4 | :a | :x |
|
|
;; | 2 | 5 | :b | :y |
|
|
;; | 3 | 6 | :c | :z |
|
|
;; | 1 | 4 | :a | :x |
|
|
;; | 2 | 5 | :b | :y |
|
|
;; | 3 | 6 | :c | :z |
|
|
|
|
;;;;;;;
|
|
|
|
(def df (tc/dataset {:group [1 2 1]
|
|
:item-id [1 2 2]
|
|
:item-name [:a :b :b]
|
|
:value1 [1 2 3]
|
|
:value2 [4 5 6]}))
|
|
|
|
df
|
|
;; => _unnamed [3 5]:
|
|
;; | :group | :item-id | :item-name | :value1 | :value2 |
|
|
;; |-------:|---------:|------------|--------:|--------:|
|
|
;; | 1 | 1 | :a | 1 | 4 |
|
|
;; | 2 | 2 | :b | 2 | 5 |
|
|
;; | 1 | 2 | :b | 3 | 6 |
|
|
|
|
;; cross product of regular column and nested columns
|
|
(def tmp (j/pd-merge (-> (tc/select-columns df :group)
|
|
(tc/unique-by))
|
|
(-> (tc/select-columns df [:item-id :item-name])
|
|
(tc/unique-by)) {:how :cross}))
|
|
|
|
tmp
|
|
;; => cross-join [4 3]:
|
|
;; | :group | :item-id | :item-name |
|
|
;; |-------:|---------:|------------|
|
|
;; | 1 | 1 | :a |
|
|
;; | 1 | 2 | :b |
|
|
;; | 2 | 1 | :a |
|
|
;; | 2 | 2 | :b |
|
|
|
|
(-> (tc/left-join tmp df [:group :item-id :item-name])
|
|
(tc/select-columns (tc/column-names df)))
|
|
;; => left-outer-join [4 5]:
|
|
;; | :group | :item-id | :item-name | :value1 | :value2 |
|
|
;; |-------:|---------:|------------|--------:|--------:|
|
|
;; | 1 | 1 | :a | 1 | 4 |
|
|
;; | 2 | 2 | :b | 2 | 5 |
|
|
;; | 1 | 2 | :b | 3 | 6 |
|
|
;; | 2 | 1 | :a | | |
|
|
|
|
(-> (tc/left-join tmp df [:group :item-id :item-name])
|
|
(tc/select-columns (tc/column-names df))
|
|
(tc/replace-missing :all :value 0))
|
|
;; => left-outer-join [4 5]:
|
|
;; | :group | :item-id | :item-name | :value1 | :value2 |
|
|
;; |-------:|---------:|------------|--------:|--------:|
|
|
;; | 1 | 1 | :a | 1 | 4 |
|
|
;; | 2 | 2 | :b | 2 | 5 |
|
|
;; | 1 | 2 | :b | 3 | 6 |
|
|
;; | 2 | 1 | :a | 0 | 0 |
|
|
|
|
;;;
|
|
|
|
|
|
(-> (tc/dataset [{:setting "bar", :calendar-year 2020, :ay 0, :people 2, :need "foo"}
|
|
{:setting "bar", :calendar-year 2021, :ay 0, :people 4, :need "foo"}
|
|
{:setting "quux", :calendar-year 2020, :ay 1, :people 1, :need "foo"}
|
|
{:setting "quux", :calendar-year 2021, :ay 2, :people 3, :need "foo"}
|
|
{:setting "quux", :calendar-year 2022, :ay 3, :people 6, :need "foo"}])
|
|
(tc/select-rows (comp #{0 1 2} :ay))
|
|
(:ay)
|
|
(sequential?))
|
|
|
|
;; => _unnamed [4 5]:
|
|
;; | :setting | :calendar-year | :people | :ay | :need |
|
|
;; |----------|---------------:|--------:|----:|-------|
|
|
;; | bar | 2020 | 2 | 0 | foo |
|
|
;; | bar | 2021 | 4 | 0 | foo |
|
|
;; | quux | 2020 | 1 | 1 | foo |
|
|
;; | quux | 2021 | 3 | 2 | foo |
|
|
|
|
(def ds (tc/dataset {:a [1 2 3]
|
|
:b [:A :B :C]}))
|
|
|
|
(map? ds);; => true
|
|
(seqable? (:a ds));; => true
|
|
(sequential? (ds :b));; => true
|
|
|
|
(-> (tc/dataset [{:a 1 :b 1.1 :c 1.2 :e 1.3}
|
|
{:a 1 :b 1.1 :c 5 :e 1.3}
|
|
{:a 2 :b 2.2 :c 3.2 :e 5.3}
|
|
{:a 3 :b 3.3 :c 4.2 :e 6.3}])
|
|
(tc/group-by [:a :b])
|
|
(tc/unmark-group)
|
|
(tc/select-rows #(> (ds/row-count (:data %)) 1))
|
|
(tc/mark-as-group)
|
|
(tc/ungroup))
|
|
|
|
;; => _unnamed [2 4]:
|
|
;; | :b | :c | :a | :e |
|
|
;; |----:|----:|---:|----:|
|
|
;; | 1.1 | 1.2 | 1 | 1.3 |
|
|
;; | 1.1 | 5.0 | 1 | 1.3 |
|
|
|
|
(-> (tc/dataset [{:a 1 :b 1.1 :c 1.2 :e 1.3}
|
|
{:a 1 :b 1.1 :c 5 :e 1.3}
|
|
{:a 2 :b 2.2 :c 3.2 :e 5.3}
|
|
{:a 3 :b 3.3 :c 4.2 :e 6.3}])
|
|
(tc/group-by [:a :b]))
|
|
|
|
;; => _unnamed [3 3]:
|
|
;; | :group-id | :name | :data |
|
|
;; |----------:|----------------|------------------------------|
|
|
;; | 0 | {:b 1.1, :a 1} | Group: {:b 1.1, :a 1} [2 4]: |
|
|
;; | 1 | {:b 2.2, :a 2} | Group: {:b 2.2, :a 2} [1 4]: |
|
|
;; | 2 | {:b 3.3, :a 3} | Group: {:b 3.3, :a 3} [1 4]: |
|
|
|
|
|
|
;;
|
|
|
|
(-> (tc/dataset [{"id" 1, "name" "bob"} {"id" 2, "name" "bob"}, {"id" 3, "name" "alice"}])
|
|
(tc/fold-by ["name"] (partial str/join ", ")))
|
|
|
|
;; => _unnamed [2 2]:
|
|
;; | name | id |
|
|
;; |-------|------|
|
|
;; | bob | 1, 2 |
|
|
;; | alice | 3 |
|
|
|
|
|
|
|
|
(def ds (tc/dataset {:group (repeatedly 100 #(rand-nth [:u :i :o :p]))
|
|
:a (repeatedly 100 rand)
|
|
:b (repeatedly 100 #(rand 5))}))
|
|
|
|
(-> ds
|
|
(tc/select-rows (fn [{:keys [a b]}] (> a b))))
|
|
|
|
ds
|
|
;; => _unnamed [100 3]:
|
|
;; | :group | :a | :b |
|
|
;; |--------|-----------:|-----------:|
|
|
;; | :i | 0.21923487 | 0.02757318 |
|
|
;; | :o | 0.47121141 | 1.03039666 |
|
|
;; | :i | 0.26676569 | 4.16760503 |
|
|
;; | :u | 0.38287395 | 2.24760673 |
|
|
;; | :o | 0.96585848 | 0.01097307 |
|
|
;; | :o | 0.88531448 | 4.46218351 |
|
|
;; | :p | 0.19173693 | 3.57395669 |
|
|
;; | :p | 0.53615392 | 2.74081683 |
|
|
;; | :p | 0.43978083 | 4.67902393 |
|
|
;; | :p | 0.72916344 | 1.01818710 |
|
|
;; | ... | ... | ... |
|
|
;; | :o | 0.75565644 | 0.38405762 |
|
|
;; | :p | 0.06499388 | 2.39519054 |
|
|
;; | :p | 0.33801667 | 4.25192711 |
|
|
;; | :u | 0.76001839 | 3.49197309 |
|
|
;; | :u | 0.71527541 | 1.33661718 |
|
|
;; | :u | 0.46813365 | 3.31539727 |
|
|
;; | :o | 0.90347413 | 0.59158482 |
|
|
;; | :i | 0.40986820 | 1.26976567 |
|
|
;; | :u | 0.62212172 | 2.57981844 |
|
|
;; | :u | 0.77436201 | 1.90787845 |
|
|
;; | :p | 0.95894755 | 4.35116263 |
|
|
|
|
(def agg (-> ds
|
|
(tc/group-by [:group])
|
|
(tc/aggregate-columns [:a :b] dfn/mean)
|
|
(tc/rename-columns {:a :mean-a :b :mean-b})))
|
|
|
|
agg
|
|
;; => _unnamed [4 3]:
|
|
;; | :mean-b | :group | :mean-a |
|
|
;; |-----------:|--------|-----------:|
|
|
;; | 2.20742202 | :i | 0.53935031 |
|
|
;; | 2.00721651 | :o | 0.60963595 |
|
|
;; | 2.69337664 | :u | 0.48111990 |
|
|
;; | 2.69662835 | :p | 0.50979658 |
|
|
|
|
(-> (tc/left-join ds agg :group)
|
|
(tc/drop-columns :right.group))
|
|
|
|
;; => left-outer-join [100 5]:
|
|
;; | :group | :a | :b | :mean-b | :mean-a |
|
|
;; |--------|-----------:|-----------:|-----------:|-----------:|
|
|
;; | :i | 0.21923487 | 0.02757318 | 2.20742202 | 0.53935031 |
|
|
;; | :i | 0.26676569 | 4.16760503 | 2.20742202 | 0.53935031 |
|
|
;; | :i | 0.97345184 | 3.68318639 | 2.20742202 | 0.53935031 |
|
|
;; | :i | 0.19880123 | 1.53596251 | 2.20742202 | 0.53935031 |
|
|
;; | :i | 0.13353903 | 4.57865084 | 2.20742202 | 0.53935031 |
|
|
;; | :i | 0.49446615 | 0.12514693 | 2.20742202 | 0.53935031 |
|
|
;; | :i | 0.69925871 | 0.14210092 | 2.20742202 | 0.53935031 |
|
|
;; | :i | 0.77344282 | 0.86134163 | 2.20742202 | 0.53935031 |
|
|
;; | :i | 0.56868174 | 2.86614456 | 2.20742202 | 0.53935031 |
|
|
;; | :i | 0.37321600 | 4.43636290 | 2.20742202 | 0.53935031 |
|
|
;; | ... | ... | ... | ... | ... |
|
|
;; | :p | 0.13506468 | 1.93568665 | 2.69662835 | 0.50979658 |
|
|
;; | :p | 0.23051436 | 4.22573832 | 2.69662835 | 0.50979658 |
|
|
;; | :p | 0.22231685 | 3.38299530 | 2.69662835 | 0.50979658 |
|
|
;; | :p | 0.98586194 | 0.82699630 | 2.69662835 | 0.50979658 |
|
|
;; | :p | 0.82490726 | 2.79908028 | 2.69662835 | 0.50979658 |
|
|
;; | :p | 0.35192255 | 1.06928476 | 2.69662835 | 0.50979658 |
|
|
;; | :p | 0.79226756 | 2.31301546 | 2.69662835 | 0.50979658 |
|
|
;; | :p | 0.11991132 | 2.11139863 | 2.69662835 | 0.50979658 |
|
|
;; | :p | 0.06499388 | 2.39519054 | 2.69662835 | 0.50979658 |
|
|
;; | :p | 0.33801667 | 4.25192711 | 2.69662835 | 0.50979658 |
|
|
;; | :p | 0.95894755 | 4.35116263 | 2.69662835 | 0.50979658 |
|
|
|
|
|
|
(tc/add-columns ds {:mean-a (dfn/mean (:a ds))
|
|
:mean-b (dfn/mean (:b ds))})
|
|
|
|
;; => _unnamed [100 5]:
|
|
;; | :group | :a | :b | :mean-a | :mean-b |
|
|
;; |--------|-----------:|-----------:|-----------:|-----------:|
|
|
;; | :i | 0.21923487 | 0.02757318 | 0.52913219 | 2.43060934 |
|
|
;; | :o | 0.47121141 | 1.03039666 | 0.52913219 | 2.43060934 |
|
|
;; | :i | 0.26676569 | 4.16760503 | 0.52913219 | 2.43060934 |
|
|
;; | :u | 0.38287395 | 2.24760673 | 0.52913219 | 2.43060934 |
|
|
;; | :o | 0.96585848 | 0.01097307 | 0.52913219 | 2.43060934 |
|
|
;; | :o | 0.88531448 | 4.46218351 | 0.52913219 | 2.43060934 |
|
|
;; | :p | 0.19173693 | 3.57395669 | 0.52913219 | 2.43060934 |
|
|
;; | :p | 0.53615392 | 2.74081683 | 0.52913219 | 2.43060934 |
|
|
;; | :p | 0.43978083 | 4.67902393 | 0.52913219 | 2.43060934 |
|
|
;; | :p | 0.72916344 | 1.01818710 | 0.52913219 | 2.43060934 |
|
|
;; | ... | ... | ... | ... | ... |
|
|
;; | :o | 0.75565644 | 0.38405762 | 0.52913219 | 2.43060934 |
|
|
;; | :p | 0.06499388 | 2.39519054 | 0.52913219 | 2.43060934 |
|
|
;; | :p | 0.33801667 | 4.25192711 | 0.52913219 | 2.43060934 |
|
|
;; | :u | 0.76001839 | 3.49197309 | 0.52913219 | 2.43060934 |
|
|
;; | :u | 0.71527541 | 1.33661718 | 0.52913219 | 2.43060934 |
|
|
;; | :u | 0.46813365 | 3.31539727 | 0.52913219 | 2.43060934 |
|
|
;; | :o | 0.90347413 | 0.59158482 | 0.52913219 | 2.43060934 |
|
|
;; | :i | 0.40986820 | 1.26976567 | 0.52913219 | 2.43060934 |
|
|
;; | :u | 0.62212172 | 2.57981844 | 0.52913219 | 2.43060934 |
|
|
;; | :u | 0.77436201 | 1.90787845 | 0.52913219 | 2.43060934 |
|
|
;; | :p | 0.95894755 | 4.35116263 | 0.52913219 | 2.43060934 |
|
|
|
|
(def ds (tc/dataset {:a [1 2 3]
|
|
:b [99 98 97]
|
|
:c [:r :t :y]}))
|
|
|
|
(tc/rows ds) ;; returns vector of rows as vectors
|
|
;; => [[1 99 :r] [2 98 :t] [3 97 :y]]
|
|
(tc/rows ds :as-maps) ;; returns vector of rows as columns
|
|
;; => [{:a 1, :b 99, :c :r} {:a 2, :b 98, :c :t} {:a 3, :b 97, :c :y}]
|
|
|
|
(tc/columns ds) ;; returns sequence of columns
|
|
;; => [#tech.v3.dataset.column<int64>[3]
|
|
;; :a
|
|
;; [1, 2, 3] #tech.v3.dataset.column<int64>[3]
|
|
;; :b
|
|
;; [99, 98, 97] #tech.v3.dataset.column<keyword>[3]
|
|
;; :c
|
|
;; [:r, :t, :y]]
|
|
|
|
(tc/column ds :c) ;; returns column
|
|
;; => #tech.v3.dataset.column<keyword>[3]
|
|
;; :c
|
|
;; [:r, :t, :y]
|
|
|
|
;; column is a sequence
|
|
(seqable? (tc/column ds :c)) ;; => true
|
|
(first (tc/column ds :c)) ;; => :r
|
|
|
|
;; column is a vector
|
|
(sequential? (tc/column ds :c)) ;; => true
|
|
((tc/column ds :c) 0) ;; => :r
|
|
|
|
(def DS (tc/dataset {:V1 (take 9 (cycle [1 2]))
|
|
:V2 (range 1 10)
|
|
:V3 (take 9 (cycle [0.5 1.0 1.5]))
|
|
:V4 (take 9 (cycle ["A" "B" "C"]))}))
|
|
|
|
|
|
(-> (tc/group-by DS {:z [1 2 3]
|
|
:b [0 1 2]})
|
|
(tc/ungroup))
|
|
;; => _unnamed [2 3]:
|
|
;; | :name | :group-id | :data |
|
|
;; |---------|----------:|-----------------------|
|
|
;; | {:V1 1} | 0 | Group: {:V1 1} [5 4]: |
|
|
;; | {:V1 2} | 1 | Group: {:V1 2} [4 4]: |
|
|
|
|
(tc/map-rows DS (fn [{:keys [V1 V2]}] {:V1 0
|
|
:V5 (/ (+ V1 V2) (double V2))}))
|
|
|
|
(-> (tc/group-by DS [:V3])
|
|
(tc/map-rows (fn [{:keys [V1 V2]}] {:V1 0
|
|
:V5 (/ (+ V1 V2) (double V2))}))
|
|
(tc/ungroup))
|
|
|
|
|
|
(tc/separate-column DS :V3 (fn [^double v]
|
|
[(int (quot v 1.0))
|
|
(mod v 1.0)]))
|
|
|
|
(-> (tc/dataset {:x [1] :y [[2 3 9 10 11 22 33]]})
|
|
(tc/separate-column :y))
|
|
;; => _unnamed [1 8]:
|
|
;; | :x | :y-0 | :y-1 | :y-2 | :y-3 | :y-4 | :y-5 | :y-6 |
|
|
;; |---:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|
|
|
;; | 1 | 2 | 3 | 9 | 10 | 11 | 22 | 33 |
|
|
|
|
(-> (tc/dataset {:x [1] :y [[2 3 9 10 11 22 33]]})
|
|
(tc/separate-column :y reverse))
|
|
;; => _unnamed [1 8]:
|
|
;; | :x | :y-0 | :y-1 | :y-2 | :y-3 | :y-4 | :y-5 | :y-6 |
|
|
;; |---:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|
|
|
;; | 1 | 33 | 22 | 11 | 10 | 9 | 3 | 2 |
|
|
|
|
(-> (tc/dataset {:x [1] :y [[2 3 9 10 11 22 33]]})
|
|
(tc/separate-column :y (fn [input]
|
|
(zipmap "somenames" input))))
|
|
;; => _unnamed [1 7]:
|
|
;; | :x | a | s | e | m | n | o |
|
|
;; |---:|---:|--:|---:|---:|---:|--:|
|
|
;; | 1 | 22 | 2 | 10 | 33 | 11 | 3 |
|
|
|
|
|
|
;;
|
|
|
|
|
|
(let [col-names (map (partial str "row-") (range 10))]
|
|
(-> (tc/dataset {:x (range 10)
|
|
:y (range 100 110)
|
|
:z (seq "abcdefghij")})
|
|
(tc/rows)
|
|
(->> (zipmap col-names))
|
|
(tc/dataset)
|
|
(tc/select-columns col-names)))
|
|
|
|
;; => _unnamed [3 10]:
|
|
;; | row-0 | row-1 | row-2 | row-3 | row-4 | row-5 | row-6 | row-7 | row-8 | row-9 |
|
|
;; |-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|
|
|
;; | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
|
|
;; | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 |
|
|
;; | a | b | c | d | e | f | g | h | i | j |
|
|
|
|
(defonce stocks (tc/dataset "https://raw.githubusercontent.com/techascent/tech.ml.dataset/master/test/data/stocks.csv" {:key-fn keyword}))
|
|
|
|
(def price-index (-> (->> (stocks :price)
|
|
(map-indexed vector)
|
|
(group-by second))
|
|
(update-vals (partial map first))
|
|
(java.util.TreeMap.)))
|
|
|
|
;; selection
|
|
|
|
(-> stocks
|
|
(tc/select-rows (->> (.subMap price-index 10.0 true 20.0 false) ;; select range <10,20)
|
|
(.values) ;; get indices
|
|
(mapcat identity))))
|
|
|
|
;; => https://raw.githubusercontent.com/techascent/tech.ml.dataset/master/test/data/stocks.csv [61 3]:
|
|
;; | :symbol | :date | :price |
|
|
;; |---------|------------|-------:|
|
|
;; | AMZN | 2001-02-01 | 10.19 |
|
|
;; | AMZN | 2001-03-01 | 10.23 |
|
|
;; | AAPL | 2003-09-01 | 10.36 |
|
|
;; | AAPL | 2003-11-01 | 10.45 |
|
|
;; | AAPL | 2003-07-01 | 10.54 |
|
|
;; | AAPL | 2001-11-01 | 10.65 |
|
|
;; | AAPL | 2003-12-01 | 10.69 |
|
|
;; | AAPL | 2001-01-01 | 10.81 |
|
|
;; | AMZN | 2001-12-01 | 10.82 |
|
|
;; | AAPL | 2002-02-01 | 10.85 |
|
|
;; | ... | ... | ... |
|
|
;; | AMZN | 2002-12-01 | 18.89 |
|
|
;; | MSFT | 2008-12-01 | 18.91 |
|
|
;; | MSFT | 2003-01-01 | 19.31 |
|
|
;; | MSFT | 2003-02-01 | 19.34 |
|
|
;; | AMZN | 2002-10-01 | 19.36 |
|
|
;; | AAPL | 2004-09-01 | 19.38 |
|
|
;; | MSFT | 2002-07-01 | 19.52 |
|
|
;; | MSFT | 2008-11-01 | 19.66 |
|
|
;; | MSFT | 2003-03-01 | 19.76 |
|
|
;; | MSFT | 2009-04-01 | 19.84 |
|
|
;; | MSFT | 2002-08-01 | 19.97 |
|
|
|
|
(tc/aggregate-columns stocks count)
|
|
|
|
(ds/descriptive-stats stocks)
|
|
|
|
;; the exception around `tech.v3.dataset.format-sequence/formatter`
|
|
|
|
(def stats (ds/descriptive-stats stocks))
|
|
(seq stats)
|
|
|
|
;; => ([:col-name #tech.v3.dataset.column<keyword>[3]
|
|
;; :col-name
|
|
;; [:date, :price, :symbol]]
|
|
;; [:datatype #tech.v3.dataset.column<keyword>[3]
|
|
;; :datatype
|
|
;; [:packed-local-date, :float64, :string]]
|
|
;; [:n-valid #tech.v3.dataset.column<int64>[3]
|
|
;; :n-valid
|
|
;; [560, 560, 560]]
|
|
;; [:n-missing #tech.v3.dataset.column<int64>[3]
|
|
;; :n-missing
|
|
;; [0, 0, 0]]
|
|
;; [:min #tech.v3.dataset.column<object>[3]
|
|
;; :min
|
|
;; [2000-01-01, 5.970, ]]
|
|
;; [:mean #tech.v3.dataset.column<object>[3]
|
|
;; :mean
|
|
;; [2005-05-12, 100.7, ]]
|
|
;; [:mode #tech.v3.dataset.column<string>[3]
|
|
;; :mode
|
|
;; [MSFT]]
|
|
;; [:max #tech.v3.dataset.column<object>[3]
|
|
;; :max
|
|
;; [2010-03-01, 707.0, ]]
|
|
;; [:standard-deviation #tech.v3.dataset.column<float64>[3]
|
|
;; :standard-deviation
|
|
;; [9.250E+10, 132.6, ]]
|
|
;; [:skew #tech.v3.dataset.column<float64>[3]
|
|
;; :skew
|
|
;; [-0.1389, 2.413, ]]
|
|
;; [:first #tech.v3.dataset.column<object>[3]
|
|
;; :first
|
|
;; [2000-01-01, 39.81, MSFT]]
|
|
;; [:last #tech.v3.dataset.column<object>[3]
|
|
;; :last
|
|
;; [2010-03-01, 223.0, AAPL]])
|
|
|
|
;;
|
|
|
|
(tc/by-rank DS :V3 zero?) ;; most V3 values
|
|
|
|
(tc/by-rank DS :V3 zero? {:desc? false}) ;; least V3 values
|
|
|
|
(tc/by-rank DS :V3 zero? {:desc? false}) ;; least V3 values
|
|
|
|
(tc/by-rank DS [:V1 :V3] zero? {:desc? false})
|
|
|
|
(defonce flights (tc/dataset "https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv"))
|
|
|
|
(tc/select-rows flights #(> (get % "dep_delay") 1000))
|
|
;; => https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv [8 11]:
|
|
;; | year | month | day | dep_delay | arr_delay | carrier | origin | dest | air_time | distance | hour |
|
|
;; |-----:|------:|----:|----------:|----------:|---------|--------|------|---------:|---------:|-----:|
|
|
;; | 2014 | 2 | 15 | 1003 | 994 | DL | JFK | DEN | 242 | 1626 | 12 |
|
|
;; | 2014 | 2 | 21 | 1014 | 1007 | DL | JFK | MCO | 139 | 944 | 8 |
|
|
;; | 2014 | 4 | 15 | 1241 | 1223 | AA | JFK | BOS | 39 | 187 | 13 |
|
|
;; | 2014 | 6 | 13 | 1071 | 1064 | AA | EWR | DFW | 175 | 1372 | 10 |
|
|
;; | 2014 | 6 | 16 | 1022 | 1073 | AA | EWR | DFW | 178 | 1372 | 7 |
|
|
;; | 2014 | 7 | 14 | 1087 | 1090 | DL | EWR | ATL | 97 | 746 | 8 |
|
|
;; | 2014 | 9 | 12 | 1056 | 1115 | AA | EWR | DFW | 198 | 1372 | 6 |
|
|
;; | 2014 | 10 | 4 | 1498 | 1494 | AA | EWR | DFW | 200 | 1372 | 7 |
|
|
|
|
(tc/select-rows flights #(> (get % "dep_delay") 1000) {:result-type :as-indexes})
|
|
;; => (32306 37131 82591 131877 134039 158830 211512 230042)
|
|
|
|
(tc/select-rows flights '(32306 37131 82591 131877 134039 158830 211512 230042))
|
|
;; => https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv [8 11]:
|
|
;; | year | month | day | dep_delay | arr_delay | carrier | origin | dest | air_time | distance | hour |
|
|
;; |-----:|------:|----:|----------:|----------:|---------|--------|------|---------:|---------:|-----:|
|
|
;; | 2014 | 2 | 15 | 1003 | 994 | DL | JFK | DEN | 242 | 1626 | 12 |
|
|
;; | 2014 | 2 | 21 | 1014 | 1007 | DL | JFK | MCO | 139 | 944 | 8 |
|
|
;; | 2014 | 4 | 15 | 1241 | 1223 | AA | JFK | BOS | 39 | 187 | 13 |
|
|
;; | 2014 | 6 | 13 | 1071 | 1064 | AA | EWR | DFW | 175 | 1372 | 10 |
|
|
;; | 2014 | 6 | 16 | 1022 | 1073 | AA | EWR | DFW | 178 | 1372 | 7 |
|
|
;; | 2014 | 7 | 14 | 1087 | 1090 | DL | EWR | ATL | 97 | 746 | 8 |
|
|
;; | 2014 | 9 | 12 | 1056 | 1115 | AA | EWR | DFW | 198 | 1372 | 6 |
|
|
;; | 2014 | 10 | 4 | 1498 | 1494 | AA | EWR | DFW | 200 | 1372 | 7 |
|
|
|
|
(tc/select-rows flights #(> (get % "dep_delay") 1000) {:pre {:mean #(dfn/mean (get % "dep_delay"))}})
|
|
|
|
(->> flights
|
|
(ds/mapseq-reader)
|
|
(aop/argfilter (comp boolean (fn [row] (and (= (get row "origin") "JFK")
|
|
(= (get row "month") 6))))))
|
|
|
|
|
|
|
|
;;
|
|
|
|
|
|
(tc/aggregate DS #(reduce + (% :V2)))
|
|
|
|
(tc/aggregate DS {:sum-of-V2 #(reduce + (% :V2))})
|
|
|
|
(tc/aggregate DS #(take 5 (% :V2)))
|
|
|
|
(tc/aggregate DS [#(take 3 (% :V2))
|
|
(fn [ds] {:sum-v1 (reduce + (ds :V1))
|
|
:prod-v3 (reduce * (ds :V3))})] {:default-column-name-prefix "V2-value"})
|
|
|
|
|
|
(require '[tech.v3.dataset :as ds])
|
|
(require '[tech.v3.datatype.datetime :as dtype-dt])
|
|
|
|
(def stocks' (-> stocks
|
|
(ds/update-column :date #(dtype-dt/datetime->epoch :epoch-days %))))
|
|
|
|
(require '[tech.v3.dataset.reductions :as ds-reduce])
|
|
|
|
(ds-reduce/group-by-column-agg
|
|
:symbol
|
|
{
|
|
:something (fn [a b] a b)
|
|
:price-avg (ds-reduce/mean :price)
|
|
:price-sum (ds-reduce/sum :price)
|
|
:price-med (ds-reduce/prob-median :price)}
|
|
(repeat 3 stocks'))
|
|
|
|
|
|
(def DSm2 (tc/dataset {:a [nil nil nil 1.0 2 nil nil nil nil nil 4 nil 11 nil nil]
|
|
:b [2 2 2 nil nil nil nil nil nil 13 nil 3 4 5 5]}))
|
|
|
|
DSm2
|
|
;; => _unnamed [15 2]:
|
|
;; | :a | :b |
|
|
;; |-----:|---:|
|
|
;; | | 2 |
|
|
;; | | 2 |
|
|
;; | | 2 |
|
|
;; | 1.0 | |
|
|
;; | 2.0 | |
|
|
;; | | |
|
|
;; | | |
|
|
;; | | |
|
|
;; | | |
|
|
;; | | 13 |
|
|
;; | 4.0 | |
|
|
;; | | 3 |
|
|
;; | 11.0 | 4 |
|
|
;; | | 5 |
|
|
;; | | 5 |
|
|
|
|
;; indexes of missing values
|
|
(col/missing (DSm2 :a)) ;; => {0,1,2,5,6,7,8,9,11,13,14}
|
|
(col/missing (DSm2 :b)) ;; => {3,4,5,6,7,8,10}
|
|
|
|
(class (col/missing (DSm2 :a))) ;; => org.roaringbitmap.RoaringBitmap
|
|
|
|
;; index of the nearest non-missing value in column `:a` starting from 0
|
|
(.nextAbsentValue (col/missing (DSm2 :a)) 0) ;; => 3
|
|
;; there is no previous non-missing
|
|
(.previousAbsentValue (col/missing (DSm2 :a)) 0) ;; => -1
|
|
|
|
;; replace some missing values by hand
|
|
(tc/replace-missing DSm2 :a :value {0 100 1 -100 14 -1000})
|
|
;; => _unnamed [15 2]:
|
|
;; | :a | :b |
|
|
;; |--------:|---:|
|
|
;; | 100.0 | 2 |
|
|
;; | -100.0 | 2 |
|
|
;; | | 2 |
|
|
;; | 1.0 | |
|
|
;; | 2.0 | |
|
|
;; | | |
|
|
;; | | |
|
|
;; | | |
|
|
;; | | |
|
|
;; | | 13 |
|
|
;; | 4.0 | |
|
|
;; | | 3 |
|
|
;; | 11.0 | 4 |
|
|
;; | | 5 |
|
|
;; | -1000.0 | 5 |
|
|
|
|
;;
|
|
|
|
(let [ds (ds/->dataset {:foo (range 0 5)
|
|
:bar (repeatedly #(rand-int 100))
|
|
:baz (repeatedly #(rand-int 100))})]
|
|
(ds/add-or-update-column ds :quz (apply col/column-map
|
|
(fn [foo bar baz]
|
|
(if (zero? (mod (+ foo bar baz) 7)) "mod 7" "not mod 7"))
|
|
nil (ds/columns ds))))
|
|
|
|
|
|
|
|
(let [ds (ds/->dataset {:foo (range 0 5)
|
|
:bar (repeatedly #(rand-int 100))})]
|
|
(ds/add-or-update-column ds :quz (apply col/column-map
|
|
(fn [foo bar]
|
|
(if (zero? (mod (+ foo bar) 7)) "mod 7" "not mod 7"))
|
|
nil (ds/columns ds))))
|
|
;; => _unnamed [5 3]:
|
|
;; | :foo | :bar | :quz |
|
|
;; |-----:|-----:|-----------|
|
|
;; | 0 | 63 | mod 7 |
|
|
;; | 1 | 20 | mod 7 |
|
|
;; | 2 | 15 | not mod 7 |
|
|
;; | 3 | 85 | not mod 7 |
|
|
;; | 4 | 46 | not mod 7 |
|
|
|
|
|
|
;;
|
|
|
|
|
|
(def ds (ds/->dataset [{"DestinationName" "CZ_1", "ProductName" "FG_1", "Quantity" 100, "Allocated-Qty" 0, "SourceName" "DC_1", :ratio 0.5} {"DestinationName" "CZ_1", "ProductName" "FG_1", "Quantity" 100, "Allocated-Qty" 0, "SourceName" "DC_2", :ratio 0.5}]))
|
|
|
|
;; => _unnamed [2 6]:
|
|
;; | DestinationName | ProductName | Quantity | Allocated-Qty | SourceName | :ratio |
|
|
;; |-----------------|-------------|---------:|--------------:|------------|-------:|
|
|
;; | CZ_1 | FG_1 | 100 | 0 | DC_1 | 0.5 |
|
|
;; | CZ_1 | FG_1 | 100 | 0 | DC_2 | 0.5 |
|
|
|
|
(ds/concat ds ds)
|
|
|
|
;; => _unnamed [4 6]:
|
|
;; | DestinationName | ProductName | Quantity | Allocated-Qty | SourceName | :ratio |
|
|
;; |-----------------|-------------|---------:|--------------:|------------|-------:|
|
|
;; | CZ_1 | FG_1 | 100 | 0 | DC_1 | 0.5 |
|
|
;; | CZ_1 | FG_1 | 100 | 0 | DC_2 | 0.5 |
|
|
;; | CZ_1 | FG_1 | 100 | 0 | DC_1 | 0.5 |
|
|
;; | CZ_1 | FG_1 | 100 | 0 | DC_2 | 0.5 |
|
|
|
|
(ds/concat-copying ds ds)
|
|
|
|
;; => _unnamed [4 6]:
|
|
;; | DestinationName | ProductName | Quantity | Allocated-Qty | SourceName | :ratio |
|
|
;; |-----------------|-------------|---------:|--------------:|------------|-------:|
|
|
;; | CZ_1 | FG_1 | 100 | 0 | DC_1 | 0.5 |
|
|
;; | CZ_1 | FG_1 | 100 | 0 | DC_2 | 0.5 |
|
|
;; | CZ_1 | FG_1 | 100 | 0 | DC_1 | 0.5 |
|
|
;; | CZ_1 | FG_1 | 100 | 0 | DC_2 | 0.5 |
|
|
|
|
|
|
|
|
;;
|
|
|
|
(def joinrds (tc/dataset {:a (range 1 7)
|
|
:b (range 7 13)
|
|
:c (range 13 19)
|
|
:ID (seq "bbbaac")
|
|
:d ["hello" "hello" "hello" "water" "water" "world"]}))
|
|
|
|
;; => _unnamed [6 5]:
|
|
;; | :a | :b | :c | :ID | :d |
|
|
;; |---:|---:|---:|-----|-------|
|
|
;; | 1 | 7 | 13 | b | hello |
|
|
;; | 2 | 8 | 14 | b | hello |
|
|
;; | 3 | 9 | 15 | b | hello |
|
|
;; | 4 | 10 | 16 | a | water |
|
|
;; | 5 | 11 | 17 | a | water |
|
|
;; | 6 | 12 | 18 | c | world |
|
|
|
|
(-> joinrds
|
|
(tc/group-by [:ID :d])
|
|
(tc/ungroup))
|
|
|
|
;; => _unnamed [6 5]:
|
|
;; | :a | :b | :c | :ID | :d |
|
|
;; |---:|---:|---:|-----|-------|
|
|
;; | 1 | 7 | 13 | b | hello |
|
|
;; | 2 | 8 | 14 | b | hello |
|
|
;; | 3 | 9 | 15 | b | hello |
|
|
;; | 4 | 10 | 16 | a | water |
|
|
;; | 5 | 11 | 17 | a | water |
|
|
;; | 6 | 12 | 18 | c | world |
|
|
|
|
(-> joinrds
|
|
(tc/group-by [:ID :d])
|
|
(tc/ungroup {:add-group-id-as-column :blah}))
|
|
|
|
;; => _unnamed [6 6]:
|
|
;; | :blah | :a | :b | :c | :ID | :d |
|
|
;; |------:|---:|---:|---:|-----|-------|
|
|
;; | 0 | 1 | 7 | 13 | b | hello |
|
|
;; | 0 | 2 | 8 | 14 | b | hello |
|
|
;; | 0 | 3 | 9 | 15 | b | hello |
|
|
;; | 1 | 4 | 10 | 16 | a | water |
|
|
;; | 1 | 5 | 11 | 17 | a | water |
|
|
;; | 2 | 6 | 12 | 18 | c | world |
|
|
|
|
(-> joinrds
|
|
(tc/group-by [:ID :d])
|
|
(tc/unmark-group))
|
|
|
|
;; => _unnamed [3 3]:
|
|
;; | :name | :group-id | :data |
|
|
;; |----------------------|----------:|------------------------------------|
|
|
;; | {:ID \b, :d "hello"} | 0 | Group: {:ID \b, :d "hello"} [3 5]: |
|
|
;; | {:ID \a, :d "water"} | 1 | Group: {:ID \a, :d "water"} [2 5]: |
|
|
;; | {:ID \c, :d "world"} | 2 | Group: {:ID \c, :d "world"} [1 5]: |
|
|
|
|
(-> joinrds
|
|
(tc/group-by [:ID :d])
|
|
(tc/ungroup {:add-group-as-column :blah
|
|
:separate? false}))
|
|
|
|
;; => _unnamed [6 6]:
|
|
;; | :blah | :a | :b | :c | :ID | :d |
|
|
;; |----------------------|---:|---:|---:|-----|-------|
|
|
;; | {:ID \b, :d "hello"} | 1 | 7 | 13 | b | hello |
|
|
;; | {:ID \b, :d "hello"} | 2 | 8 | 14 | b | hello |
|
|
;; | {:ID \b, :d "hello"} | 3 | 9 | 15 | b | hello |
|
|
;; | {:ID \a, :d "water"} | 4 | 10 | 16 | a | water |
|
|
;; | {:ID \a, :d "water"} | 5 | 11 | 17 | a | water |
|
|
;; | {:ID \c, :d "world"} | 6 | 12 | 18 | c | world |
|
|
|
|
|
|
(-> joinrds
|
|
(tc/group-by :ID)
|
|
(tc/ungroup {:add-group-as-column :blah}))
|
|
|
|
;; => _unnamed [6 6]:
|
|
;; | :blah | :a | :b | :c | :ID | :d |
|
|
;; |-------|---:|---:|---:|-----|-------|
|
|
;; | b | 1 | 7 | 13 | b | hello |
|
|
;; | b | 2 | 8 | 14 | b | hello |
|
|
;; | b | 3 | 9 | 15 | b | hello |
|
|
;; | a | 4 | 10 | 16 | a | water |
|
|
;; | a | 5 | 11 | 17 | a | water |
|
|
;; | c | 6 | 12 | 18 | c | world |
|
|
|
|
|
|
(-> joinrds
|
|
(tc/group-by (juxt :ID :d))
|
|
(tc/unmark-group))
|
|
|
|
;; => _unnamed [3 3]:
|
|
;; | :name | :group-id | :data |
|
|
;; |--------------|----------:|----------------------------|
|
|
;; | [\b "hello"] | 0 | Group: [\b "hello"] [3 5]: |
|
|
;; | [\a "water"] | 1 | Group: [\a "water"] [2 5]: |
|
|
;; | [\c "world"] | 2 | Group: [\c "world"] [1 5]: |
|
|
|
|
(-> joinrds
|
|
(tc/group-by (juxt :ID :d))
|
|
(tc/ungroup {:add-group-as-column :blah}))
|
|
|
|
;; => _unnamed [6 7]:
|
|
;; | :blah-0 | :blah-1 | :a | :b | :c | :ID | :d |
|
|
;; |---------|---------|---:|---:|---:|-----|-------|
|
|
;; | b | hello | 1 | 7 | 13 | b | hello |
|
|
;; | b | hello | 2 | 8 | 14 | b | hello |
|
|
;; | b | hello | 3 | 9 | 15 | b | hello |
|
|
;; | a | water | 4 | 10 | 16 | a | water |
|
|
;; | a | water | 5 | 11 | 17 | a | water |
|
|
;; | c | world | 6 | 12 | 18 | c | world |
|
|
|
|
(-> joinrds
|
|
(tc/group-by (juxt :ID :d))
|
|
(tc/ungroup {:add-group-as-column :blah
|
|
:separate? false}))
|
|
|
|
;; => _unnamed [6 6]:
|
|
;; | :blah | :a | :b | :c | :ID | :d |
|
|
;; |--------------|---:|---:|---:|-----|-------|
|
|
;; | [\b "hello"] | 1 | 7 | 13 | b | hello |
|
|
;; | [\b "hello"] | 2 | 8 | 14 | b | hello |
|
|
;; | [\b "hello"] | 3 | 9 | 15 | b | hello |
|
|
;; | [\a "water"] | 4 | 10 | 16 | a | water |
|
|
;; | [\a "water"] | 5 | 11 | 17 | a | water |
|
|
;; | [\c "world"] | 6 | 12 | 18 | c | world |
|
|
|
|
|
|
(-> joinrds
|
|
(tc/group-by [:ID :d])
|
|
(tc/aggregate-columns [:a :b :c] dfn/mean))
|
|
|
|
;; => _unnamed [3 5]:
|
|
;; | :ID | :d | :a | :b | :c |
|
|
;; |-----|-------|----:|-----:|-----:|
|
|
;; | b | hello | 2.0 | 8.0 | 14.0 |
|
|
;; | a | water | 4.5 | 10.5 | 16.5 |
|
|
;; | c | world | 6.0 | 12.0 | 18.0 |
|
|
|
|
(-> joinrds
|
|
(tc/group-by [:ID :d])
|
|
(tc/aggregate-columns [:a :b :c] dfn/mean {:separate? false}))
|
|
|
|
;; => _unnamed [3 4]:
|
|
;; | :$group-name | :a | :b | :c |
|
|
;; |----------------------|----:|-----:|-----:|
|
|
;; | {:ID \b, :d "hello"} | 2.0 | 8.0 | 14.0 |
|
|
;; | {:ID \a, :d "water"} | 4.5 | 10.5 | 16.5 |
|
|
;; | {:ID \c, :d "world"} | 6.0 | 12.0 | 18.0 |
|
|
|
|
|
|
;; #108
|
|
|
|
(let [qids ["AGYSUB" "LOC" "AGELVL" "EDLVL"]
|
|
per-val (-> (tc/dataset "per-val.csv.gz")
|
|
(tc/set-dataset-name "per-val"))
|
|
per-qid (-> (tc/dataset "per-qid.csv.gz")
|
|
(tc/set-dataset-name "per-qid"))]
|
|
(-> (tc/left-join per-val per-qid qids)
|
|
(tc/select-rows (fn [row]
|
|
(or (not= (get row "per-qid.AGYSUB") (get row "AGYSUB"))
|
|
(not= (get row "per-qid.LOC") (get row "LOC"))
|
|
(not= (get row "per-qid.AGELVL") (get row "AGELVL"))
|
|
(not= (get row "per-qid.EDLVL") (get row "EDLVL")))))))
|
|
|
|
(ds/select-rows (ds/->dataset []) [0])
|
|
;; => _unnamed [0 0]
|
|
|
|
|
|
;;
|
|
|
|
(get-in (read-string (slurp "deps.edn")) [:deps 'techascent/tech.ml.dataset :mvn/version])
|
|
;; => "7.000-beta-50"
|
|
|
|
(nth (read-string (slurp "project.clj")) 2)
|
|
|
|
(def ds1 (tc/dataset {:a [1 2 1 2 3 4 nil nil 4]
|
|
:b (range 101 110)
|
|
:c (map str "abs tract")}))
|
|
(def ds2 (tc/dataset {:a [nil 1 2 5 4 3 2 1 nil]
|
|
:b (range 110 101 -1)
|
|
:c (map str "datatable")
|
|
:d (symbol "X")
|
|
:e [3 4 5 6 7 nil 8 1 1]}))
|
|
|
|
(tc/left-join ds1 ds2 :b {:hashing (fn [[v]] (mod v 5))})
|
|
|
|
(defn last-char
|
|
[ds]
|
|
(->> (ds/value-reader ds)
|
|
(map (comp last str first))))
|
|
|
|
(last-char ds1)
|
|
;; => (\1 \2 \3 \4 \5 \6 \7 \8 \9)
|
|
|
|
(last-char ds2)
|
|
;; => (\0 \9 \8 \7 \6 \5 \4 \3 \2)
|
|
|
|
(def new-ds1 (ds/add-or-update-column ds1 :z (last-char ds1)))
|
|
(def new-ds2 (ds/add-or-update-column ds2 :z (last-char ds2)))
|
|
|
|
(j/left-join :z new-ds1 new-ds2)
|
|
|
|
|
|
(def ds1 (ds/->dataset {:a '(\1 \2 \3 \4 \5 \6 \7 \8 \9)}))
|
|
(def ds2 (ds/->dataset {:a '(\0 \9 \8 \7 \6 \5 \4 \3 \2)}))
|
|
|
|
(ds1 :a)
|
|
;; => #tech.v3.dataset.column<char>[9]
|
|
;; :a
|
|
;; [1, 2, 3, 4, 5, 6, 7, 8, 9]
|
|
|
|
(ds2 :a)
|
|
;; => #tech.v3.dataset.column<char>[9]
|
|
;; :a
|
|
;; [0, 9, 8, 7, 6, 5, 4, 3, 2]
|
|
|
|
(j/left-join :a ds1 ds2)
|
|
|
|
(j/left-join :a ds1 ds1)
|
|
;; => left-outer-join [9 2]:
|
|
;; | :a | :right.a |
|
|
;; |----|----------|
|
|
;; | 1 | 1 |
|
|
;; | 2 | 2 |
|
|
;; | 3 | 3 |
|
|
;; | 4 | 4 |
|
|
;; | 5 | 5 |
|
|
;; | 6 | 6 |
|
|
;; | 7 | 7 |
|
|
;; | 8 | 8 |
|
|
;; | 9 | 9 |
|
|
|
|
|
|
(j/left-join :a ds2 ds2)
|
|
;; => left-outer-join [9 2]:
|
|
;; | :a | :right.a |
|
|
;; |----|----------|
|
|
;; | 0 | 0 |
|
|
;; | 9 | 9 |
|
|
;; | 8 | 8 |
|
|
;; | 7 | 7 |
|
|
;; | 6 | 6 |
|
|
;; | 5 | 5 |
|
|
;; | 4 | 4 |
|
|
;; | 3 | 3 |
|
|
;; | 2 | 2 |
|
|
|
|
|
|
(def billboard (tc/dataset "data/data/billboard/billboard.csv.gz"))
|
|
|
|
(with-open [io (-> (tio/input-stream "data.zip")
|
|
(java.util.zip.ZipInputStream.))]
|
|
(ds-io/str->file-info (.getName (.getNextEntry io))))
|
|
;; => {:gzipped? false, :file-type :unknown}
|
|
|
|
(zip/zipfile->dataset-seq "data/data.zip")
|
|
|
|
;;
|
|
|
|
(defn get-rand [n col] (repeatedly n #(rand-nth col)))
|
|
|
|
(def actions (tc/dataset {:campaign (get-rand 1000 [1000 1001 1002 1000 1000])
|
|
:click (get-rand 1000 [true false false false])
|
|
:skip (get-rand 1000 [true false])
|
|
:abandon (get-rand 1000 [true true true false])}))
|
|
|
|
(-> actions
|
|
(tc/convert-types :type/boolean :int16) ;; convert true/false to 1/0
|
|
(tc/group-by [:campaign])
|
|
(tc/aggregate {:impressions tc/row-count
|
|
:clicks #(dfn/sum (:click %))
|
|
:skips #(dfn/sum (:skip %))
|
|
:abandons #(dfn/sum (:abandon %))})
|
|
(tc/map-rows (fn [{:keys [clicks abandons impressions]}]
|
|
{:ctr% (* 100 (/ clicks impressions))
|
|
:atr% (* 100 (/ abandons impressions))})))
|
|
|
|
;; => _unnamed [3 7]:
|
|
;; | :campaign | :impressions | :clicks | :skips | :abandons | :ctr% | :atr% |
|
|
;; |----------:|-------------:|--------:|-------:|----------:|------------:|------------:|
|
|
;; | 1001 | 182 | 52.0 | 82.0 | 127.0 | 28.57142857 | 69.78021978 |
|
|
;; | 1000 | 609 | 157.0 | 314.0 | 450.0 | 25.77996716 | 73.89162562 |
|
|
;; | 1002 | 209 | 47.0 | 113.0 | 160.0 | 22.48803828 | 76.55502392 |
|
|
|
|
|
|
(-> actions
|
|
(tc/convert-types :type/boolean :int16) ;; convert true/false to 1/0
|
|
(tc/add-column :impressions 1) ;; add artificial column filled with '1'
|
|
(tc/group-by [:campaign])
|
|
(tc/aggregate-columns [:impressions :click :skip :abandon] dfn/sum) ;; just sum selected columns
|
|
(tc/map-rows (fn [{:keys [click abandon impressions]}] ;; calculate
|
|
{:ctr% (* 100 (/ click impressions))
|
|
:atr% (* 100 (/ abandon impressions))}))
|
|
(tc/convert-types [:impressions :click :skip :abandon] :int32)
|
|
(tc/rename-columns {:click :clicks :skip :skips :abandon :abandons}))
|
|
|
|
;; => _unnamed [3 7]:
|
|
;; | :campaign | :impressions | :clicks | :skips | :abandons | :ctr% | :atr% |
|
|
;; |----------:|-------------:|--------:|-------:|----------:|------------:|------------:|
|
|
;; | 1001 | 182 | 52 | 82 | 127 | 28.57142857 | 69.78021978 |
|
|
;; | 1000 | 609 | 157 | 314 | 450 | 25.77996716 | 73.89162562 |
|
|
;; | 1002 | 209 | 47 | 113 | 160 | 22.48803828 | 76.55502392 |
|
|
|
|
|
|
(def ds1 (tc/dataset [{:a 1 :b "test"} {:a 2 :b "hi"}]))
|
|
(def ds2 (tc/dataset [{:a 2 :b "hi"}]))
|
|
|
|
(tc/difference ds1 ds2)
|
|
;; => difference [1 2]:
|
|
;; | :a | :b |
|
|
;; |---:|------|
|
|
;; | 1 | test |
|
|
|
|
|
|
(def ds (tc/dataset {:a ["a" "" " " "b"]}))
|
|
|
|
ds
|
|
;; => _unnamed [4 1]:
|
|
;; | :a |
|
|
;; |----|
|
|
;; | a |
|
|
;; | |
|
|
;; | |
|
|
;; | b |
|
|
|
|
(tc/info ds)
|
|
;; => _unnamed: descriptive-stats [1 7]:
|
|
;; | :col-name | :datatype | :n-valid | :n-missing | :mode | :first | :last |
|
|
;; |-----------|-----------|---------:|-----------:|-------|--------|-------|
|
|
;; | :a | :string | 3 | 1 | a | a | b |
|
|
|
|
(tc/replace-missing ds :a :value "it was a missing value")
|
|
;; => _unnamed [4 1]:
|
|
;; | :a |
|
|
;; |------------------------|
|
|
;; | a |
|
|
;; | it was a missing value |
|
|
;; | |
|
|
;; | b |
|
|
|
|
|
|
(def ds (tc/dataset {:a [1 2 3 4]}))
|
|
(def c (ds :a))
|
|
|
|
c
|
|
;; => #tech.v3.dataset.column<int64>[4]
|
|
;; :a
|
|
;; [1, 2, 3, 4]
|
|
|
|
(associative? c) ;; => true
|
|
(assoc c 2 11) ;; => [1 2 11 4]
|
|
|
|
(sequential? c) ;; => true
|
|
(seqable? c) ;; => true
|
|
(seq c) ;; => (1 2 3 4)
|
|
|
|
(reversible? c) ;; => true
|
|
(reverse c) ;; => (4 3 2 1)
|
|
|
|
(indexed? c) ;; => true
|
|
(c 2) ;; => 3
|
|
|
|
;; however
|
|
|
|
(vector? c) ;; => false
|
|
(seq? c) ;; => false
|
|
|
|
|
|
(-> {:a [1 nil 2]
|
|
:b [3 4 nil]}
|
|
(tc/dataset)
|
|
(tc/rows :as-maps {:nil-missing? false}))
|
|
|
|
(->
|
|
(tablecloth.api/dataset {:a [1 2 3]})
|
|
(tablecloth.api/add-column :Survived [ "na"] :cycle)
|
|
:Survived)
|
|
|
|
(tech.v3.dataset/new-column :Survived [""])
|
|
;; => #tech.v3.dataset.column<boolean>[1]
|
|
;; :Survived
|
|
;; []
|
|
|
|
(tech.v3.dataset/new-column :Survived ["na"])
|
|
;; => #tech.v3.dataset.column<boolean>[1]
|
|
;; :Survived
|
|
;; []
|
|
|
|
(let [ds (tech.v3.dataset/->dataset {:Survived (tech.v3.dataset/new-column :Survived [""])})]
|
|
(apply tech.v3.dataset/concat (repeat 3 ds)))
|
|
;; => _unnamed [3 1]:
|
|
;; | :Survived |
|
|
;; |-----------|
|
|
;; | false |
|
|
;; | false |
|
|
;; | false |
|
|
|
|
(let [ds (tech.v3.dataset/->dataset {:Survived (tech.v3.dataset/new-column :Survived ["na"])})]
|
|
(apply tech.v3.dataset/concat (repeat 3 ds)))
|
|
;; => _unnamed [3 1]:
|
|
;; | :Survived |
|
|
;; |-----------|
|
|
;; | false |
|
|
;; | false |
|
|
;; | false |
|
|
|
|
(let [ds (tech.v3.dataset/new-dataset [(tech.v3.dataset/new-column :Survived [""])])]
|
|
(apply tech.v3.dataset/concat (repeat 3 ds)))
|
|
;; => _unnamed [3 1]:
|
|
;; | :Survived |
|
|
;; |-----------|
|
|
;; | |
|
|
;; | |
|
|
;; | |
|
|
|
|
|
|
(-> (tc/dataset [{:a "foo" :b true}
|
|
{:a "bar" :b false}])
|
|
(tc/join-columns :join-columns-string [:a :b] {:drop-columns? false})
|
|
(tc/join-columns :join-columns-map [:a :b] {:drop-columns? false
|
|
:result-type :map})
|
|
(tc/add-column :rows-as-maps (fn [ds] (-> ds
|
|
(tc/select-columns [:a :b])
|
|
(tc/rows :as-maps)))))
|
|
|
|
;; => _unnamed [2 5]:
|
|
;; | :a | :b | :join-columns-string | :join-columns-map | :rows-as-maps |
|
|
;; |-----|-------|----------------------|----------------------|----------------------|
|
|
;; | foo | true | foo-true | {:a "foo", :b true} | {:a "foo", :b true} |
|
|
;; | bar | false | bar-false | {:a "bar", :b false} | {:a "bar", :b false} |
|
|
|
|
(def foo (tc/dataset [{:a "bar" :b nil :r 1}
|
|
{:a "bar" :b nil :r 2}
|
|
{:a "baz" :b nil :r 3}]))
|
|
|
|
(-> foo
|
|
(tc/group-by [:a :b])
|
|
(tc/aggregate {:row-count tc/row-count}))
|
|
|
|
|
|
(tc/fold-by foo [:a :b])
|
|
|
|
(-> (tc/dataset [{:r nil :c "a" :x "2a"}
|
|
{:r 1 :c "b" :x "1b"}])
|
|
(tc/pivot->wider :c :x {:drop-missing? false}))
|
|
|
|
(require '[tech.v3.dataset :as ds]
|
|
'[tech.v3.dataset.column :as c]
|
|
'[tech.v3.dataset.join :as j]
|
|
)
|
|
|
|
(let [left (ds/->dataset {:a [nil 1.2]
|
|
:b [3 4]})
|
|
right (ds/->dataset {:a [nil 3.4]
|
|
:b [6 7]})]
|
|
(j/left-join :a left right))
|
|
;; => left-outer-join [2 4]:
|
|
;; | :a | :b | :right.a | :right.b |
|
|
;; |----|---:|---------:|---------:|
|
|
;; | | 3 | | 6 |
|
|
;; | 2 | 4 | | |
|
|
|
|
;; => left-outer-join [2 4]:
|
|
;; | :a | :b | :right.a | :right.b |
|
|
;; |---:|---:|---------:|---------:|
|
|
;; | | 3 | | |
|
|
;; | 2 | 4 | | |
|
|
|
|
(-> (tc/left-join (-> (tc/dataset [{:i "foo" :y 2022}]))
|
|
(-> (tc/dataset [{:i "foo" :y 2022 :s "2022"}
|
|
{:i "foo" :y 2023 :s "2023"}]))
|
|
[:i :y])
|
|
(tc/rows :as-maps))
|
|
|
|
(-> (tc/left-join (-> (tc/dataset [{:i "foo" :y 2022}])
|
|
(tc/convert-types {:y :int16}))
|
|
(-> (tc/dataset [{:i "foo" :y 2022 :s "2022"}
|
|
{:i "foo" :y 2023 :s "2023"}]))
|
|
[:i :y]))
|
|
|
|
(-> (tc/dataset [{:i "foo" :y 2022}])
|
|
(tc/convert-types {:y :int16})
|
|
:y first class)
|
|
|
|
|
|
(-> (j/left-join :z
|
|
(ds/->dataset [{:z ["foo" (short 2022)]}])
|
|
(ds/->dataset [{:z ["foo" (long 2022)] :s "2022"}
|
|
{:z ["foo" (long 2023)] :s "2023"}])))
|
|
;; => left-outer-join [2 3]:
|
|
;; | :z | :right.z | :s |
|
|
;; |--------------|--------------|------|
|
|
;; | ["foo" 2022] | ["foo" 2022] | 2022 |
|
|
;; | ["foo" 2022] | | |
|
|
|
|
(-> (j/left-join :z
|
|
(ds/->dataset [{:z (short 2022)}])
|
|
(ds/->dataset [{:z (long 2022) :s "2022"}
|
|
{:z (long 2023) :s "2023"}])))
|
|
|
|
;;
|
|
|
|
(def world-bank-pop (tc/dataset "data/world_bank_pop.csv.gz"))
|
|
|
|
(->> world-bank-pop
|
|
(tc/column-names)
|
|
(take 8)
|
|
(tc/select-columns world-bank-pop))
|
|
|
|
(def pop2 (tc/pivot->longer world-bank-pop (map str (range 2000 2018)) {:drop-missing? false
|
|
:target-columns ["year"]
|
|
:value-column-name "value"}))
|
|
|
|
pop2
|
|
|
|
|
|
(def pop3 (tc/separate-column pop2
|
|
"indicator" ["area" "variable"]
|
|
#(rest (clojure.string/split % #"\."))))
|
|
pop3
|
|
|
|
(tc/pivot->wider pop3 "variable" "value" {:drop-missing? false})
|
|
|
|
;;
|
|
|
|
(defn make-row [] {:row 1})
|
|
(let [left (ds/->dataset (repeatedly 10000 make-row))
|
|
right (ds/->dataset (repeatedly 1000 make-row))]
|
|
(j/left-join :row left right))
|
|
|
|
;;
|
|
|
|
(let [rows-fn tc/drop-rows
|
|
join-column-name [:k]
|
|
lj (tc/left-join (tc/dataset [{:k nil :v "\"nil\""}
|
|
{:k "bar" :v "\"bar\""}
|
|
{:k "baz" :v "\"baz\""}])
|
|
(tc/dataset [{:k "baz"}
|
|
{:k nil}])
|
|
join-column-name)
|
|
_ (println lj)
|
|
right-columns (:right-column-names (meta lj))
|
|
missing-column (col/missing (lj (right-columns (if (vector? join-column-name)
|
|
(first join-column-name)
|
|
join-column-name))))
|
|
left-column (col/missing (lj (if (vector? join-column-name)
|
|
(first join-column-name)
|
|
join-column-name)))]
|
|
[missing-column left-column (RoaringBitmap/andNot missing-column left-column)
|
|
(-> lj
|
|
(rows-fn (RoaringBitmap/andNot missing-column left-column))
|
|
(tc/drop-columns (vals right-columns))
|
|
(ds/unique-by identity) )])
|
|
|
|
|
|
(def ds1 (tc/dataset {:a [1 2 1 2 3 4 nil nil 4]
|
|
:b (range 101 110)
|
|
:c (map str "abs tract")}))
|
|
(def ds2 (tc/dataset {:a [nil 1 2 5 4 3 2 1 nil]
|
|
:b (range 110 101 -1)
|
|
:c (map str "datatable")
|
|
:d (symbol "X")
|
|
:e [3 4 5 6 7 nil 8 1 1]}))
|
|
|
|
(tc/inner-join ds1 ds2 :b)
|
|
|
|
|
|
(defn semi-join
|
|
[l r k]
|
|
(-> (tc/inner-join l r k)
|
|
(tc/select-columns (tc/column-names l))
|
|
(tc/unique-by)))
|
|
|
|
(let [ds1 (tc/dataset [{:k nil :v "\"nil\""}
|
|
{:k "baz" :v "\"bar\""}
|
|
{:k "baz" :v "\"bar\""}
|
|
{:k "bar" :v "1"}])
|
|
ds2 (tc/dataset [{:k "baz"}])]
|
|
(j/pd-merge ds1 ds2 {:on :k :how :outer}))
|
|
|
|
(let [ds1 (tc/dataset {:x1 [:A :B :C]
|
|
:x2 [1 2 3]})
|
|
ds2 (tc/dataset {:x1 [:A :B :D]
|
|
:x3 [true false true]})]
|
|
(tc/full-join ds1 ds2 :x1))
|
|
|
|
|
|
;; => left-outer-join [3 3]:
|
|
;; | :k | :v | :right.k |
|
|
;; |-----|-------|----------|
|
|
;; | baz | "baz" | baz |
|
|
;; | | "nil" | |
|
|
;; | bar | "bar" | |
|
|
|
|
|
|
(semi-join (tc/dataset [{:k nil :v "\"nil\""}
|
|
{:k "bar" :v "\"bar\""}
|
|
{:k "baz" :v "\"baz\""}])
|
|
(tc/dataset [{:qk "baz"}
|
|
])
|
|
[:k])
|
|
;; => left-outer-join [3 3]:
|
|
;; | :k | :v | :right.k |
|
|
;; |-----|-------|----------|
|
|
;; | baz | "baz" | baz |
|
|
;; | | "nil" | |
|
|
;; | bar | "bar" | |
|
|
|
|
(:lhs-indexes (j/hash-join :k (tc/dataset [{:k nil :v "\"nil\""}
|
|
{:k "bar" :v "\"bar\""}
|
|
{:k "baz" :v "\"baz\""}
|
|
{:k "baz" :v "ss"}
|
|
{:k "s" :v "d"}])
|
|
(tc/dataset [{:k "baz"}]) {:lhs-missing? true}))
|
|
|
|
|
|
(semi-join (tc/dataset [{:k nil :v "\"nil\""}
|
|
{:k "bar" :v "\"bar\""}
|
|
{:k "baz" :v "\"baz\""}])
|
|
(tc/dataset [{:k "baz"}
|
|
{:k "baz"}])
|
|
[:k])
|
|
|
|
|
|
(-> (tc/dataset {:a [0 0 1 2 0 0 0 0 1 2 0 0 10 0 0]})
|
|
(tc/map-columns :a #(when (not (zero? %)) %)) ;; make 0 as missing value
|
|
(tc/replace-missing :a :downup)) ;; replace missing, first drag values down then up
|
|
;; => _unnamed [15 1]:
|
|
;; | :a |
|
|
;; |---:|
|
|
;; | 1 |
|
|
;; | 1 |
|
|
;; | 1 |
|
|
;; | 2 |
|
|
;; | 2 |
|
|
;; | 2 |
|
|
;; | 2 |
|
|
;; | 2 |
|
|
;; | 1 |
|
|
;; | 2 |
|
|
;; | 2 |
|
|
;; | 2 |
|
|
;; | 10 |
|
|
;; | 10 |
|
|
;; | 10 |
|
|
|
|
(let [data (map (fn [v] (when (not (zero? v)) v)) [0 0 1 2 0 0 0 0 1 2 0 0 10 0 0])]
|
|
(-> (tc/dataset {:a data})
|
|
(tc/replace-missing :a :downup)))
|
|
;; => _unnamed [15 1]:
|
|
;; | :a |
|
|
;; |---:|
|
|
;; | 1 |
|
|
;; | 1 |
|
|
;; | 1 |
|
|
;; | 2 |
|
|
;; | 2 |
|
|
;; | 2 |
|
|
;; | 2 |
|
|
;; | 2 |
|
|
;; | 1 |
|
|
;; | 2 |
|
|
;; | 2 |
|
|
;; | 2 |
|
|
;; | 10 |
|
|
;; | 10 |
|
|
;; | 10 |
|
|
|
|
;;
|
|
|
|
(def exdata (tc/dataset {:cat1 (repeatedly 100 #(rand-nth [:a :b :c :c]))
|
|
:cat2 (repeatedly 100 #(rand-nth ["A" "B" "A"]))
|
|
:value (repeatedly 100 #(rand-nth (range 10)))}))
|
|
|
|
exdata
|
|
;; => _unnamed [100 3]:
|
|
;; | :cat1 | :cat2 | :value |
|
|
;; |-------|-------|-------:|
|
|
;; | :c | A | 6 |
|
|
;; | :a | B | 1 |
|
|
;; | :b | A | 2 |
|
|
;; | :b | B | 3 |
|
|
;; | :c | B | 9 |
|
|
;; | :a | A | 6 |
|
|
;; | :c | A | 3 |
|
|
;; | :c | A | 3 |
|
|
;; | :c | A | 8 |
|
|
;; | :a | A | 5 |
|
|
;; | ... | ... | ... |
|
|
;; | :a | B | 2 |
|
|
;; | :c | B | 5 |
|
|
;; | :b | A | 0 |
|
|
;; | :a | A | 3 |
|
|
;; | :b | A | 0 |
|
|
;; | :c | B | 6 |
|
|
;; | :b | A | 9 |
|
|
;; | :b | A | 5 |
|
|
;; | :b | B | 4 |
|
|
;; | :b | A | 5 |
|
|
;; | :c | A | 0 |
|
|
|
|
|
|
(-> exdata
|
|
(tc/group-by [:cat1 :cat2])
|
|
(tc/add-column :rank #(tablecloth.api.utils/rank (% :value) :max))
|
|
(tc/ungroup))
|
|
;; => _unnamed [100 4]:
|
|
;; | :cat1 | :cat2 | :value | :rank |
|
|
;; |-------|-------|-------:|------:|
|
|
;; | :c | A | 6 | 20 |
|
|
;; | :c | A | 3 | 10 |
|
|
;; | :c | A | 3 | 10 |
|
|
;; | :c | A | 8 | 26 |
|
|
;; | :c | A | 9 | 30 |
|
|
;; | :c | A | 8 | 26 |
|
|
;; | :c | A | 2 | 7 |
|
|
;; | :c | A | 9 | 30 |
|
|
;; | :c | A | 5 | 17 |
|
|
;; | :c | A | 7 | 23 |
|
|
;; | ... | ... | ... | ... |
|
|
;; | :a | A | 4 | 8 |
|
|
;; | :a | A | 9 | 15 |
|
|
;; | :a | A | 0 | 1 |
|
|
;; | :a | A | 5 | 10 |
|
|
;; | :a | A | 7 | 13 |
|
|
;; | :a | A | 7 | 13 |
|
|
;; | :a | A | 4 | 8 |
|
|
;; | :a | A | 2 | 3 |
|
|
;; | :a | A | 3 | 5 |
|
|
;; | :a | A | 1 | 2 |
|
|
;; | :a | A | 3 | 5 |
|
|
|
|
|
|
(tablecloth.api.utils/rank [1 2 2 3] :max);; => (0 2 2 3)
|
|
|
|
;;
|
|
|
|
(-> (expand-grid input)
|
|
(tc/group-by :sex)
|
|
(tc/reorder-columns :sex :weight)
|
|
(tc/ungroup))
|
|
|
|
|
|
|
|
|
|
(require '[clojure.data.json :as json])
|
|
|
|
(def ds-with-json (tc/dataset [{:x 1, :y 2 :properties "{\"a\": 1, \"b\": 1}"}
|
|
{:x 2, :y 3 :properties "{\"a\": 2, \"b\": 2}"}]))
|
|
|
|
|
|
(->> (:properties ds-with-json)
|
|
(map json/read-str)
|
|
(tc/dataset)
|
|
(tc/append (tc/drop-columns ds-with-json :properties)))
|
|
|
|
;; => _unnamed [2 4]:
|
|
;; | :x | :y | a | b |
|
|
;; |---:|---:|--:|--:|
|
|
;; | 1 | 2 | 1 | 1 |
|
|
;; | 2 | 3 | 2 | 2 |
|
|
|
|
;;
|
|
|
|
(def ds (tc/dataset {:a ["2.3300", "02.1100"]}))
|
|
|
|
(tc/info ds)
|
|
;; => _unnamed: descriptive-stats [1 7]:
|
|
;; | :col-name | :datatype | :n-valid | :n-missing | :mode | :first | :last |
|
|
;; |-----------|-----------|---------:|-----------:|--------|--------|---------|
|
|
;; | :a | :string | 2 | 0 | 2.3300 | 2.3300 | 02.1100 |
|
|
|
|
(tc/write-csv! ds "z.csv")
|
|
|
|
;; read as float64
|
|
(def read-ds (tc/dataset "z.csv"))
|
|
|
|
(tc/info read-ds)
|
|
;; => z.csv: descriptive-stats [1 11]:
|
|
;; | :col-name | :datatype | :n-valid | :n-missing | :min | :mean | :max | :standard-deviation | :skew | :first | :last |
|
|
;; |-----------|-----------|---------:|-----------:|-----:|------:|-----:|--------------------:|-------|-------:|------:|
|
|
;; | a | :float64 | 2 | 0 | 2.11 | 2.22 | 2.33 | 0.15556349 | | 2.33 | 2.11 |
|
|
|
|
;; read as string
|
|
(def read-ds2 (tc/dataset "z.csv" {:parser-fn {"a" :string}}))
|
|
|
|
(tc/info read-ds2)
|
|
;; => z.csv: descriptive-stats [1 7]:
|
|
;; | :col-name | :datatype | :n-valid | :n-missing | :mode | :first | :last |
|
|
;; |-----------|-----------|---------:|-----------:|--------|--------|---------|
|
|
;; | a | :string | 2 | 0 | 2.3300 | 2.3300 | 02.1100 |
|
|
|
|
read-ds2
|
|
;; => z.csv [2 1]:
|
|
;; | a |
|
|
;; |---------|
|
|
;; | 2.3300 |
|
|
;; | 02.1100 |
|
|
|
|
(def ds1 (tc/dataset {:x (range 10)}))
|
|
|
|
(def ds2
|
|
(time
|
|
(-> ds1
|
|
(tc/select-rows (fn [row]
|
|
(Thread/sleep 10)
|
|
(-> row :x (< 500)))
|
|
{:result-type :as-indexes}))))
|
|
|
|
(class ds2)
|
|
|
|
(defn- ->predicate
|
|
[f]
|
|
(reify java.util.function.Predicate
|
|
(test [_ v] (boolean (f v)))))
|
|
|
|
(def ds1 (tc/dataset {:x (range 10)}))
|
|
|
|
;; NOT LAZY
|
|
(def idxs1 (tech.v3.datatype.argops/argfilter (fn [row]
|
|
(println row)
|
|
(-> row :x (< 4)))
|
|
(ds/mapseq-reader ds1)))
|
|
|
|
;; LAZY
|
|
(def idxs2 (tech.v3.datatype.argops/argfilter (fn [row]
|
|
(println row)
|
|
(-> row :x (< 4)))
|
|
(map (partial zipmap [:x]) (:x ds1))))
|
|
|
|
|
|
(count idxs)
|
|
|
|
|
|
;;
|
|
|
|
|
|
(def ds (tc/dataset {:a []}))
|
|
|
|
ds
|
|
;; => _unnamed [0 1]:
|
|
;; | :a |
|
|
;; |----|
|
|
|
|
;; (tc/group-by ds [:a])
|
|
(-> (tc/dataset {:name [{:a 1}] :group-id [0] :data [ds]})
|
|
(tc/mark-as-group)
|
|
(tc/aggregate {:frequency tc/row-count}))
|
|
;; => _unnamed [1 2]:
|
|
;; | :a | :frequency |
|
|
;; |---:|-----------:|
|
|
;; | 1 | 0 |
|
|
|
|
;; (tc/group-by ds :a)
|
|
(-> (tc/dataset {:name [1] :group-id [0] :data [ds]})
|
|
(tc/mark-as-group)
|
|
(tc/aggregate {:frequency tc/row-count}))
|
|
;; => _unnamed [1 2]:
|
|
;; | :$group-name | :frequency |
|
|
;; |-------------:|-----------:|
|
|
;; | 1 | 0 |
|
|
|
|
|
|
(tc/group-by ds [:a])
|
|
|
|
(tc/aggregate ds {:frequency tc/row-count})
|
|
;; => _unnamed [1 1]:
|
|
;; | :frequency |
|
|
;; |-----------:|
|
|
;; | 0 |
|
|
|
|
;; => _unnamed [1 1]:
|
|
;; | :frequenct |
|
|
;; |-----------:|
|
|
;; | 0 |
|
|
|
|
(-> ds
|
|
(tc/group-by :a)
|
|
(tc/aggregate {:frequency tc/row-count}))
|
|
;; => _unnamed [0 0]
|
|
|
|
(-> ds
|
|
(tc/group-by [:b])
|
|
(tc/aggregate {:frequency tc/row-count}))
|
|
;; => _unnamed [0 0]
|
|
|
|
(tc/dataset ())
|
|
|
|
(def DS (tc/dataset {:V1 (take 9 (cycle [1 2]))
|
|
:V2 (range 1 10)
|
|
:V3 (take 9 (cycle [0.5 1.0 1.5]))
|
|
:V4 (take 9 (cycle ["A" "B" "C"]))}))
|
|
|
|
(tc/group-by DS :V1)
|
|
;; => _unnamed [2 3]:
|
|
;; | :name | :group-id | :data |
|
|
;; |------:|----------:|-----------------|
|
|
;; | 1 | 0 | Group: 1 [5 4]: |
|
|
;; | 2 | 1 | Group: 2 [4 4]: |
|
|
|
|
(tc/group-by DS [:V1])
|
|
;; => _unnamed [2 3]:
|
|
;; | :name | :group-id | :data |
|
|
;; |---------|----------:|-----------------------|
|
|
;; | {:V1 1} | 0 | Group: {:V1 1} [5 4]: |
|
|
;; | {:V1 2} | 1 | Group: {:V1 2} [4 4]: |
|