init research
This commit is contained in:
+238
@@ -0,0 +1,238 @@
|
||||
;; # Tablecloth Column Exploration
|
||||
|
||||
^{:kind/hidden true}
|
||||
(ns intro
|
||||
(:require [tablecloth.api :as tc]
|
||||
[scicloj.clay.v2.api :as clay]
|
||||
[scicloj.kindly.v3.api :as kindly]
|
||||
[scicloj.kindly.v3.kind :as kind]))
|
||||
|
||||
^{:kind/hidden true}
|
||||
(clay/start!)
|
||||
|
||||
^{:kind/hidden true}
|
||||
(comment
|
||||
(clay/show-doc! "docs/column_exploration.clj" {:hide-doc? true})
|
||||
(clay/write-html! "docs/column_exploration.html")
|
||||
,)
|
||||
|
||||
;; ## What is this exploration?
|
||||
;;
|
||||
;; We want to add a `column` entity to tablecloth that parallels `dataset`. It will make
|
||||
;; the column a first-class entity within tablecloth.
|
||||
|
||||
;; ## Usage
|
||||
|
||||
(require '[tablecloth.column.api :refer [column] :as col])
|
||||
|
||||
;; ### Column creation
|
||||
|
||||
;; We can create an empty column like this:
|
||||
|
||||
(column)
|
||||
|
||||
;; We can check if it it's a column.
|
||||
|
||||
(col/column? (column))
|
||||
|
||||
;; We can create a columns with data in a number of ways
|
||||
|
||||
(column [1 2 3 4])
|
||||
|
||||
(column (range 10))
|
||||
|
||||
;; When you do this the types of the resulting array is determined
|
||||
;; automatically from the items provided.
|
||||
|
||||
(let [int-column (column (range 10))]
|
||||
(col/typeof int-column))
|
||||
|
||||
(let [string-column (column ["foo" "bar"])]
|
||||
(col/typeof string-column))
|
||||
|
||||
|
||||
;; ### Basic Operations
|
||||
|
||||
;; Operations are right now in their own namespace
|
||||
(require '[tablecloth.column.api.operators :as ops])
|
||||
|
||||
;; With that imported we can perform a large number of operations:
|
||||
|
||||
(def a (column [20 30 40 50]))
|
||||
(def b (column (range 4)))
|
||||
|
||||
(ops/- a b)
|
||||
|
||||
(ops/pow a 2)
|
||||
|
||||
(ops/* 10 (ops/sin a))
|
||||
|
||||
(ops/< a 35)
|
||||
|
||||
;; All these operations take a column as their first argument and
|
||||
;; return a column, so they can be chained easily.
|
||||
|
||||
(-> a
|
||||
(ops/* b)
|
||||
(ops/< 70))
|
||||
|
||||
;; ### Subsetting and accesssing
|
||||
|
||||
;; You can access an element in a column in exactly the same ways you
|
||||
;; would in Clojure.
|
||||
|
||||
(def myclm (column (range 5)))
|
||||
|
||||
myclm
|
||||
|
||||
(myclm 2)
|
||||
|
||||
(nth myclm 2)
|
||||
|
||||
(get myclm 2)
|
||||
|
||||
;; #### Selecting multiple elements
|
||||
|
||||
;; There are two ways to select multiple elements from a column:
|
||||
;; * If you need to select a continuous subset, you can use `slice`;
|
||||
;; * if you may need to select diverse elements, use `select`.
|
||||
;;
|
||||
|
||||
;; **Slice**
|
||||
|
||||
;; The `slice` method allows you to use indexes to specify a portion
|
||||
;; of the column to extract.
|
||||
|
||||
(def myclm
|
||||
(column (repeatedly 10 #(rand-int 10))))
|
||||
|
||||
myclm
|
||||
|
||||
(col/slice myclm 3 5)
|
||||
|
||||
|
||||
;; It also supports negative indexing, making it possible to slice
|
||||
;; from the end of the column:
|
||||
|
||||
(col/slice myclm -7 -5)
|
||||
|
||||
;; It's also possible to slice from one direction to the beginning or
|
||||
;; end:
|
||||
|
||||
(col/slice myclm 7 :end)
|
||||
|
||||
(col/slice myclm -3 :end)
|
||||
|
||||
(col/slice myclm :start 7)
|
||||
|
||||
(col/slice myclm :start -3)
|
||||
|
||||
;; **Select**
|
||||
;;
|
||||
;; The `select` fn works by taking a list of index positions:
|
||||
|
||||
(col/select myclm [1 3 5 8])
|
||||
|
||||
;; We can combine this type of selection with the operations just
|
||||
;; demonstrated to select certain values.
|
||||
|
||||
|
||||
myclm
|
||||
|
||||
;; Let's see which positions are greter than 5.
|
||||
(ops/> myclm 5)
|
||||
|
||||
|
||||
;; We can use a column of boolean values like the one above with the `select` function as well. `select` will choose all the positions that are true. It's like supplying select a list of the index positions that hold true values.
|
||||
(col/select myclm (ops/> myclm 5))
|
||||
|
||||
|
||||
;; ### Iterating over a column
|
||||
|
||||
;; Many operations that you might want to perform on a column are
|
||||
;; available in the `tablecloth.column.api.operators` namespace.
|
||||
;; However, when there is a need to do something custom, you can also
|
||||
;; interate over the column.
|
||||
|
||||
(defn calc-percent [x]
|
||||
(/ x 100.0))
|
||||
|
||||
(col/column-map myclm calc-percent)
|
||||
|
||||
;; It's also possible to iterate over multiple columns by supplying a
|
||||
;; vector of columns:
|
||||
|
||||
(-> [(column [5 6 7 8 9])
|
||||
(column [1 2 3 4 5])]
|
||||
(col/column-map (partial *)))
|
||||
|
||||
|
||||
(comment
|
||||
(-> (column [1 nil 2 3 nil 0])
|
||||
(ops/* 10))
|
||||
|
||||
(-> (column [1 nil 2 3 nil 0])
|
||||
(ops/max [10 10 10 10 10 10]))
|
||||
|
||||
|
||||
(tech.v3.dataset.column/missing))
|
||||
|
||||
;; ### Sorting a column
|
||||
|
||||
;; You can use `sort-column` to sort a colum
|
||||
|
||||
(def myclm (column (repeatedly 10 #(rand-int 100))))
|
||||
|
||||
myclm
|
||||
|
||||
(col/sort-column myclm)
|
||||
|
||||
;; As you can see, sort-columns sorts in ascending order by default,
|
||||
;; but you can also specify a different order using ordering keywords
|
||||
;; `:asc` and `:desc`:
|
||||
|
||||
|
||||
(col/sort-column myclm :desc)
|
||||
|
||||
;; Finally, sort can also accept a `comparator-fn`:
|
||||
|
||||
(let [c (column ["1" "100" "4" "-10"])]
|
||||
(col/sort-column c (fn [a b]
|
||||
(let [a (parse-long a)
|
||||
b (parse-long b)]
|
||||
(< a b)))))
|
||||
|
||||
|
||||
;; ### Missing values
|
||||
|
||||
;; The column has built-in support for basic awareness and handling of
|
||||
;; missing values. Columns will be scanned for missing values when
|
||||
;; created.
|
||||
|
||||
(def myclm (column [10 nil -4 20 1000 nil -233]))
|
||||
|
||||
;; You can identify the set of index positions of missing values:
|
||||
|
||||
(col/missing myclm)
|
||||
|
||||
(col/count-missing myclm)
|
||||
|
||||
;; You can remove missing values:
|
||||
|
||||
(col/drop-missing myclm)
|
||||
|
||||
;; Or you can replace them:
|
||||
|
||||
(col/replace-missing myclm)
|
||||
|
||||
;; There are a range of built-in strategies:
|
||||
|
||||
(col/replace-missing myclm :midpoint)
|
||||
|
||||
|
||||
;; And you can provide your own value using a specific value or fn:
|
||||
|
||||
(col/replace-missing myclm :value 555)
|
||||
|
||||
(col/replace-missing myclm :value (fn [col-without-missing]
|
||||
(ops/mean col-without-missing)))
|
||||
Reference in New Issue
Block a user