131 lines
4.3 KiB
Clojure
131 lines
4.3 KiB
Clojure
(ns exploration
|
|
"Side-by-side exploration: Kotlin DataFrame bridge + Clojure data stack.
|
|
Render with Clay: (require '[scicloj.clay.v2.api :as clay])
|
|
(clay/make! {:source-path \"notebooks/exploration.clj\"})"
|
|
(:require [tablecloth.api :as tc]
|
|
[tech.v3.dataset :as ds]
|
|
[tech.v3.datatype.functional :as dfn]
|
|
[scicloj.tableplot.v1.plotly :as plotly]
|
|
[scicloj.kindly.v4.kind :as kind]
|
|
[df-bridge.core :as bridge]
|
|
[malli.provider :as mp])
|
|
(:import [org.jetbrains.kotlinx.dataframe.api ToDataFrameKt TypeConversionsKt]))
|
|
|
|
;; # Kotlin DataFrame <-> Clojure Bridge Exploration
|
|
|
|
;; ## 1. Create data in Kotlin DataFrame, bring it to Clojure
|
|
|
|
;; Build the raw columns as plain Java collections (simulating data coming from a Kotlin service), then convert them to a Kotlin DataFrame:
|
|
|
|
(def kt-data
  "Synthetic sales data in column-major form: a `java.util.Map` from column
  name to a `java.util.ArrayList` of 500 values.

  Columns, in insertion order:
    product_id - \"0\" .. \"499\"
    category   - cycles through electronics/clothing/food/books/sports
    region     - cycles through north/south/east/west
    price      - double in [5.0, 200.0)
    quantity   - integer in 1..100
    rating     - double in [1.0, 5.0)

  The random columns come from a single `java.util.Random` seeded with 42,
  so the values are reproducible.

  A `LinkedHashMap` (itself a `HashMap`) is used so that iterating the map
  yields the columns in the order listed above; a plain `HashMap` gives no
  ordering guarantee."
  (let [n          500
        rng        (java.util.Random. 42)
        categories (cycle ["electronics" "clothing" "food" "books" "sports"])
        regions    (cycle ["north" "south" "east" "west"])
        ;; All three random columns share `rng`, so the order of these
        ;; bindings fixes which draws land in which column:
        ;; price first, then quantity, then rating.
        prices     (mapv (fn [_] (+ 5.0 (* 195.0 (.nextDouble rng)))) (range n))
        quantities (mapv (fn [_] (+ 1 (.nextInt rng 100))) (range n))
        ratings    (mapv (fn [_] (+ 1.0 (* 4.0 (.nextDouble rng)))) (range n))]
    (doto (java.util.LinkedHashMap.)
      (.put "product_id" (java.util.ArrayList. ^java.util.Collection (mapv str (range n))))
      (.put "category"   (java.util.ArrayList. ^java.util.Collection (vec (take n categories))))
      (.put "region"     (java.util.ArrayList. ^java.util.Collection (vec (take n regions))))
      (.put "price"      (java.util.ArrayList. ^java.util.Collection prices))
      (.put "quantity"   (java.util.ArrayList. ^java.util.Collection quantities))
      (.put "rating"     (java.util.ArrayList. ^java.util.Collection ratings)))))
|
|
|
|
(def kt-df
  "Result of calling Kotlin DataFrame's `ToDataFrameKt/toDataFrame` on the
  column map `kt-data`."
  (-> kt-data
      ToDataFrameKt/toDataFrame))

;; Kotlin DataFrame info:
(let [row-total (.rowsCount kt-df)
      col-total (.columnsCount kt-df)
      col-names (vec (.columnNames kt-df))]
  ;; One-line markdown summary: dimensions followed by the column names.
  (kind/md
   (format "**Kotlin DataFrame**: %d rows x %d columns — columns: %s"
           row-total
           col-total
           col-names)))
|
|
|
|
;; ## 2. Bridge to tablecloth
|
|
|
|
(def sales
  "`kt-df` converted with `bridge/kt->tc`; the sections below pass it to
  tablecloth functions."
  (-> kt-df
      bridge/kt->tc))

sales
|
|
|
|
;; ## 3. Basic tablecloth operations
|
|
|
|
;; ### Summary by category
|
|
|
|
(def by-category
  "`sales` grouped by \"category\" and aggregated into three columns:
  \"avg-price\" (mean of price), \"avg-rating\" (mean of rating) and
  \"total-qty\" (sum of quantity)."
  (let [;; Each helper returns an aggregator that takes one group's dataset.
        mean-of (fn [col-name]
                  (fn [group] (dfn/mean (ds/column group col-name))))
        sum-of  (fn [col-name]
                  (fn [group] (dfn/sum (ds/column group col-name))))
        grouped (tc/group-by sales "category")]
    (tc/aggregate grouped
                  {"avg-price"  (mean-of "price")
                   "avg-rating" (mean-of "rating")
                   "total-qty"  (sum-of "quantity")})))

by-category
|
|
|
|
;; ### Filter: high-value items (price > 100, rating > 3.5)
|
|
|
|
(def premium
  "Rows of `sales` whose \"price\" is above 100.0 and whose \"rating\" is
  above 3.5."
  (let [premium? (fn [row]
                   ;; Both thresholds are strict (exclusive) lower bounds.
                   (and (< 100.0 (get row "price"))
                        (< 3.5 (get row "rating"))))]
    (tc/select-rows sales premium?)))

(let [kept  (tc/row-count premium)
      total (tc/row-count sales)]
  (kind/md (format "**Premium items**: %d out of %d" kept total)))

premium
|
|
|
|
;; ## 4. Visualization with tableplot
|
|
|
|
;; ### Price distribution by category
|
|
|
|
(let [;; x axis: the "price" column of `sales`.
      chart (plotly/base sales {:=x "price"})]
  ;; 30-bin histogram, coloured by "category".
  (plotly/layer-histogram chart
                          {:=histogram-nbins 30
                           :=color           "category"}))
|
|
|
|
;; ### Price vs Rating scatter
|
|
|
|
(let [;; x axis: "price", y axis: "rating".
      chart (plotly/base sales {:=x "price" :=y "rating"})]
  ;; One point per row, coloured by "category", mark size 6.
  (plotly/layer-point chart
                      {:=color     "category"
                       :=mark-size 6}))
|
|
|
|
;; ### Total quantity by region (bar chart)
|
|
|
|
(def qty-by-region
  "`sales` grouped by \"region\" with a single aggregated column,
  \"total-qty\": the sum of \"quantity\" within each group."
  (let [sum-quantity (fn [group] (dfn/sum (ds/column group "quantity")))
        grouped      (tc/group-by sales "region")]
    (tc/aggregate grouped {"total-qty" sum-quantity})))

(let [;; x axis: the :$group-name column, y axis: "total-qty".
      chart (plotly/base qty-by-region {:=x :$group-name :=y "total-qty"})]
  (plotly/layer-bar chart {}))
|
|
|
|
;; ### Average price by category (bar chart)
|
|
|
|
(let [;; x axis: the :$group-name column, y axis: "avg-price".
      chart (plotly/base by-category {:=x :$group-name :=y "avg-price"})]
  (plotly/layer-bar chart {}))
|
|
|
|
;; ## 5. Roundtrip: modify in Clojure, send back to Kotlin
|
|
|
|
(def enriched
  "`sales` with an added \"revenue\" column, computed by applying `*` to the
  \"price\" and \"quantity\" columns, then narrowed to an explicit column
  list."
  (let [column-order ["product_id" "category" "region" "price" "quantity" "revenue" "rating"]
        with-revenue (tc/map-columns sales "revenue" ["price" "quantity"] *)]
    (tc/select-columns with-revenue column-order)))
|
|
|
|
(def kt-enriched
  "`enriched` converted with `bridge/dataset->kt`."
  (-> enriched
      bridge/dataset->kt))

(let [row-total (.rowsCount kt-enriched)
      col-total (.columnsCount kt-enriched)
      col-names (vec (.columnNames kt-enriched))]
  ;; Markdown summary of the converted frame: dimensions, then column names.
  (kind/md
   (format "**Roundtrip**: enriched tablecloth dataset -> KT DataFrame: %d rows x %d cols, columns: %s"
           row-total
           col-total
           col-names)))
|
|
|
|
;; Revenue distribution:
|
|
(let [;; x axis: the "revenue" column of `enriched`.
      chart (plotly/base enriched {:=x "revenue"})]
  ;; 40-bin histogram, coloured by "category".
  (plotly/layer-histogram chart
                          {:=histogram-nbins 40
                           :=color           "category"}))
|
|
|
|
;; ## 6. Schema inference with malli
|
|
|
|
(def row-sample
  "At most the first ten elements of `(bridge/kt->rows kt-df)`."
  (->> kt-df
       bridge/kt->rows
       (take 10)))

(def inferred-schema
  "Schema returned by `malli.provider/provide` for `row-sample`."
  (-> row-sample
      mp/provide))

(let [opening "**Malli inferred schema from KT DataFrame rows:**\n```clojure\n"
      closing "\n```"]
  ;; Wrap the printed schema in a fenced clojure code block.
  (kind/md (str opening
                (pr-str inferred-schema)
                closing)))
|