init research
This commit is contained in:
@@ -0,0 +1,30 @@
|
||||
;; tools.deps manifest for the Kotlin DataFrame <-> Clojure bridge experiments.
{:paths ["src" "notebooks"]

 :deps
 {org.clojure/clojure {:mvn/version "1.12.0"}

  ;; Kotlin DataFrame
  org.jetbrains.kotlinx/dataframe-core {:mvn/version "1.0.0-Beta4"}
  org.jetbrains.kotlin/kotlin-reflect  {:mvn/version "2.1.10"}

  ;; Clojure data stack
  scicloj/tablecloth {:mvn/version "7.062"}
  metosin/malli      {:mvn/version "0.17.0"}

  ;; Arrow support (for both KT DataFrame and TMD)
  org.jetbrains.kotlinx/dataframe-arrow {:mvn/version "1.0.0-Beta4"}
  org.apache.arrow/arrow-vector         {:mvn/version "18.2.0"}
  org.apache.arrow/arrow-memory-unsafe  {:mvn/version "18.2.0"}
  com.cnuernber/jarrow                  {:mvn/version "1.000"}
  org.lz4/lz4-java                      {:mvn/version "1.8.0"}
  com.github.luben/zstd-jni             {:mvn/version "1.5.4-1"}

  ;; Visualization
  org.scicloj/tableplot {:mvn/version "1-beta14"}
  org.scicloj/clay      {:mvn/version "2-beta56"}

  ;; Logging (suppress SLF4J warnings)
  ch.qos.logback/logback-classic {:mvn/version "1.4.14"}}

 :aliases
 {:repl
  ;; Opens java.nio / sun.nio.ch to unnamed modules.
  ;; NOTE(review): presumably needed by the Arrow memory libraries -- confirm.
  {:jvm-opts ["--add-opens=java.base/java.nio=ALL-UNNAMED"
              "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED"]}}}
|
||||
@@ -0,0 +1,130 @@
|
||||
(ns exploration
|
||||
"Side-by-side exploration: Kotlin DataFrame bridge + Clojure data stack.
|
||||
Render with Clay: (require '[scicloj.clay.v2.api :as clay])
|
||||
(clay/make! {:source-path \"notebooks/exploration.clj\"})"
|
||||
(:require [tablecloth.api :as tc]
|
||||
[tech.v3.dataset :as ds]
|
||||
[tech.v3.datatype.functional :as dfn]
|
||||
[scicloj.tableplot.v1.plotly :as plotly]
|
||||
[scicloj.kindly.v4.kind :as kind]
|
||||
[df-bridge.core :as bridge]
|
||||
[malli.provider :as mp])
|
||||
(:import [org.jetbrains.kotlinx.dataframe.api ToDataFrameKt TypeConversionsKt]))
|
||||
|
||||
;; # Kotlin DataFrame <-> Clojure Bridge Exploration
|
||||
|
||||
;; ## 1. Create data in Kotlin DataFrame, bring it to Clojure
|
||||
|
||||
;; Build a dataset on the Kotlin side (simulating data coming from a Kotlin service):
|
||||
|
||||
(def kt-data
  "Column-oriented sample data: a java.util.Map of column name -> java.util.ArrayList
  with 500 entries per column, generated from a java.util.Random seeded with 42.

  Columns are inserted one at a time into a LinkedHashMap so that
  (a) the random draws happen in a fixed, explicit order (all prices, then all
      quantities, then all ratings) instead of depending on the evaluation order
      of a map literal, and
  (b) the map iterates in the declared column order."
  (let [n          500
        rng        (java.util.Random. 42)
        categories ["electronics" "clothing" "food" "books" "sports"]
        regions    ["north" "south" "east" "west"]
        ;; Build one column by applying f to each row index 0..n-1, in order.
        column     (fn [f]
                     (java.util.ArrayList. ^java.util.Collection (mapv f (range n))))
        ;; Repeat the given values cyclically by row index.
        cycling    (fn [values]
                     (fn [i] (nth values (mod i (count values)))))]
    (doto (java.util.LinkedHashMap.)
      (.put "product_id" (column str))
      (.put "category"   (column (cycling categories)))
      (.put "region"     (column (cycling regions)))
      ;; price in [5.0, 200.0)
      (.put "price"      (column (fn [_] (+ 5.0 (* 195.0 (.nextDouble rng))))))
      ;; quantity in [1, 100]
      (.put "quantity"   (column (fn [_] (+ 1 (.nextInt rng 100)))))
      ;; rating in [1.0, 5.0)
      (.put "rating"     (column (fn [_] (+ 1.0 (* 4.0 (.nextDouble rng)))))))))
|
||||
|
||||
(def kt-df
  (ToDataFrameKt/toDataFrame kt-data))

;; Kotlin DataFrame info:

(let [n-rows    (.rowsCount kt-df)
      n-cols    (.columnsCount kt-df)
      col-names (vec (.columnNames kt-df))]
  (kind/md
   (format "**Kotlin DataFrame**: %d rows x %d columns — columns: %s"
           n-rows n-cols col-names)))

;; ## 2. Bridge to tablecloth

(def sales
  (bridge/kt->tc kt-df))

sales
|
||||
|
||||
;; ## 3. Basic tablecloth operations
|
||||
|
||||
;; ### Summary by category
|
||||
|
||||
(def by-category
  (let [;; Aggregators over one named column of each group's dataset.
        mean-of (fn [col-name] #(dfn/mean (ds/column % col-name)))
        sum-of  (fn [col-name] #(dfn/sum (ds/column % col-name)))]
    (tc/aggregate
     (tc/group-by sales "category")
     {"avg-price"  (mean-of "price")
      "avg-rating" (mean-of "rating")
      "total-qty"  (sum-of "quantity")})))

by-category
|
||||
|
||||
;; ### Filter: high-value items (price > 100, rating > 3.5)
|
||||
|
||||
(def premium
  (tc/select-rows
   sales
   (fn [row]
     (let [price  (get row "price")
           rating (get row "rating")]
       ;; keep rows that are both expensive and well rated
       (and (< 100.0 price)
            (< 3.5 rating))))))

(kind/md
 (format "**Premium items**: %d out of %d"
         (tc/row-count premium)
         (tc/row-count sales)))

premium
|
||||
|
||||
;; ## 4. Visualization with tableplot
|
||||
|
||||
;; ### Price distribution by category
|
||||
|
||||
(plotly/layer-histogram
 (plotly/base sales {:=x "price"})
 {:=histogram-nbins 30
  :=color "category"})

;; ### Price vs Rating scatter

(plotly/layer-point
 (plotly/base sales {:=x "price" :=y "rating"})
 {:=color "category"
  :=mark-size 6})
|
||||
|
||||
;; ### Total quantity by region (bar chart)
|
||||
|
||||
(def qty-by-region
  (tc/aggregate
   (tc/group-by sales "region")
   {"total-qty" #(dfn/sum (ds/column % "quantity"))}))

(plotly/layer-bar
 (plotly/base qty-by-region {:=x :$group-name :=y "total-qty"})
 {})

;; ### Average price by category (bar chart)

(plotly/layer-bar
 (plotly/base by-category {:=x :$group-name :=y "avg-price"})
 {})
|
||||
|
||||
;; ## 5. Roundtrip: modify in Clojure, send back to Kotlin
|
||||
|
||||
(def enriched
  (let [with-revenue (tc/map-columns sales "revenue" ["price" "quantity"] *)]
    (tc/select-columns
     with-revenue
     ["product_id" "category" "region" "price" "quantity" "revenue" "rating"])))

(def kt-enriched
  (bridge/dataset->kt enriched))

(let [n-rows    (.rowsCount kt-enriched)
      n-cols    (.columnsCount kt-enriched)
      col-names (vec (.columnNames kt-enriched))]
  (kind/md
   (format "**Roundtrip**: enriched tablecloth dataset -> KT DataFrame: %d rows x %d cols, columns: %s"
           n-rows n-cols col-names)))

;; Revenue distribution:

(plotly/layer-histogram
 (plotly/base enriched {:=x "revenue"})
 {:=histogram-nbins 40
  :=color "category"})
|
||||
|
||||
;; ## 6. Schema inference with malli
|
||||
|
||||
(def row-sample
  (->> kt-df
       bridge/kt->rows
       (take 10)))

(def inferred-schema
  (mp/provide row-sample))

(let [schema-text (pr-str inferred-schema)]
  (kind/md
   (str "**Malli inferred schema from KT DataFrame rows:**\n```clojure\n"
        schema-text
        "\n```")))
|
||||
@@ -0,0 +1,282 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Kotlin DataFrame + Kandy: Bridge Comparison\n",
|
||||
"\n",
|
||||
"This notebook mirrors `exploration.clj` — same analysis, Kotlin ecosystem.\n",
|
||||
"Requires: Kotlin Notebook plugin in IntelliJ IDEA."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%useLatestDescriptors\n",
|
||||
"%use dataframe, kandy"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 1. Create data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import kotlin.random.Random\n",
|
||||
"\n",
|
||||
"val rng = Random(42)\n",
|
||||
"val n = 500\n",
|
||||
"val categories = listOf(\"electronics\", \"clothing\", \"food\", \"books\", \"sports\")\n",
|
||||
"val regions = listOf(\"north\", \"south\", \"east\", \"west\")\n",
|
||||
"\n",
|
||||
"val sales = dataFrameOf(\n",
|
||||
" \"product_id\" to (0 until n).map { it.toString() },\n",
|
||||
" \"category\" to (0 until n).map { categories[it % categories.size] },\n",
|
||||
" \"region\" to (0 until n).map { regions[it % regions.size] },\n",
|
||||
" \"price\" to (0 until n).map { 5.0 + 195.0 * rng.nextDouble() },\n",
|
||||
" \"quantity\" to (0 until n).map { 1 + rng.nextInt(100) },\n",
|
||||
" \"rating\" to (0 until n).map { 1.0 + 4.0 * rng.nextDouble() },\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"sales.head(10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sales.describe()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 2. Group-by and aggregate"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"val byCategory = sales.groupBy { category }.aggregate {\n",
|
||||
" mean { price } into \"avg_price\"\n",
|
||||
" mean { rating } into \"avg_rating\"\n",
|
||||
" sum { quantity } into \"total_qty\"\n",
|
||||
"}\n",
|
||||
"byCategory"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 3. Filter: premium items"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"val premium = sales.filter { price > 100.0 && rating > 3.5 }\n",
|
||||
"println(\"Premium items: ${premium.rowsCount()} out of ${sales.rowsCount()}\")\n",
|
||||
"premium.head(10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 4. Visualization with Kandy\n",
|
||||
"\n",
|
||||
"### Price distribution by category"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sales.groupBy { category }.plot {\n",
|
||||
" histogram(x = price, binsOption = BinsOption.byNumber(30)) {\n",
|
||||
" fillColor(key.category)\n",
|
||||
" alpha = 0.7\n",
|
||||
" position = Position.dodge()\n",
|
||||
" }\n",
|
||||
" layout {\n",
|
||||
" title = \"Price Distribution by Category\"\n",
|
||||
" size = 850 to 500\n",
|
||||
" }\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Price vs Rating scatter"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sales.plot {\n",
|
||||
" points {\n",
|
||||
" x(price)\n",
|
||||
" y(rating)\n",
|
||||
" color(category)\n",
|
||||
" size = 4.0\n",
|
||||
" }\n",
|
||||
" layout {\n",
|
||||
" title = \"Price vs Rating\"\n",
|
||||
" size = 850 to 500\n",
|
||||
" }\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Total quantity by region"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"val qtyByRegion = sales.groupBy { region }.aggregate {\n",
|
||||
" sum { quantity } into \"total_qty\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"qtyByRegion.plot {\n",
|
||||
" bars {\n",
|
||||
" x(region)\n",
|
||||
" y(total_qty)\n",
|
||||
" }\n",
|
||||
" layout {\n",
|
||||
" title = \"Total Quantity by Region\"\n",
|
||||
" size = 600 to 400\n",
|
||||
" }\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Average price by category"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"byCategory.plot {\n",
|
||||
" bars {\n",
|
||||
" x(category)\n",
|
||||
" y(avg_price)\n",
|
||||
" }\n",
|
||||
" layout {\n",
|
||||
" title = \"Average Price by Category\"\n",
|
||||
" size = 600 to 400\n",
|
||||
" }\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 5. Add computed column + revenue histogram"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"val enriched = sales.add(\"revenue\") { price * quantity }\n",
|
||||
" .select { product_id and category and region and price and quantity and revenue and rating }\n",
|
||||
"\n",
|
||||
"println(\"Enriched: ${enriched.rowsCount()} rows x ${enriched.columnsCount()} cols\")\n",
|
||||
"enriched.head(10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"enriched.groupBy { category }.plot {\n",
|
||||
" histogram(x = revenue, binsOption = BinsOption.byNumber(40)) {\n",
|
||||
" fillColor(key.category)\n",
|
||||
" alpha = 0.7\n",
|
||||
" position = Position.dodge()\n",
|
||||
" }\n",
|
||||
" layout {\n",
|
||||
" title = \"Revenue Distribution by Category\"\n",
|
||||
" size = 850 to 500\n",
|
||||
" }\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 6. Schema info (Kotlin way)\n",
|
||||
"\n",
|
||||
"Kotlin DataFrame provides compile-time schema via `@DataSchema` and runtime via `.schema()`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sales.schema()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Kotlin",
|
||||
"language": "kotlin",
|
||||
"name": "kotlin"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "kotlin"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -0,0 +1,78 @@
|
||||
(ns df-bridge.arrow-bench
|
||||
"Compare Arrow IPC interchange vs direct Map bridge."
|
||||
(:require [df-bridge.core :as bridge]
|
||||
[tech.v3.dataset :as ds]
|
||||
[tech.v3.libs.arrow :as arrow]
|
||||
[tablecloth.api :as tc])
|
||||
(:import [org.jetbrains.kotlinx.dataframe DataFrame]
|
||||
[org.jetbrains.kotlinx.dataframe.api ToDataFrameKt TypeConversionsKt NullabilityOptions]
|
||||
[org.jetbrains.kotlinx.dataframe.io ArrowWritingKt ArrowReadingKt]
|
||||
[java.io File]))
|
||||
|
||||
(defn make-test-data-kt
  "Create a KT DataFrame with n rows and four columns:

    \"name\"  - String, \"person-<i>\"
    \"age\"   - Integer, 18 + (i mod 80)
    \"score\" - Double, 50.0 + 50.0 * i / n
    \"id\"    - Long, i

  \"age\" is boxed as java.lang.Integer on purpose: the benchmark banner
  advertises a (string, int, double, long) column mix, and without the
  explicit `int` both \"age\" and \"id\" would be Long columns."
  [n]
  (let [idx    (range n)
        column (fn [f] (java.util.ArrayList. ^java.util.Collection (mapv f idx)))
        data   (java.util.HashMap.
                ^java.util.Map
                {"name"  (column #(str "person-" %))
                 "age"   (column #(int (+ 18 (mod % 80))))
                 "score" (column #(+ 50.0 (* 50.0 (/ (double %) n))))
                 "id"    (column long)})]
    (ToDataFrameKt/toDataFrame data)))
|
||||
|
||||
(defn bench
  "Call the zero-arg function `f` once untimed (warmup), then `iters` more times
  while measuring wall-clock time with System/nanoTime. Prints one line with
  `label`, the total elapsed milliseconds and the milliseconds per iteration.
  Returns nil."
  [label f iters]
  (f) ;; warmup
  (let [t0         (System/nanoTime)
        _          (dotimes [_ iters] (f))
        t1         (System/nanoTime)
        elapsed-ms (/ (- t1 t0) 1e6)
        per-iter   (/ elapsed-ms iters)]
    (println
     (format " %-50s %8.1f ms total, %8.3f ms/iter" label elapsed-ms per-iter))))
|
||||
|
||||
;; Value of the static `Companion` member of the KT DataFrame class.
(def df-companion (. DataFrame Companion))
|
||||
|
||||
(defn run-arrow-comparison
  "For 10k, 100k and 1M rows, time the different KT DataFrame -> TMD interchange
  paths with `bench` and print the results:

    1.  direct Map bridge (bridge/kt->dataset)
    2a. Arrow IPC write to a temp file
    2b. Arrow IPC read of that file
    2.  write + read combined
    3.  Arrow IPC via an in-memory byte array
    plus the reverse direction (TMD -> KT via Map) and the Arrow file size.

  The temp file is deleted in a `finally`, so it is cleaned up even when one of
  the benchmark steps throws. Returns nil."
  []
  (doseq [n [10000 100000 1000000]]
    (println (format "\n=== %,d rows (4 columns: string, int, double, long) ===" n))
    (let [kt-df    (make-test-data-kt n)
          ;; fewer iterations for the larger inputs
          iters    (cond (>= n 1000000) 3
                         (>= n 100000)  10
                         :else          30)
          tmp-file (File/createTempFile "bridge-bench" ".arrow")
          path     (.getAbsolutePath tmp-file)
          write!   #(ArrowWritingKt/writeArrowIPC kt-df tmp-file false)
          read-ds  #(arrow/stream->dataset path {:key-fn keyword})]
      (try
        ;; Path 1: Direct Map bridge (KT -> TMD)
        (bench "Direct Map: KT -> TMD dataset"
               #(bridge/kt->dataset kt-df) iters)

        ;; Path 2a: Arrow IPC write (KT side)
        (bench "Arrow IPC: KT write to file" write! iters)

        ;; Path 2b: Arrow IPC read (TMD side); write once more first so the
        ;; file content is in place before reads are timed
        (write!)
        (bench "Arrow IPC: TMD read from file" read-ds iters)

        ;; Path 2 combined: Arrow roundtrip KT -> file -> TMD
        (bench "Arrow IPC: KT write + TMD read (combined)"
               #(do (write!) (read-ds))
               iters)

        ;; Path 3: Byte array (in-memory Arrow, no file I/O)
        (bench "Arrow byte[]: KT->bytes->TMD (in-memory)"
               #(let [bytes (ArrowWritingKt/saveArrowIPCToByteArray kt-df)]
                  (arrow/stream->dataset (java.io.ByteArrayInputStream. bytes)
                                         {:key-fn keyword}))
               iters)

        ;; Reverse: TMD -> KT via Map
        (let [tmd-ds (bridge/kt->dataset kt-df)]
          (bench "Direct Map: TMD -> KT DataFrame"
                 #(bridge/dataset->kt tmd-ds) iters))

        ;; File size
        (write!)
        (println (format " Arrow IPC file size: %,.0f KB"
                         (/ (.length tmp-file) 1024.0)))
        (finally
          ;; always remove the temp file, including on failure
          (.delete tmp-file))))))
|
||||
|
||||
(defn -main
  "Command-line entry point: runs the Arrow vs. Map comparison. Arguments are ignored."
  [& _args]
  (run-arrow-comparison))
|
||||
@@ -0,0 +1,59 @@
|
||||
(ns df-bridge.bench
|
||||
"Benchmark the bridge at various data sizes."
|
||||
(:require [df-bridge.core :as bridge]
|
||||
[tech.v3.dataset :as ds]
|
||||
[tablecloth.api :as tc])
|
||||
(:import [org.jetbrains.kotlinx.dataframe.api ToDataFrameKt TypeConversionsKt]))
|
||||
|
||||
(defn make-test-data
  "Build column-oriented test data with n rows.

  Returns a Clojure map with string keys \"name\", \"age\" and \"score\", each
  holding a java.util.ArrayList of n values derived from the row index i:
  \"person-<i>\", 18 + (i mod 80), and 50.0 + 50.0 * i / n."
  [n]
  (let [idx    (range n)
        ->list (fn [f] (java.util.ArrayList. ^java.util.Collection (mapv f idx)))]
    {"name"  (->list (fn [i] (str "person-" i)))
     "age"   (->list (fn [i] (+ 18 (mod i 80))))
     "score" (->list (fn [i] (+ 50.0 (* 50.0 (/ (double i) n)))))}))
|
||||
|
||||
(defn bench-one
  "Call the zero-arg function `f` once untimed (warmup), then `iterations` more
  times while measuring wall-clock time with System/nanoTime. Prints one line
  with `label`, total milliseconds, milliseconds per iteration and the iteration
  count. Returns nil."
  [label f iterations]
  (f) ;; warmup
  (let [t0         (System/nanoTime)
        _          (dotimes [_ iterations] (f))
        t1         (System/nanoTime)
        elapsed-ms (/ (- t1 t0) 1e6)
        per-iter   (/ elapsed-ms iterations)]
    (println
     (format " %-40s %8.1f ms total, %8.3f ms/iter (%d iters)"
             label elapsed-ms per-iter iterations))))
|
||||
|
||||
(defn run-benchmarks
  "For 1k, 100k and 1M rows, build test data once and time each bridge
  direction with `bench-one`, printing one line per measurement. Returns nil."
  []
  (doseq [n [1000 100000 1000000]]
    (println (format "\n=== %,d rows ===" n))
    (let [data  (make-test-data n)
          ;; fewer iterations for the larger inputs
          iters (condp <= n
                  1000000 5
                  100000  20
                  100)
          ;; Pre-build objects for each direction
          kt-df (bridge/col-map->kt data)
          {names "name" ages "age" scores "score"} data
          tc-ds (tc/dataset {:name names :age ages :score scores})]
      (doseq [[label thunk]
              [["Map -> KT DataFrame"         #(bridge/col-map->kt data)]
               ["KT DataFrame -> Map"         #(bridge/kt->col-map kt-df)]
               ["KT DataFrame -> TMD dataset" #(bridge/kt->dataset kt-df)]
               ["Tablecloth -> KT DataFrame"  #(bridge/dataset->kt tc-ds)]
               ;; Full roundtrip: TC -> KT -> TC
               ["Full roundtrip TC->KT->TC"   #(bridge/kt->tc (bridge/dataset->kt tc-ds))]]]
        (bench-one label thunk iters)))))
|
||||
|
||||
(defn -main
  "Command-line entry point: runs the bridge benchmarks. Arguments are ignored."
  [& _]
  (run-benchmarks))
|
||||
@@ -0,0 +1,127 @@
|
||||
(ns df-bridge.core
|
||||
"Bridge between Kotlin DataFrame and Clojure data ecosystem.
|
||||
Converts via Map<String, List> -- the natural columnar interchange type."
|
||||
(:require [tech.v3.dataset :as ds]
|
||||
[tablecloth.api :as tc])
|
||||
(:import [org.jetbrains.kotlinx.dataframe.api ToDataFrameKt TypeConversionsKt]
|
||||
[org.jetbrains.kotlinx.dataframe DataFrame DataColumn DataRow]))
|
||||
|
||||
;; Value of the static `Companion` member of DataColumn; the column factory
;; methods used below are invoked on it.
(def ^:private companion (. DataColumn Companion))
|
||||
|
||||
;;; --- Helpers ---
|
||||
|
||||
(defn- datarow->map
  "Turn a KT DataRow into a Clojure map with keyword keys.
  Values that are themselves DataRows are converted recursively; every other
  value is passed through unchanged."
  [^DataRow row]
  (into {}
        (for [^java.util.Map$Entry entry (TypeConversionsKt/toMap row)
              :let [value (.getValue entry)]]
          [(keyword (.getKey entry))
           (if (instance? DataRow value)
             (datarow->map value)
             value)])))
|
||||
|
||||
(defn- deep-convert-col-values
  "Return a vector of the given column values in which every DataRow has been
  replaced by its Clojure-map form (see `datarow->map`); other values are kept."
  [values]
  (let [convert #(cond-> % (instance? DataRow %) datarow->map)]
    (into [] (map convert) values)))
|
||||
|
||||
;;; --- Kotlin DataFrame -> Clojure ---
|
||||
|
||||
(defn kt->col-map
  "Convert a Kotlin DataFrame to a column-oriented Clojure map of
  column name -> vector of values, e.g. {\"col1\" [v1 v2 ...] \"col2\" [v1 v2 ...]}.
  DataRow values (from ColumnGroup columns) become nested keyword maps."
  [kt-df]
  (reduce (fn [acc [col-name values]]
            (assoc acc col-name (deep-convert-col-values values)))
          {}
          (TypeConversionsKt/toMap kt-df)))
|
||||
|
||||
(defn kt->dataset
  "Convert a Kotlin DataFrame to a tech.ml.dataset by way of `kt->col-map`.
  Note: ColumnGroups become columns of maps (TMD doesn't have nested columns)."
  [kt-df]
  (-> kt-df
      kt->col-map
      ds/->dataset))
|
||||
|
||||
(defn kt->tc
  "Convert a Kotlin DataFrame to a tablecloth dataset by way of `kt->col-map`."
  [kt-df]
  (-> kt-df
      kt->col-map
      tc/dataset))
|
||||
|
||||
(defn kt->rows
  "Convert a Kotlin DataFrame to row-oriented Clojure data: a vector with one
  keyword-keyed map per row. The conversion is eager (all rows are realized).
  ColumnGroups become nested keyword maps."
  [kt-df]
  (let [rows (iterator-seq (.iterator kt-df))]
    (into [] (map datarow->map) rows)))
|
||||
|
||||
;;; --- Clojure -> Kotlin DataFrame ---
|
||||
|
||||
(defn col-map->kt
  "Convert a column-oriented map to a Kotlin DataFrame.
  Input: {\"col1\" [v1 v2 ...] \"col2\" [v1 v2 ...]}.

  The input is copied into a java.util.LinkedHashMap, so the map handed to
  Kotlin iterates in the same order as `col-map` itself (a plain HashMap copy
  would reorder the columns by hash)."
  [col-map]
  (let [jmap (java.util.LinkedHashMap. ^java.util.Map col-map)]
    (ToDataFrameKt/toDataFrame jmap)))
|
||||
|
||||
(defn dataset->kt
  "Convert a tech.ml.dataset / tablecloth dataset to a Kotlin DataFrame.

  Column names are turned into strings: keywords via `name`, anything else via
  `str`. Columns are inserted into a java.util.LinkedHashMap in the order given
  by `ds/column-names`, so the map handed to Kotlin keeps the dataset's column
  order (a plain HashMap would iterate in hash order)."
  [ds]
  (let [col-map (java.util.LinkedHashMap.)]
    (doseq [col-name (ds/column-names ds)]
      (.put col-map
            (if (keyword? col-name) (name col-name) (str col-name))
            (vec (ds/column ds col-name))))
    (ToDataFrameKt/toDataFrame col-map)))
|
||||
|
||||
(defn rows->kt
  "Convert a seq of row maps to a Kotlin DataFrame.
  Uses the @JvmName variant for Iterable<Map<String,Any?>>.

  Each row's keys are converted with `name` (so keywords and strings both
  work) and copied into a java.util.LinkedHashMap in the row's own iteration
  order, rather than into a HashMap that would reorder them by hash."
  [rows]
  (let [->jrow (fn [row]
                 (let [jrow (java.util.LinkedHashMap.)]
                   (doseq [[k v] row]
                     (.put jrow (name k) v))
                   jrow))
        jrows  (java.util.ArrayList. ^java.util.Collection (mapv ->jrow rows))]
    (ToDataFrameKt/toDataFrameMapStringAnyNullable jrows)))
|
||||
|
||||
;;; --- ColumnGroup support ---
|
||||
|
||||
(defn make-column-group
  "Create a KT DataFrame ColumnGroup from Clojure data.
    group-name: name for the group
    col-map:    {\"col1\" [v1 v2 ...] ...} data for the nested columns

  Names are converted to strings the same way `dataset->kt` does it: keywords
  via `name` (so :price becomes \"price\", not \":price\"), anything else via
  `str`. Column values must be a java.util.Collection (e.g. a vector)."
  [group-name col-map]
  (let [->col-name (fn [k] (if (keyword? k) (name k) (str k)))
        cols       (mapv (fn [[k v]]
                           (.createWithTypeInference
                            companion
                            (->col-name k)
                            ;; hint pins the copy constructor; without it a
                            ;; numeric `v` would hit ArrayList(int capacity)
                            (java.util.ArrayList. ^java.util.Collection v)
                            false))
                         col-map)
        inner-df   (ToDataFrameKt/toDataFrameAnyColumn cols)]
    (.createColumnGroup companion (->col-name group-name) inner-df)))
|
||||
|
||||
(defn make-kt-with-groups
  "Create a KT DataFrame with ColumnGroups from a spec.
  spec is a vector of [name data] pairs where data is either:
    - a collection of values (creates a column via type inference)
    - a map of {col-name values} (creates a ColumnGroup via `make-column-group`)

  Top-level column names are converted to strings the same way `dataset->kt`
  does it: keywords via `name` (so :price becomes \"price\", not \":price\"),
  anything else via `str`."
  [spec]
  (let [->col-name (fn [k] (if (keyword? k) (name k) (str k)))
        cols       (mapv (fn [[col-name data]]
                           (if (map? data)
                             (make-column-group col-name data)
                             (.createWithTypeInference
                              companion
                              (->col-name col-name)
                              ;; hint pins the copy constructor; without it a
                              ;; numeric `data` would hit ArrayList(int capacity)
                              (java.util.ArrayList. ^java.util.Collection data)
                              false)))
                         spec)]
    (ToDataFrameKt/toDataFrameAnyColumn cols)))
|
||||
|
||||
;;; --- Roundtrip test ---
|
||||
|
||||
(defn roundtrip-test
  "Quick sanity test: Clojure map -> KT DataFrame -> Clojure map.

  Returns a map describing the intermediate DataFrame (class, row/column
  counts, column names) together with the input and output maps.
  :roundtrip-ok? is true only when the output has exactly the input's column
  names and every column's values compare equal to the input's -- the string,
  integer and floating-point columns are all checked, not just \"name\"."
  []
  (let [input  {"name"  (java.util.ArrayList. ["Alice" "Bob" "Charlie"])
                "age"   (java.util.ArrayList. [30 25 35])
                "score" (java.util.ArrayList. [95.5 87.3 92.1])}
        kt-df  (col-map->kt input)
        output (kt->col-map kt-df)]
    {:input           input
     :kt-df-class     (class kt-df)
     :kt-df-rows      (.rowsCount kt-df)
     :kt-df-cols      (.columnsCount kt-df)
     :kt-df-col-names (vec (.columnNames kt-df))
     :output          output
     :roundtrip-ok?   (and (= (set (keys input)) (set (keys output)))
                           ;; output vector first: Clojure vector equality
                           ;; accepts any java.util.List on the other side
                           (every? (fn [[col-name values]]
                                     (= (get output col-name) values))
                                   input))}))
|
||||
Reference in New Issue
Block a user