init research

This commit is contained in:
2026-02-08 11:20:43 -10:00
commit bdf064f54d
3041 changed files with 1592200 additions and 0 deletions
+130
View File
@@ -0,0 +1,130 @@
(ns exploration
"Side-by-side exploration: Kotlin DataFrame bridge + Clojure data stack.
Render with Clay: (require '[scicloj.clay.v2.api :as clay])
(clay/make! {:source-path \"notebooks/exploration.clj\"})"
(:require [tablecloth.api :as tc]
[tech.v3.dataset :as ds]
[tech.v3.datatype.functional :as dfn]
[scicloj.tableplot.v1.plotly :as plotly]
[scicloj.kindly.v4.kind :as kind]
[df-bridge.core :as bridge]
[malli.provider :as mp])
(:import [org.jetbrains.kotlinx.dataframe.api ToDataFrameKt TypeConversionsKt]))
;; # Kotlin DataFrame <-> Clojure Bridge Exploration
;; ## 1. Create data in Kotlin DataFrame, bring it to Clojure
;; Build the raw columns as Java collections, then convert them to a Kotlin DataFrame (simulating data coming from a Kotlin service):
(def kt-data
  "Synthetic sales data as a `java.util.Map` of column name -> `java.util.List`
  (500 rows each), standing in for data handed over by a Kotlin service.

  Columns are inserted into a `LinkedHashMap` so iteration follows the
  declaration order below (a plain `HashMap` iterates in hash order). The three
  random columns share one seeded RNG, so they are drawn in explicit `let`
  order (price, then quantity, then rating) to keep the values reproducible."
  (let [n          500
        rng        (java.util.Random. 42)
        categories (cycle ["electronics" "clothing" "food" "books" "sports"])
        regions    (cycle ["north" "south" "east" "west"])
        ;; Draw order matters here: every column below consumes the same `rng`.
        prices     (mapv (fn [_] (+ 5.0 (* 195.0 (.nextDouble rng)))) (range n))
        quantities (mapv (fn [_] (+ 1 (.nextInt rng 100))) (range n))
        ratings    (mapv (fn [_] (+ 1.0 (* 4.0 (.nextDouble rng)))) (range n))]
    (doto (java.util.LinkedHashMap.)
      (.put "product_id" (java.util.ArrayList. (mapv str (range n))))
      (.put "category" (java.util.ArrayList. (vec (take n categories))))
      (.put "region" (java.util.ArrayList. (vec (take n regions))))
      (.put "price" (java.util.ArrayList. prices))
      (.put "quantity" (java.util.ArrayList. quantities))
      (.put "rating" (java.util.ArrayList. ratings)))))
(def kt-df
  "Result of handing `kt-data` to `ToDataFrameKt/toDataFrame`."
  (-> kt-data
      ToDataFrameKt/toDataFrame))
;; Kotlin DataFrame info:
(kind/md
 (let [row-total (.rowsCount kt-df)
       col-total (.columnsCount kt-df)
       col-names (vec (.columnNames kt-df))]
   ;; One-line markdown summary: dimensions followed by the column names.
   (format "**Kotlin DataFrame**: %d rows x %d columns — columns: %s"
           row-total
           col-total
           col-names)))
;; ## 2. Bridge to tablecloth
(def sales
  "`kt-df` passed through `bridge/kt->tc`; used below with the tablecloth API."
  (-> kt-df
      bridge/kt->tc))
sales
;; ## 3. Basic tablecloth operations
;; ### Summary by category
(def by-category
  "Per-category summary of `sales`: mean price, mean rating and summed quantity."
  (let [;; Builds an aggregator that applies `stat-fn` to one column of a group.
        column-stat (fn [stat-fn column-name]
                      (fn [group]
                        (stat-fn (ds/column group column-name))))
        grouped     (tc/group-by sales "category")]
    (tc/aggregate grouped
                  {"avg-price"  (column-stat dfn/mean "price")
                   "avg-rating" (column-stat dfn/mean "rating")
                   "total-qty"  (column-stat dfn/sum "quantity")})))
by-category
;; ### Filter: high-value items (price > 100, rating > 3.5)
(def premium
  "Rows of `sales` whose price is above 100.0 and whose rating is above 3.5."
  (let [premium-row? (fn [{price "price" rating "rating"}]
                       (and (> price 100.0)
                            (> rating 3.5)))]
    (tc/select-rows sales premium-row?)))
(kind/md
 (let [kept  (tc/row-count premium)
       total (tc/row-count sales)]
   (format "**Premium items**: %d out of %d" kept total)))
premium
;; ## 4. Visualization with tableplot
;; ### Price distribution by category
(let [price-axis (plotly/base sales {:=x "price"})]
  ;; 30-bin histogram of price, coloured by category.
  (plotly/layer-histogram price-axis
                          {:=histogram-nbins 30
                           :=color "category"}))
;; ### Price vs Rating scatter
(let [price-vs-rating (plotly/base sales {:=x "price" :=y "rating"})]
  ;; One point per row, coloured by category.
  (plotly/layer-point price-vs-rating
                      {:=color "category"
                       :=mark-size 6}))
;; ### Total quantity by region (bar chart)
(def qty-by-region
  "Summed quantity of `sales` per region."
  (let [sum-quantity (fn [group]
                       (dfn/sum (ds/column group "quantity")))
        grouped      (tc/group-by sales "region")]
    (tc/aggregate grouped {"total-qty" sum-quantity})))
(let [region-axes (plotly/base qty-by-region
                               {:=x :$group-name :=y "total-qty"})]
  (plotly/layer-bar region-axes {}))
;; ### Average price by category (bar chart)
(let [category-axes (plotly/base by-category
                                 {:=x :$group-name :=y "avg-price"})]
  ;; Bar per category group, height = average price.
  (plotly/layer-bar category-axes {}))
;; ## 5. Roundtrip: modify in Clojure, send back to Kotlin
(def enriched
  "`sales` with a \"revenue\" column (price * quantity), columns in a fixed order."
  (let [column-order ["product_id" "category" "region" "price" "quantity" "revenue" "rating"]
        with-revenue (tc/map-columns sales "revenue" ["price" "quantity"] *)]
    (tc/select-columns with-revenue column-order)))
(def kt-enriched
  "`enriched` passed through `bridge/dataset->kt`."
  (-> enriched
      bridge/dataset->kt))
(kind/md
 (let [row-total (.rowsCount kt-enriched)
       col-total (.columnsCount kt-enriched)
       col-names (vec (.columnNames kt-enriched))]
   (format "**Roundtrip**: enriched tablecloth dataset -> KT DataFrame: %d rows x %d cols, columns: %s"
           row-total
           col-total
           col-names)))
;; Revenue distribution:
(let [revenue-axis (plotly/base enriched {:=x "revenue"})]
  ;; 40-bin histogram of revenue, coloured by category.
  (plotly/layer-histogram revenue-axis
                          {:=histogram-nbins 40
                           :=color "category"}))
;; ## 6. Schema inference with malli
(def row-sample
  "First 10 items of `(bridge/kt->rows kt-df)`."
  (->> kt-df
       bridge/kt->rows
       (take 10)))
(def inferred-schema
  "Schema that `malli.provider/provide` derives from `row-sample`."
  (-> row-sample
      mp/provide))
(kind/md
 (let [schema-text (pr-str inferred-schema)]
   ;; Wrap the printed schema in a fenced clojure block for the rendered page.
   (str "**Malli inferred schema from KT DataFrame rows:**\n```clojure\n"
        schema-text
        "\n```")))
+282
View File
@@ -0,0 +1,282 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Kotlin DataFrame + Kandy: Bridge Comparison\n",
"\n",
"This notebook mirrors `exploration.clj` — same analysis, Kotlin ecosystem.\n",
"Requires: Kotlin Notebook plugin in IntelliJ IDEA."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%useLatestDescriptors\n",
"%use dataframe, kandy"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Create data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import kotlin.random.Random\n",
"\n",
"val rng = Random(42)\n",
"val n = 500\n",
"val categories = listOf(\"electronics\", \"clothing\", \"food\", \"books\", \"sports\")\n",
"val regions = listOf(\"north\", \"south\", \"east\", \"west\")\n",
"\n",
"val sales = dataFrameOf(\n",
" \"product_id\" to (0 until n).map { it.toString() },\n",
" \"category\" to (0 until n).map { categories[it % categories.size] },\n",
" \"region\" to (0 until n).map { regions[it % regions.size] },\n",
" \"price\" to (0 until n).map { 5.0 + 195.0 * rng.nextDouble() },\n",
" \"quantity\" to (0 until n).map { 1 + rng.nextInt(100) },\n",
" \"rating\" to (0 until n).map { 1.0 + 4.0 * rng.nextDouble() },\n",
")\n",
"\n",
"sales.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sales.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Group-by and aggregate"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"val byCategory = sales.groupBy { category }.aggregate {\n",
" mean { price } into \"avg_price\"\n",
" mean { rating } into \"avg_rating\"\n",
" sum { quantity } into \"total_qty\"\n",
"}\n",
"byCategory"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Filter: premium items"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"val premium = sales.filter { price > 100.0 && rating > 3.5 }\n",
"println(\"Premium items: ${premium.rowsCount()} out of ${sales.rowsCount()}\")\n",
"premium.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Visualization with Kandy\n",
"\n",
"### Price distribution by category"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sales.groupBy { category }.plot {\n",
" histogram(x = price, binsOption = BinsOption.byNumber(30)) {\n",
" fillColor(key.category)\n",
" alpha = 0.7\n",
" position = Position.dodge()\n",
" }\n",
" layout {\n",
" title = \"Price Distribution by Category\"\n",
" size = 850 to 500\n",
" }\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Price vs Rating scatter"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sales.plot {\n",
" points {\n",
" x(price)\n",
" y(rating)\n",
" color(category)\n",
" size = 4.0\n",
" }\n",
" layout {\n",
" title = \"Price vs Rating\"\n",
" size = 850 to 500\n",
" }\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Total quantity by region"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"val qtyByRegion = sales.groupBy { region }.aggregate {\n",
" sum { quantity } into \"total_qty\"\n",
"}\n",
"\n",
"qtyByRegion.plot {\n",
" bars {\n",
" x(region)\n",
" y(total_qty)\n",
" }\n",
" layout {\n",
" title = \"Total Quantity by Region\"\n",
" size = 600 to 400\n",
" }\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Average price by category"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"byCategory.plot {\n",
" bars {\n",
" x(category)\n",
" y(avg_price)\n",
" }\n",
" layout {\n",
" title = \"Average Price by Category\"\n",
" size = 600 to 400\n",
" }\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Add computed column + revenue histogram"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"val enriched = sales.add(\"revenue\") { price * quantity }\n",
" .select { product_id and category and region and price and quantity and revenue and rating }\n",
"\n",
"println(\"Enriched: ${enriched.rowsCount()} rows x ${enriched.columnsCount()} cols\")\n",
"enriched.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"enriched.groupBy { category }.plot {\n",
" histogram(x = revenue, binsOption = BinsOption.byNumber(40)) {\n",
" fillColor(key.category)\n",
" alpha = 0.7\n",
" position = Position.dodge()\n",
" }\n",
" layout {\n",
" title = \"Revenue Distribution by Category\"\n",
" size = 850 to 500\n",
" }\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6. Schema info (Kotlin way)\n",
"\n",
"Kotlin DataFrame provides compile-time schema via `@DataSchema` and runtime via `.schema()`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sales.schema()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Kotlin",
"language": "kotlin",
"name": "kotlin"
},
"language_info": {
"name": "kotlin"
}
},
"nbformat": 4,
"nbformat_minor": 4
}