df-research/bridge/notebooks/kandy_equivalent.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Kotlin DataFrame + Kandy: Bridge Comparison\n",
    "\n",
    "This notebook mirrors `exploration.clj` — same analysis, Kotlin ecosystem.\n",
    "Requires: Kotlin Notebook plugin in IntelliJ IDEA."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%useLatestDescriptors\n",
    "%use dataframe, kandy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Create data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import kotlin.random.Random\n",
    "\n",
    "// Seeded generator so the synthetic data set is reproducible on re-run.\n",
    "val rng = Random(42)\n",
    "val n = 500\n",
    "val categories = listOf(\"electronics\", \"clothing\", \"food\", \"books\", \"sports\")\n",
    "val regions = listOf(\"north\", \"south\", \"east\", \"west\")\n",
    "\n",
    "// Columns are built one after another, so `rng` is consumed in the order\n",
    "// price -> quantity -> rating. Categories and regions cycle by row index.\n",
    "val sales = dataFrameOf(\n",
    "    \"product_id\" to List(n) { idx -> idx.toString() },\n",
    "    \"category\"   to List(n) { idx -> categories[idx % categories.size] },\n",
    "    \"region\"     to List(n) { idx -> regions[idx % regions.size] },\n",
    "    \"price\"      to List(n) { 5.0 + 195.0 * rng.nextDouble() },\n",
    "    \"quantity\"   to List(n) { 1 + rng.nextInt(100) },\n",
    "    \"rating\"     to List(n) { 1.0 + 4.0 * rng.nextDouble() },\n",
    ")\n",
    "\n",
    "sales.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sales.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Group-by and aggregate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// Per-category summary: mean price, mean rating and summed quantity.\n",
    "val byCategory = sales\n",
    "    .groupBy { category }\n",
    "    .aggregate {\n",
    "        mean { price } into \"avg_price\"\n",
    "        mean { rating } into \"avg_rating\"\n",
    "        sum { quantity } into \"total_qty\"\n",
    "    }\n",
    "byCategory"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Filter: premium items"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// Premium rows: price above 100.0 and rating above 3.5 (both bounds exclusive).\n",
    "val premium = sales.filter {\n",
    "    rating > 3.5 && price > 100.0\n",
    "}\n",
    "println(\"Premium items: ${premium.rowsCount()} out of ${sales.rowsCount()}\")\n",
    "premium.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Visualization with Kandy\n",
    "\n",
    "### Price distribution by category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// Price histogram (30 bins) per category group, filled by the group key.\n",
    "sales.groupBy { category }.plot {\n",
    "    layout {\n",
    "        size = 850 to 500\n",
    "        title = \"Price Distribution by Category\"\n",
    "    }\n",
    "    histogram(x = price, binsOption = BinsOption.byNumber(30)) {\n",
    "        position = Position.dodge()\n",
    "        alpha = 0.7\n",
    "        fillColor(key.category)\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Price vs Rating scatter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// Scatter of rating against price, coloured by category.\n",
    "sales.plot {\n",
    "    layout {\n",
    "        size = 850 to 500\n",
    "        title = \"Price vs Rating\"\n",
    "    }\n",
    "    points {\n",
    "        size = 4.0\n",
    "        color(category)\n",
    "        x(price)\n",
    "        y(rating)\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Total quantity by region"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// Sum the quantity column within each region.\n",
    "val qtyByRegion = sales.groupBy { region }.aggregate {\n",
    "    sum { quantity } into \"total_qty\"\n",
    "}\n",
    "\n",
    "// `total_qty` is created in this very cell, so the notebook has not yet generated\n",
    "// an extension property for it when the cell is compiled. Refer to the column by\n",
    "// name so the cell also works on a fresh kernel (Restart & Run All).\n",
    "qtyByRegion.plot {\n",
    "    bars {\n",
    "        x(region)\n",
    "        y(\"total_qty\")\n",
    "    }\n",
    "    layout {\n",
    "        title = \"Total Quantity by Region\"\n",
    "        size = 600 to 400\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Average price by category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// Bar chart of the `avg_price` column of `byCategory`, one bar per category.\n",
    "byCategory.plot {\n",
    "    layout {\n",
    "        size = 600 to 400\n",
    "        title = \"Average Price by Category\"\n",
    "    }\n",
    "    bars {\n",
    "        y(avg_price)\n",
    "        x(category)\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Add computed column + revenue histogram"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// Add revenue = price * quantity, then order the columns so revenue sits before rating.\n",
    "// `revenue` is introduced inside this same expression, so no extension property has\n",
    "// been generated for it yet; it is therefore selected by name rather than as `revenue`.\n",
    "val enriched = sales.add(\"revenue\") { price * quantity }\n",
    "    .select { product_id and category and region and price and quantity and \"revenue\" and rating }\n",
    "\n",
    "println(\"Enriched: ${enriched.rowsCount()} rows x ${enriched.columnsCount()} cols\")\n",
    "enriched.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// Revenue histogram (40 bins) per category group, filled by the group key.\n",
    "enriched.groupBy { category }.plot {\n",
    "    layout {\n",
    "        size = 850 to 500\n",
    "        title = \"Revenue Distribution by Category\"\n",
    "    }\n",
    "    histogram(x = revenue, binsOption = BinsOption.byNumber(40)) {\n",
    "        position = Position.dodge()\n",
    "        alpha = 0.7\n",
    "        fillColor(key.category)\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Schema info (Kotlin way)\n",
    "\n",
    "Kotlin DataFrame describes a schema at compile time through `@DataSchema` declarations and reports the actual schema at run time through `.schema()`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sales.schema()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Kotlin",
   "language": "kotlin",
   "name": "kotlin"
  },
  "language_info": {
   "name": "kotlin"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}