init research

2026-02-08 11:20:43 -10:00
commit bdf064f54d
3041 changed files with 1592200 additions and 0 deletions
@@ -0,0 +1,43 @@
+import org.jetbrains.kotlin.gradle.dsl.JvmTarget
+
+plugins { // Gradle plugins applied to this example project
+    application
+    kotlin("jvm")
+
+    // applies the 'old' DataFrame Gradle plugin instead of the compiler plugin, for now
+    id("org.jetbrains.kotlinx.dataframe")
+
+    // KSP plugin; only mandatory if `kotlin.dataframe.add.ksp=false` is set in gradle.properties
+    id("com.google.devtools.ksp")
+}
+
+repositories {
+    mavenLocal() // listed before Maven Central, in case of local dataframe development
+    mavenCentral()
+}
+
+dependencies {
+    // a published release could be used instead: implementation("org.jetbrains.kotlinx:dataframe:X.Y.Z")
+    implementation(project(":")) // DataFrame itself comes from the root project of this build
+
+    // Exposed + SQLite database support, referenced through the `libs` accessors
+    implementation(libs.sqlite)
+    implementation(libs.exposed.core)
+    implementation(libs.exposed.kotlin.datetime)
+    implementation(libs.exposed.jdbc)
+    implementation(libs.exposed.json)
+    implementation(libs.exposed.money)
+}
+
+kotlin {
+    compilerOptions {
+        jvmTarget = JvmTarget.JVM_1_8 // Kotlin sources are compiled for JVM 1.8
+        freeCompilerArgs.add("-Xjdk-release=8") // NOTE(review): presumably limits usable JDK API to Java 8 - confirm
+    }
+}
+
+tasks.withType<JavaCompile> { // Java compilation is pinned to Java 8 too, matching the Kotlin settings above
+    sourceCompatibility = JavaVersion.VERSION_1_8.toString()
+    targetCompatibility = JavaVersion.VERSION_1_8.toString()
+    options.release.set(8)
+}
@@ -0,0 +1,107 @@
+package org.jetbrains.kotlinx.dataframe.examples.exposed
+
+import org.jetbrains.exposed.v1.core.BiCompositeColumn
+import org.jetbrains.exposed.v1.core.Column
+import org.jetbrains.exposed.v1.core.Expression
+import org.jetbrains.exposed.v1.core.ExpressionAlias
+import org.jetbrains.exposed.v1.core.ResultRow
+import org.jetbrains.exposed.v1.core.Table
+import org.jetbrains.exposed.v1.jdbc.Query
+import org.jetbrains.kotlinx.dataframe.AnyFrame
+import org.jetbrains.kotlinx.dataframe.DataFrame
+import org.jetbrains.kotlinx.dataframe.annotations.DataSchema
+import org.jetbrains.kotlinx.dataframe.api.convertTo
+import org.jetbrains.kotlinx.dataframe.api.toDataFrame
+import org.jetbrains.kotlinx.dataframe.codeGen.NameNormalizer
+import org.jetbrains.kotlinx.dataframe.impl.schema.DataFrameSchemaImpl
+import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema
+import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema
+import kotlin.reflect.KProperty1
+import kotlin.reflect.full.isSubtypeOf
+import kotlin.reflect.full.memberProperties
+import kotlin.reflect.typeOf
+
+/**
+ * Typed counterpart of the untyped `convertToDataFrame()` overload.
+ *
+ * Every column of the receiving [Iterable][Iterable]`<`[ResultRow][ResultRow]`>` (for instance a [Query][Query])
+ * is first gathered into an untyped [DataFrame], which is then converted to the schema [T] using [convertTo].
+ *
+ * In notebooks the untyped overload works just as well, because types are inferred at runtime there.
+ */
+inline fun <reified T : Any> Iterable<ResultRow>.convertToDataFrame(): DataFrame<T> {
+    // called without a type argument, so this is the untyped overload returning an AnyFrame
+    val untyped = convertToDataFrame()
+    return untyped.convertTo<T>()
+}
+
+/**
+ * Reads an [Iterable][Iterable]`<`[ResultRow][ResultRow]`>` coming from Exposed (for instance a [Query][Query])
+ * row by row and turns it into a [DataFrame].
+ *
+ * For each row, the value of every expression in [ResultRow.fieldIndex] is appended to a list
+ * keyed by that expression's [readableName]. The resulting [Map] of column name to values is
+ * handed to [Map.toDataFrame].
+ */
+@JvmName("convertToAnyFrame")
+fun Iterable<ResultRow>.convertToDataFrame(): AnyFrame {
+    // column name -> values collected so far, in the order the columns are first encountered
+    val valuesByColumn = mutableMapOf<String, MutableList<Any?>>()
+    forEach { resultRow ->
+        resultRow.fieldIndex.keys.forEach { field ->
+            val values = valuesByColumn.getOrPut(field.readableName) { mutableListOf() }
+            values.add(resultRow[field])
+        }
+    }
+    return valuesByColumn.toDataFrame()
+}
+
+/**
+ * A simple, human-readable column name for [this] [Expression]:
+ * - a [Column] yields its `name`,
+ * - an [ExpressionAlias] yields its `alias`,
+ * - a [BiCompositeColumn] yields the readable names of its real columns joined by `_`,
+ * - anything else falls back to [toString].
+ *
+ * More [Expression] types may need dedicated handling in the future.
+ */
+val Expression<*>.readableName: String
+    get() {
+        if (this is Column<*>) return name
+        if (this is ExpressionAlias<*>) return alias
+        if (this is BiCompositeColumn<*, *, *>) {
+            return getRealColumns().joinToString("_") { realColumn -> realColumn.readableName }
+        }
+        return toString()
+    }
+
+/**
+ * Builds a [DataFrameSchema] describing the columns declared on this [Table] instance.
+ *
+ * Conversion to a DataFrame does not require this, but the schema is handy for generating
+ * a DataFrame [@DataSchema][DataSchema] declaration.
+ *
+ * @param columnNameToAccessor Optional [MutableMap] that receives one entry per column, mapping
+ *   the SQL column name to the name of the property that declares it on the [Table].
+ *   A [NameNormalizer] can be built from it afterwards.
+ * @return a schema with one [ColumnSchema.Value] per `Column<*>` property, keyed by SQL column name.
+ * @see toDataFrameSchemaWithNameNormalizer
+ */
+@Suppress("UNCHECKED_CAST")
+fun Table.toDataFrameSchema(columnNameToAccessor: MutableMap<String, String> = mutableMapOf()): DataFrameSchema {
+    val columnSupertype = typeOf<Column<*>>()
+    val schemaColumns = mutableMapOf<String, ColumnSchema>()
+
+    // reflection gives us every member property of the Table object; only `Column<*>` ones are relevant
+    for (property in this::class.memberProperties) {
+        if (!property.returnType.isSubtypeOf(columnSupertype)) continue
+        val accessor = property as KProperty1<Table, Column<*>>
+
+        // the SQL column name comes from the Column instance, the accessor name from the property
+        val sqlName = accessor.get(this).name
+        columnNameToAccessor[sqlName] = accessor.name
+
+        // the value type is the type argument of `val a: Column<Type>`
+        val valueType = accessor.returnType.arguments.first().type!!
+
+        // register the column schema under its SQL name
+        schemaColumns[sqlName] = ColumnSchema.Value(valueType)
+    }
+    return DataFrameSchemaImpl(schemaColumns)
+}
+
+/**
+ * Creates a [DataFrameSchema] from the declared [Table] instance together with a [NameNormalizer]
+ * that converts the SQL column names to the corresponding Kotlin property names of the [Table].
+ *
+ * This is not needed for conversion, but it can be useful to create a DataFrame [@DataSchema][DataSchema] instance.
+ *
+ * @return a [Pair] of the schema and a [NameNormalizer]; the normalizer returns the accessor name
+ *   recorded for a column name, or the column name itself when no accessor is known for it.
+ * @see toDataFrameSchema
+ */
+fun Table.toDataFrameSchemaWithNameNormalizer(): Pair<DataFrameSchema, NameNormalizer> {
+    val columnNameToAccessor = mutableMapOf<String, String>()
+    // the map must be handed to toDataFrameSchema so it gets filled with "SQL name -> accessor name";
+    // without it the normalizer below would only ever hit its identity fallback
+    val schema = toDataFrameSchema(columnNameToAccessor)
+    return Pair(schema, NameNormalizer { columnNameToAccessor[it] ?: it })
+}
@@ -0,0 +1,96 @@
+package org.jetbrains.kotlinx.dataframe.examples.exposed
+
+import org.jetbrains.exposed.v1.core.Column
+import org.jetbrains.exposed.v1.core.SortOrder
+import org.jetbrains.exposed.v1.core.count
+import org.jetbrains.exposed.v1.jdbc.Database
+import org.jetbrains.exposed.v1.jdbc.SchemaUtils
+import org.jetbrains.exposed.v1.jdbc.batchInsert
+import org.jetbrains.exposed.v1.jdbc.deleteAll
+import org.jetbrains.exposed.v1.jdbc.select
+import org.jetbrains.exposed.v1.jdbc.selectAll
+import org.jetbrains.exposed.v1.jdbc.transactions.transaction
+import org.jetbrains.kotlinx.dataframe.api.asSequence
+import org.jetbrains.kotlinx.dataframe.api.count
+import org.jetbrains.kotlinx.dataframe.api.describe
+import org.jetbrains.kotlinx.dataframe.api.groupBy
+import org.jetbrains.kotlinx.dataframe.api.print
+import org.jetbrains.kotlinx.dataframe.api.sortByDesc
+import org.jetbrains.kotlinx.dataframe.size
+import java.io.File
+
+/**
+ * Describes a simple bridge between [Exposed](https://www.jetbrains.com/exposed/) and DataFrame!
+ *
+ * The example runs these steps in order:
+ * 1. connect Exposed to the `chinook.db` SQLite database looked up as a classpath resource,
+ * 2. count the customers per country with an Exposed query and print the result,
+ * 3. read the whole `Customers` table into a DataFrame typed as `DfCustomers`,
+ * 4. repeat the per-country count and print general statistics using DataFrame operations,
+ * 5. delete all rows of `Customers` and batch-insert the rows of the DataFrame back.
+ */
+fun main() {
+    // defining where to find our SQLite database for Exposed; `!!` throws if the resource is not found
+    val resourceDb = "chinook.db"
+    val dbPath = File(object {}.javaClass.classLoader.getResource(resourceDb)!!.toURI()).absolutePath
+    val db = Database.connect(url = "jdbc:sqlite:$dbPath", driver = "org.sqlite.JDBC")
+
+    // let's read the database! The last expression of the transaction block becomes `df`
+    val df = transaction(db) {
+        // addLogger(StdOutSqlLogger) // enable if you want to see verbose logs
+
+        // tables in Exposed need to be defined (see tables.kt) before they are passed to SchemaUtils
+        SchemaUtils.create(Customers, Artists, Albums)
+
+        println()
+
+        // In Exposed, we can write queries like this.
+        // Here, we count per country how many customers there are and print the results, highest count first:
+        Customers
+            .select(Customers.country, Customers.customerId.count())
+            .groupBy(Customers.country)
+            .orderBy(Customers.customerId.count() to SortOrder.DESC)
+            .forEach {
+                println("${it[Customers.country]}: ${it[Customers.customerId.count()]} customers")
+            }
+
+        println()
+
+        // Perform the specific query you want to read into the DataFrame.
+        // Note: DataFrames are in-memory structures, so don't make it too large if you don't have the RAM ;)
+        val query = Customers.selectAll() // optional filter, e.g.: .where { Customers.company.isNotNull() }
+
+        println()
+
+        // read and convert the query to a typed DataFrame
+        // see compatibilityLayer.kt for how we created convertToDataFrame<>()
+        // and see tables.kt for how we created DfCustomers!
+        query.convertToDataFrame<DfCustomers>()
+    }
+
+    println(df.size()) // print the size of the DataFrame we just read
+
+    // now we have a DataFrame, we can perform DataFrame operations,
+    // like doing the same operation as we did in Exposed above
+    df.groupBy { country }.count()
+        .sortByDesc { "count"<Int>() } // the "count" column is accessed by name and typed as Int
+        .print(columnTypes = true, borders = true)
+
+    // or just general statistics
+    df.describe()
+        .print(columnTypes = true, borders = true)
+
+    // or make plots using Kandy! It's all up to you
+
+    // writing a DataFrame back into an SQL database with Exposed can also be done easily!
+    transaction(db) {
+        // addLogger(StdOutSqlLogger) // enable if you want to see verbose logs
+
+        // first delete the original contents
+        Customers.deleteAll()
+
+        println()
+
+        // batch-insert our dataframe back into the SQL database as a sequence of rows
+        Customers.batchInsert(df.asSequence()) { dfRow ->
+            // for each table column, the DataFrame value stored under the same SQL column name is put in the statement
+            for (column in Customers.columns) {
+                @Suppress("UNCHECKED_CAST")
+                this[column as Column<Any?>] = dfRow[column.name]
+            }
+        }
+    }
+}
@@ -0,0 +1,97 @@
+package org.jetbrains.kotlinx.dataframe.examples.exposed
+
+import org.jetbrains.exposed.v1.core.Column
+import org.jetbrains.exposed.v1.core.Table
+import org.jetbrains.kotlinx.dataframe.annotations.ColumnName
+import org.jetbrains.kotlinx.dataframe.annotations.DataSchema
+import org.jetbrains.kotlinx.dataframe.api.generateDataClasses
+import org.jetbrains.kotlinx.dataframe.api.print
+
+object Albums : Table() { // Exposed table definition for albums
+    val albumId: Column<Int> = integer("AlbumId").autoIncrement() // auto-incrementing id, primary key below
+    val title: Column<String> = varchar("Title", 160)
+    val artistId: Column<Int> = integer("ArtistId") // plain integer; no reference to Artists is declared here
+
+    override val primaryKey = PrimaryKey(albumId)
+}
+
+object Artists : Table() { // Exposed table definition for artists
+    val artistId: Column<Int> = integer("ArtistId").autoIncrement() // auto-incrementing id, primary key below
+    val name: Column<String> = varchar("Name", 120)
+
+    override val primaryKey = PrimaryKey(artistId)
+}
+
+object Customers : Table() { // Exposed table definition for customers; DfCustomers mirrors these columns
+    val customerId: Column<Int> = integer("CustomerId").autoIncrement() // auto-incrementing id, primary key below
+    val firstName: Column<String> = varchar("FirstName", 40)
+    val lastName: Column<String> = varchar("LastName", 20)
+    val company: Column<String?> = varchar("Company", 80).nullable() // nullable, like most contact details here
+    val address: Column<String?> = varchar("Address", 70).nullable()
+    val city: Column<String?> = varchar("City", 40).nullable()
+    val state: Column<String?> = varchar("State", 40).nullable()
+    val country: Column<String?> = varchar("Country", 40).nullable()
+    val postalCode: Column<String?> = varchar("PostalCode", 10).nullable()
+    val phone: Column<String?> = varchar("Phone", 24).nullable()
+    val fax: Column<String?> = varchar("Fax", 24).nullable()
+    val email: Column<String> = varchar("Email", 60) // non-null, unlike the other contact columns
+    val supportRepId: Column<Int?> = integer("SupportRepId").nullable() // nullable integer, no reference declared
+
+    override val primaryKey = PrimaryKey(customerId)
+}
+
+/**
+ * Prints a [@DataSchema][DataSchema] data class for the [Customers] table.
+ *
+ * Exposed needs [Table] instances for type-safe access to columns and data.
+ * DataFrame can infer types at runtime, which is sufficient in Kotlin Notebook, but
+ * compile-time type-safe access requires a [@DataSchema][DataSchema] declaration.
+ *
+ * Generating that declaration is what the [toDataFrameSchema] function was created for!
+ */
+fun main() {
+    val schemaWithNormalizer = Customers.toDataFrameSchemaWithNameNormalizer()
+    val customersSchema = schemaWithNormalizer.first
+    val accessorNames = schemaWithNormalizer.second
+
+    // uncomment to check whether the schema is converted correctly:
+    // customersSchema.print()
+
+    // generate a @DataSchema data class that can be copy-pasted into the code;
+    // the NameNormalizer makes DataFrame generate the same accessors as in the Table
+    // while the column names themselves stay as they are
+    val generatedCode = customersSchema.generateDataClasses(
+        markerName = "DfCustomers",
+        nameNormalizer = accessorNames,
+    )
+    generatedCode.print()
+}
+
+// Created from Customers.toDataFrameSchema(): the main() above prints this class so it can be copy-pasted here.
+// Properties are listed alphabetically, not in table order. The same can be done for the other tables.
+@DataSchema
+data class DfCustomers(
+    @ColumnName("Address")
+    val address: String?,
+    @ColumnName("City")
+    val city: String?,
+    @ColumnName("Company")
+    val company: String?,
+    @ColumnName("Country")
+    val country: String?,
+    @ColumnName("CustomerId")
+    val customerId: Int,
+    @ColumnName("Email")
+    val email: String,
+    @ColumnName("Fax")
+    val fax: String?,
+    @ColumnName("FirstName")
+    val firstName: String,
+    @ColumnName("LastName")
+    val lastName: String,
+    @ColumnName("Phone")
+    val phone: String?,
+    @ColumnName("PostalCode")
+    val postalCode: String?,
+    @ColumnName("State")
+    val state: String?,
+    @ColumnName("SupportRepId")
+    val supportRepId: Int?,
+)