init research

This commit is contained in:
2026-02-08 11:20:43 -10:00
commit bdf064f54d
3041 changed files with 1592200 additions and 0 deletions
@@ -0,0 +1,43 @@
import org.jetbrains.kotlin.gradle.dsl.JvmTarget
// Gradle plugins applied to this example project.
// No plugin versions are given in this block — presumably they are supplied by the enclosing build; verify.
plugins {
// lets Gradle run the example as a JVM application
application
kotlin("jvm")
// uses the 'old' Gradle plugin instead of the compiler plugin for now
id("org.jetbrains.kotlinx.dataframe")
// only mandatory if `kotlin.dataframe.add.ksp=false` in gradle.properties
id("com.google.devtools.ksp")
}
// Repositories used to resolve dependencies; mavenLocal() is declared before mavenCentral().
repositories {
mavenLocal() // in case of local dataframe development
mavenCentral()
}
// Libraries needed by the Exposed example.
dependencies {
// implementation("org.jetbrains.kotlinx:dataframe:X.Y.Z")
// depends on the root project (":") rather than on the published artifact from the commented-out line above
implementation(project(":"))
// exposed + sqlite database support
// NOTE(review): the `libs.*` accessors presumably come from a version catalog defined outside this file — confirm
implementation(libs.sqlite)
implementation(libs.exposed.core)
implementation(libs.exposed.kotlin.datetime)
implementation(libs.exposed.jdbc)
implementation(libs.exposed.json)
implementation(libs.exposed.money)
}
// Kotlin compilation settings: emit Java 8 bytecode and compile against the JDK 8 API.
kotlin {
    compilerOptions {
        freeCompilerArgs.add("-Xjdk-release=8")
        jvmTarget.set(JvmTarget.JVM_1_8)
    }
}
// Java compilation settings: source level, target level and `--release` are all pinned to Java 8.
tasks.withType<JavaCompile> {
    val java8 = JavaVersion.VERSION_1_8.toString()
    sourceCompatibility = java8
    targetCompatibility = java8
    options.release.set(8)
}
@@ -0,0 +1,107 @@
package org.jetbrains.kotlinx.dataframe.examples.exposed
import org.jetbrains.exposed.v1.core.BiCompositeColumn
import org.jetbrains.exposed.v1.core.Column
import org.jetbrains.exposed.v1.core.Expression
import org.jetbrains.exposed.v1.core.ExpressionAlias
import org.jetbrains.exposed.v1.core.ResultRow
import org.jetbrains.exposed.v1.core.Table
import org.jetbrains.exposed.v1.jdbc.Query
import org.jetbrains.kotlinx.dataframe.AnyFrame
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.annotations.DataSchema
import org.jetbrains.kotlinx.dataframe.api.convertTo
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
import org.jetbrains.kotlinx.dataframe.codeGen.NameNormalizer
import org.jetbrains.kotlinx.dataframe.impl.schema.DataFrameSchemaImpl
import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema
import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema
import kotlin.reflect.KProperty1
import kotlin.reflect.full.isSubtypeOf
import kotlin.reflect.full.memberProperties
import kotlin.reflect.typeOf
/**
 * Typed variant of the [ResultRow] to [DataFrame] conversion.
 *
 * Reads every row of this [Iterable][Iterable]`<`[ResultRow][ResultRow]`>` (for example a [Query][Query])
 * through the untyped `convertToDataFrame()` overload and then converts the result to the schema [T]
 * using [convertTo].
 *
 * In notebooks, the untyped version works just as well due to runtime inference :)
 */
inline fun <reified T : Any> Iterable<ResultRow>.convertToDataFrame(): DataFrame<T> {
    // no type argument here, so this resolves to the untyped overload returning an AnyFrame
    val untyped = convertToDataFrame()
    return untyped.convertTo<T>()
}
/**
 * Untyped conversion of an [Iterable][Iterable]`<`[ResultRow][ResultRow]`>` from Exposed,
 * like a [Query][Query], into a [DataFrame].
 *
 * The rows are walked one by one; each value is appended to a list keyed by the
 * [readableName] of its expression. The resulting [Map] of lists is then turned into
 * a [DataFrame] with [Map.toDataFrame].
 */
@JvmName("convertToAnyFrame")
fun Iterable<ResultRow>.convertToDataFrame(): AnyFrame {
    // column name -> values collected so far, in row order
    val valuesByColumn = mutableMapOf<String, MutableList<Any?>>()
    forEach { row ->
        row.fieldIndex.keys.forEach { field ->
            valuesByColumn
                .getOrPut(field.readableName) { mutableListOf() }
                .add(row[field])
        }
    }
    return valuesByColumn.toDataFrame()
}
/**
 * A simple column name for [this] [Expression]:
 * - a [Column] yields its `name`,
 * - an [ExpressionAlias] yields its `alias`,
 * - a [BiCompositeColumn] yields the readable names of its real columns joined by `_`,
 * - anything else falls back to `toString()`.
 *
 * Might need to be expanded with more types of [Expression].
 */
val Expression<*>.readableName: String
    get() {
        if (this is Column<*>) return name
        if (this is ExpressionAlias<*>) return alias
        if (this is BiCompositeColumn<*, *, *>) {
            return getRealColumns().joinToString("_") { part -> part.readableName }
        }
        return toString()
    }
/**
 * Builds a [DataFrameSchema] from the `Column` properties declared on this [Table] instance.
 *
 * This is not needed for conversion, but it can be useful to create a DataFrame [@DataSchema][DataSchema] instance.
 *
 * @param columnNameToAccessor Optional [MutableMap] which will be filled with entries mapping
 *   the SQL column name to the name of the property that declares it on the [Table].
 *   This can be used to define a [NameNormalizer] later.
 * @return a schema with one value column per `Column` property, keyed by SQL column name.
 * @see toDataFrameSchemaWithNameNormalizer
 */
@Suppress("UNCHECKED_CAST")
fun Table.toDataFrameSchema(columnNameToAccessor: MutableMap<String, String> = mutableMapOf()): DataFrameSchema {
    val table = this
    val columnSupertype = typeOf<Column<*>>()
    val schemaColumns = linkedMapOf<String, ColumnSchema>()

    // reflection is used to visit every property of the Table object
    for (property in table::class.memberProperties) {
        // only properties declared as (a subtype of) `Column<*>` take part in the schema
        if (!property.returnType.isSubtypeOf(columnSupertype)) continue
        val accessor = property as KProperty1<Table, Column<*>>

        // the SQL column name lives on the Column instance itself
        val sqlName = accessor.get(table).name

        // remember which property declared this SQL column
        columnNameToAccessor[sqlName] = accessor.name

        // the value type is the type argument of the declared `Column<Type>`
        val valueType = accessor.returnType.arguments.first().type!!

        // register the column schema under the SQL column name
        schemaColumns[sqlName] = ColumnSchema.Value(valueType)
    }
    return DataFrameSchemaImpl(schemaColumns)
}
/**
 * Creates a [DataFrameSchema] from the declared [Table] instance with a [NameNormalizer] to
 * convert the SQL column names to the corresponding Kotlin property names.
 *
 * This is not needed for conversion, but it can be useful to create a DataFrame [@DataSchema][DataSchema] instance.
 *
 * @return a [Pair] of the schema and a [NameNormalizer] that maps each SQL column name to the
 *   name of the property declaring it on the [Table]; names without a mapping are returned unchanged.
 * @see toDataFrameSchema
 */
fun Table.toDataFrameSchemaWithNameNormalizer(): Pair<DataFrameSchema, NameNormalizer> {
    val columnNameToAccessor = mutableMapOf<String, String>()
    // The map has to be passed in so that toDataFrameSchema fills it; otherwise it stays empty
    // and the normalizer below would return every name unchanged.
    val schema = toDataFrameSchema(columnNameToAccessor)
    return Pair(schema, NameNormalizer { columnNameToAccessor[it] ?: it })
}
@@ -0,0 +1,96 @@
package org.jetbrains.kotlinx.dataframe.examples.exposed
import org.jetbrains.exposed.v1.core.Column
import org.jetbrains.exposed.v1.core.SortOrder
import org.jetbrains.exposed.v1.core.count
import org.jetbrains.exposed.v1.jdbc.Database
import org.jetbrains.exposed.v1.jdbc.SchemaUtils
import org.jetbrains.exposed.v1.jdbc.batchInsert
import org.jetbrains.exposed.v1.jdbc.deleteAll
import org.jetbrains.exposed.v1.jdbc.select
import org.jetbrains.exposed.v1.jdbc.selectAll
import org.jetbrains.exposed.v1.jdbc.transactions.transaction
import org.jetbrains.kotlinx.dataframe.api.asSequence
import org.jetbrains.kotlinx.dataframe.api.count
import org.jetbrains.kotlinx.dataframe.api.describe
import org.jetbrains.kotlinx.dataframe.api.groupBy
import org.jetbrains.kotlinx.dataframe.api.print
import org.jetbrains.kotlinx.dataframe.api.sortByDesc
import org.jetbrains.kotlinx.dataframe.size
import java.io.File
/**
 * Describes a simple bridge between [Exposed](https://www.jetbrains.com/exposed/) and DataFrame:
 * the database is queried through Exposed, the result is turned into a typed DataFrame and analysed,
 * and finally the rows are written back through Exposed.
 */
fun main() {
    // locate the SQLite database on the classpath and point Exposed at it
    val resourceDb = "chinook.db"
    val resourceUrl = object {}.javaClass.classLoader.getResource(resourceDb)!!
    val dbPath = File(resourceUrl.toURI()).absolutePath
    val db = Database.connect(url = "jdbc:sqlite:$dbPath", driver = "org.sqlite.JDBC")

    // let's read the database!
    val df = transaction(db) {
        // addLogger(StdOutSqlLogger) // enable if you want to see verbose logs

        // Exposed needs its tables to be defined up front, see tables.kt
        SchemaUtils.create(Customers, Artists, Albums)
        println()

        // A query written the Exposed way:
        // count the customers per country and print the result, largest count first.
        val customerCount = Customers.customerId.count()
        val customersPerCountry = Customers
            .select(Customers.country, customerCount)
            .groupBy(Customers.country)
            .orderBy(customerCount to SortOrder.DESC)
        for (row in customersPerCountry) {
            println("${row[Customers.country]}: ${row[customerCount]} customers")
        }
        println()

        // The query whose result is read into the DataFrame.
        // Note: DataFrames are in-memory structures, so don't make it too large if you don't have the RAM ;)
        val customersQuery = Customers.selectAll() // .where { Customers.company.isNotNull() }
        println()

        // convert the query result to a typed DataFrame;
        // convertToDataFrame<>() is defined in compatibilityLayer.kt, DfCustomers in tables.kt
        customersQuery.convertToDataFrame<DfCustomers>()
    }
    println(df.size())

    // with a DataFrame in hand we can use DataFrame operations,
    // for instance the same per-country count as the Exposed query above
    val countPerCountry = df.groupBy { country }.count()
        .sortByDesc { "count"<Int>() }
    countPerCountry.print(columnTypes = true, borders = true)

    // or just general statistics
    val statistics = df.describe()
    statistics.print(columnTypes = true, borders = true)

    // or make plots using Kandy! It's all up to you

    // writing a DataFrame back into an SQL database with Exposed can also be done easily!
    transaction(db) {
        // addLogger(StdOutSqlLogger) // enable if you want to see verbose logs

        // first delete the original contents
        Customers.deleteAll()
        println()

        // batch-insert the DataFrame back into the SQL database as a sequence of rows
        Customers.batchInsert(df.asSequence()) { dfRow ->
            // copy each value of the row into the matching column of the Exposed statement,
            // looking it up in the DataFrame row by the SQL column name
            for (column in Customers.columns) {
                @Suppress("UNCHECKED_CAST")
                val target = column as Column<Any?>
                this[target] = dfRow[column.name]
            }
        }
    }
}
@@ -0,0 +1,97 @@
package org.jetbrains.kotlinx.dataframe.examples.exposed
import org.jetbrains.exposed.v1.core.Column
import org.jetbrains.exposed.v1.core.Table
import org.jetbrains.kotlinx.dataframe.annotations.ColumnName
import org.jetbrains.kotlinx.dataframe.annotations.DataSchema
import org.jetbrains.kotlinx.dataframe.api.generateDataClasses
import org.jetbrains.kotlinx.dataframe.api.print
/**
 * Exposed [Table] definition for albums.
 *
 * Declares the SQL columns "AlbumId", "Title" and "ArtistId"; [albumId] is auto-incrementing
 * and forms the primary key.
 */
object Albums : Table() {
val albumId: Column<Int> = integer("AlbumId").autoIncrement()
val title: Column<String> = varchar("Title", 160)
val artistId: Column<Int> = integer("ArtistId")
override val primaryKey = PrimaryKey(albumId)
}
/**
 * Exposed [Table] definition for artists.
 *
 * Declares the SQL columns "ArtistId" and "Name"; [artistId] is auto-incrementing
 * and forms the primary key.
 */
object Artists : Table() {
val artistId: Column<Int> = integer("ArtistId").autoIncrement()
val name: Column<String> = varchar("Name", 120)
override val primaryKey = PrimaryKey(artistId)
}
/**
 * Exposed [Table] definition for customers.
 *
 * Each property is declared with an explicit `Column<...>` type and its SQL column name.
 * Columns created with `.nullable()` have a nullable type argument; [customerId] is
 * auto-incrementing and forms the primary key.
 *
 * The property names and declared types here correspond to the fields of [DfCustomers].
 */
object Customers : Table() {
val customerId: Column<Int> = integer("CustomerId").autoIncrement()
val firstName: Column<String> = varchar("FirstName", 40)
val lastName: Column<String> = varchar("LastName", 20)
val company: Column<String?> = varchar("Company", 80).nullable()
val address: Column<String?> = varchar("Address", 70).nullable()
val city: Column<String?> = varchar("City", 40).nullable()
val state: Column<String?> = varchar("State", 40).nullable()
val country: Column<String?> = varchar("Country", 40).nullable()
val postalCode: Column<String?> = varchar("PostalCode", 10).nullable()
val phone: Column<String?> = varchar("Phone", 24).nullable()
val fax: Column<String?> = varchar("Fax", 24).nullable()
val email: Column<String> = varchar("Email", 60)
val supportRepId: Column<Int?> = integer("SupportRepId").nullable()
override val primaryKey = PrimaryKey(customerId)
}
/**
 * Exposed requires [Table] instances for type-safe access to columns and data.
 *
 * DataFrame can infer types at runtime, which is enough for Kotlin Notebook,
 * but type-safe access at compile time needs a [@DataSchema][DataSchema] definition.
 *
 * This entry point uses [toDataFrameSchema] (via [toDataFrameSchemaWithNameNormalizer])
 * to print such a definition for [Customers].
 */
fun main() {
    val schemaAndNormalizer = Customers.toDataFrameSchemaWithNameNormalizer()
    val schema = schemaAndNormalizer.first
    val nameNormalizer = schemaAndNormalizer.second

    // uncomment to check whether the schema was converted correctly
    // schema.print()

    // Generate a @DataSchema data class that can be copy-pasted into the code.
    // The NameNormalizer makes DataFrame generate the same accessor names as in the Table
    // while keeping the correct column names.
    val generated = schema.generateDataClasses(
        markerName = "DfCustomers",
        nameNormalizer = nameNormalizer,
    )
    generated.print()
}
// Generated from the Customers table: the `main` function above prints this class
// using Customers.toDataFrameSchemaWithNameNormalizer().
// The same can be done for the other tables.
/**
 * DataFrame [@DataSchema][DataSchema] describing one row of [Customers].
 *
 * Each [ColumnName] annotation carries the SQL column name, while the property name
 * matches the corresponding accessor on [Customers]. Nullability follows the
 * column declarations in [Customers].
 */
@DataSchema
data class DfCustomers(
@ColumnName("Address")
val address: String?,
@ColumnName("City")
val city: String?,
@ColumnName("Company")
val company: String?,
@ColumnName("Country")
val country: String?,
@ColumnName("CustomerId")
val customerId: Int,
@ColumnName("Email")
val email: String,
@ColumnName("Fax")
val fax: String?,
@ColumnName("FirstName")
val firstName: String,
@ColumnName("LastName")
val lastName: String,
@ColumnName("Phone")
val phone: String?,
@ColumnName("PostalCode")
val postalCode: String?,
@ColumnName("State")
val state: String?,
@ColumnName("SupportRepId")
val supportRepId: Int?,
)