init research

This commit is contained in:
2026-02-08 11:20:43 -10:00
commit bdf064f54d
3041 changed files with 1592200 additions and 0 deletions
+39
View File
@@ -0,0 +1,39 @@
## :samples
Code samples, as well as DataFrame iframes and Kandy plot images, for the
[documentation website](https://github.com/Kotlin/dataframe).
### Korro
Saves code samples using [Korro](https://github.com/devcrocod/korro).
To save or update samples:
* Run the `korroClean` and `korro` Gradle tasks.
**Important**: Korro may not work correctly until the
[migration from `:core` is finished](https://github.com/Kotlin/dataframe/issues/898).
Until then, run the Korro tasks for the whole project.
### SampleHelper
[`SampleHelper`](https://github.com/Kotlin/kandy/blob/samples_util/util/kandy-samples-utils/README.md)
allows you to save the resulting Kandy plots as SVG images and DataFrames as iframes.
Running tests in this module will save or update these samples.
**Important**:
1) If a sample has changed, verify that the change is intentional and correct.
You can track it with the Git file changes tracker in IDEA.
2) Add all iframes as resources in [this file](../docs/StardustDocs/topics/_shadow_resources.md).
Run [this script](https://github.com/Kotlin/kandy/blob/samples_util/util/kandy-samples-utils/README.md#how-to-use)
to update them.
### Notebook-To-Doc
A Kotlin notebook can be easily converted to documentation using
[this script](https://github.com/Kotlin/kandy/blob/samples_util/util/kandy-samples-utils/README.md#how-to-use).
It produces two files: `.kt` and `.md`.
* Place the `.kt` file in the tests of this module and run it.
* Place the `.md` file in the [docs topics directory](../docs/StardustDocs/topics).
* Run the Korro tasks.
+142
View File
@@ -0,0 +1,142 @@
import org.gradle.kotlin.dsl.dependencies
import org.gradle.kotlin.dsl.exclude
import org.gradle.kotlin.dsl.implementation
import org.gradle.kotlin.dsl.invoke
import org.gradle.kotlin.dsl.java
import org.gradle.kotlin.dsl.korro
import org.gradle.kotlin.dsl.kotlin
import org.gradle.kotlin.dsl.libs
import org.gradle.kotlin.dsl.main
import org.gradle.kotlin.dsl.projects
import org.gradle.kotlin.dsl.repositories
import org.gradle.kotlin.dsl.runKtlintCheckOverMainSourceSet
import org.gradle.kotlin.dsl.runKtlintCheckOverTestSourceSet
import org.gradle.kotlin.dsl.runKtlintFormatOverMainSourceSet
import org.gradle.kotlin.dsl.runKtlintFormatOverTestSourceSet
import org.gradle.kotlin.dsl.sourceSets
import org.gradle.kotlin.dsl.test
import org.gradle.kotlin.dsl.testImplementation
import org.jetbrains.kotlin.gradle.tasks.KotlinCompile
// Plugins applied to this module: the `kotlinJvm11` and `ktlint` aliases from `convention.plugins`,
// and the `korro` and `dataframe.compiler.plugin` aliases from `libs.plugins`.
plugins {
with(convention.plugins) {
alias(kotlinJvm11)
alias(ktlint)
}
with(libs.plugins) {
alias(korro)
alias(dataframe.compiler.plugin)
}
}
// Modules this build script depends on as jars: their `jar` tasks are wired into compilation and
// their `instrumentedJars` artifacts / `api` dependencies are consumed further below.
// The type-safe project accessors are resolved to actual `Project` instances via their paths.
val dependentProjects = listOf(
    projects.core,
    projects.dataframeArrow,
    projects.dataframeExcel,
    projects.dataframeJdbc,
    projects.dataframeCsv,
    projects.dataframeJson,
).map { accessor -> project(accessor.path) }
// Configure every Kotlin compilation of this module in a single, lazy `configureEach` block
// (the eager `withType<T> { ... }` action form realizes all matching tasks at configuration time).
tasks.withType<KotlinCompile>().configureEach {
    // The compile classpath consumes the jars of the dependent modules (see `dependentProjectJarPaths`),
    // so their `jar` tasks must have run first.
    dependentProjects.forEach { dependency ->
        dependsOn("${dependency.path}:jar")
    }
    // Registers the `:core` project directory as a friend path
    // (presumably so samples can access `internal` declarations of `:core` — confirm).
    friendPaths.from(project(projects.core.path).projectDir)
}
// Absolute paths of the jar files of the compiled dependent modules, read from each module's
// `instrumentedJars` configuration (all modules with a jar task have this artifact in the DataFrame project).
// `single()` requires exactly one artifact per module; separators are normalized to '/'.
val dependentProjectJarPaths = dependentProjects.map { dependency ->
    val instrumentedJar = dependency.configurations
        .getByName("instrumentedJars")
        .artifacts
        .single()
        .file
    instrumentedJar.absolutePath.replace(File.separatorChar, '/')
}
dependencies {
    runtimeOnly(projects.dataframe) // Must depend on jars for the compiler plugin to work!
    implementation(files(dependentProjectJarPaths))

    // The jars above do not carry the api() dependencies of their modules, so every external
    // module dependency found in those `api` configurations is re-declared here.
    // A dependency without an explicit version falls back to the dynamic version "+".
    dependentProjects
        .flatMap { dependency -> dependency.configurations.getByName("api").dependencies }
        .filterIsInstance<ExternalModuleDependency>()
        .forEach { external ->
            implementation("${external.group}:${external.name}:${external.version ?: "+"}")
        }

    testImplementation(libs.junit)
    testImplementation(libs.kotestAssertions) {
        exclude("org.jetbrains.kotlin", "kotlin-stdlib-jdk8")
    }

    // Kandy and its sample utilities are added without their own `dataframe` dependency.
    testImplementation(libs.kandy) {
        exclude("org.jetbrains.kotlinx", "dataframe")
    }
    testImplementation(libs.kandy.samples.utils) {
        exclude("org.jetbrains.kotlinx", "dataframe")
    }

    testImplementation(libs.kotlin.datetimeJvm)
    testImplementation(libs.poi)
    testImplementation(libs.arrow.vector)
}
// Korro configuration: which documentation topics are processed, which test sources provide the
// samples, and how samples belonging to one group are wrapped into `<tabs>` / `<tab>` markup.
korro {
    docs = fileTree(rootProject.rootDir) {
        include(
            "docs/StardustDocs/topics/DataSchema-Data-Classes-Generation.md",
            "docs/StardustDocs/topics/read.md",
            "docs/StardustDocs/topics/write.md",
            "docs/StardustDocs/topics/rename.md",
            "docs/StardustDocs/topics/format.md",
            "docs/StardustDocs/topics/toHTML.md",
            "docs/StardustDocs/topics/guides/*.md",
            "docs/StardustDocs/topics/operations/utils/*.md",
            "docs/StardustDocs/topics/operations/multiple/*.md",
            "docs/StardustDocs/topics/operations/column/*.md",
            "docs/StardustDocs/topics/collectionsInterop/*.md",
            "docs/StardustDocs/topics/dataSources/sql/*.md",
            "docs/StardustDocs/topics/info/*.md",
        )
    }

    samples = fileTree(project.projectDir) {
        include(
            "src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/*.kt",
            "src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/*.kt",
            "src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/utils/*.kt",
            "src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/multiple/*.kt",
            "src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/render/*.kt",
            "src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/collectionsInterop/*.kt",
            "src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/column/*.kt",
            "src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/info/*.kt",
            "src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/guides/*.kt",
            "src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/io/*.kt",
        )
    }

    groupSamples {
        beforeSample = "<tab title=\"NAME\">\n"
        afterSample = "\n</tab>"

        // The NAME placeholder of the tab title is replaced depending on the sample function's suffix.
        listOf(
            "_properties" to "Properties",
            "_accessors" to "Accessors",
            "_strings" to "Strings",
        ).forEach { (suffix, tabTitle) ->
            funSuffix(suffix) {
                replaceText("NAME", tabTitle)
            }
        }

        beforeGroup = "<tabs>\n"
        afterGroup = "</tabs>"
    }
}
// Run tests with `java.base/java.nio` opened to unnamed modules
// (presumably required by Apache Arrow's memory access on recent JDKs — confirm).
tasks.test.configure {
    jvmArgs = listOf("--add-opens", "java.base/java.nio=ALL-UNNAMED")
}
@@ -0,0 +1,20 @@
package org.jetbrains.kotlinx.dataframe.samples
import org.jetbrains.kotlinx.dataframe.DataColumn
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
import org.jetbrains.kotlinx.dataframe.samples.api.TestBase
import org.jetbrains.kotlinx.kandy.letsplot.samples.SampleHelper
/**
 * Base class for the documentation sample tests of this module.
 *
 * Forwards [sampleName] and [subFolder] to [SampleHelper] together with two fixed relative paths,
 * `../docs/StardustDocs/images` and `../docs/StardustDocs/resources`
 * (presumably the output directories for saved images and iframes — confirm against `SampleHelper`).
 * Also mixes in [TestBase].
 *
 * @param sampleName name of the sample set, passed through to [SampleHelper].
 * @param subFolder sub-folder passed through to [SampleHelper]; defaults to `"samples"`.
 */
abstract class DataFrameSampleHelper(sampleName: String, subFolder: String = "samples") :
SampleHelper(
sampleName,
subFolder,
"../docs/StardustDocs/images",
"../docs/StardustDocs/resources",
),
TestBase {
/** Converts this column to a DataFrame via [toDataFrame] and calls `saveDfHtmlSample()` on the result. */
fun DataColumn<*>.saveDfHtmlSample() {
toDataFrame().saveDfHtmlSample()
}
}
@@ -0,0 +1,103 @@
@file:Suppress("UNUSED_VARIABLE", "unused", "UNCHECKED_CAST", "ktlint", "ClassName")
package org.jetbrains.kotlinx.dataframe.samples.api
import org.jetbrains.kotlinx.dataframe.annotations.DataSchema
import org.jetbrains.kotlinx.dataframe.api.add
import org.jetbrains.kotlinx.dataframe.api.all
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
import org.jetbrains.kotlinx.dataframe.api.filter
import org.jetbrains.kotlinx.dataframe.api.generateDataClasses
import org.jetbrains.kotlinx.dataframe.api.generateInterfaces
import org.jetbrains.kotlinx.dataframe.api.into
import org.jetbrains.kotlinx.dataframe.api.rename
import org.jetbrains.kotlinx.dataframe.api.sumOf
import org.jetbrains.kotlinx.dataframe.api.toList
import org.jetbrains.kotlinx.dataframe.samples.DataFrameSampleHelper
import org.junit.Test
/**
 * Sample tests registered under sample name `generate_docs` in sub-folder `api`.
 *
 * NOTE(review): the code between `// SampleStart` and `// SampleEnd` is presumably copied verbatim
 * into the documentation by Korro — keep explanatory comments outside of those markers.
 */
class Generate : DataFrameSampleHelper("generate_docs", "api") {
/** Schema of one order: an id and an amount. */
@DataSchema
data class Orders(
val orderId: Int,
val amount: Double,
)
// Two orders, used as the nested `orders` value of the "Alice" row of [df].
private val ordersAlice = dataFrameOf(
"orderId" to listOf(101, 102),
"amount" to listOf(50.0, 75.5),
).cast<Orders>()
// Three orders, used as the nested `orders` value of the "Bob" row of [df].
private val ordersBob = dataFrameOf(
"orderId" to listOf(103, 104, 105),
"amount" to listOf(20.0, 30.0, 25.0),
).cast<Orders>()
/** Schema of one customer: a user name and that user's orders. */
@DataSchema
data class Customer(
val user: String,
val orders: List<Orders>,
)
// DataFrame shared by all samples below: one row per user, with the order frames above as `orders`.
private val df = dataFrameOf(
"user" to listOf("Alice", "Bob"),
"orders" to listOf(ordersAlice, ordersBob),
).cast<Customer>()
/** Saves [df] itself as an HTML sample. */
@Test
fun notebook_test_generate_docs_1() {
// SampleStart
df
// SampleEnd
.saveDfHtmlSample()
}
/** Sample for `generateInterfaces()` without arguments; the result is not checked or saved. */
@Test
fun notebook_test_generate_docs_2() {
// SampleStart
df.generateInterfaces()
// SampleEnd
}
/** Sample filtering customers whose orders all have `orderId >= 102`; saving the result is commented out. */
@Test
fun notebook_test_generate_docs_3() {
// SampleStart
df.filter { orders.all { orderId >= 102 } }
// SampleEnd
// .saveDfHtmlSample()
}
/** Sample for `generateDataClasses("Customer")`; the result is not checked or saved. */
@Test
fun notebook_test_generate_docs_4() {
// SampleStart
df.generateDataClasses("Customer")
// SampleEnd
}
/** Sample converting [df] into a list of [Customer] objects. */
@Test
fun notebook_test_generate_docs_5() {
// SampleStart
val customers: List<Customer> = df.cast<Customer>().toList()
// SampleEnd
}
/** Sample for `generateInterfaces` with an explicit marker name; the result is not checked or saved. */
@Test
fun notebook_test_generate_docs_6() {
// SampleStart
df.generateInterfaces(markerName = "Customer")
// SampleEnd
}
/** Sample chaining `add`, `filter` and `rename` on the cast frame; saving the result is commented out. */
@Test
fun notebook_test_generate_docs_7() {
// SampleStart
df.cast<Customer>()
.add("ordersTotal") { orders.sumOf { it.amount } }
.filter { user.startsWith("A") }
.rename { user }.into("customer")
// SampleEnd
// .saveDfHtmlSample()
}
}
@@ -0,0 +1,99 @@
package org.jetbrains.kotlinx.dataframe.samples.api
import org.jetbrains.kotlinx.dataframe.DataColumn
import org.jetbrains.kotlinx.dataframe.api.FormattingDsl
import org.jetbrains.kotlinx.dataframe.api.and
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
import org.jetbrains.kotlinx.dataframe.api.format
import org.jetbrains.kotlinx.dataframe.api.getColumnIndex
import org.jetbrains.kotlinx.dataframe.api.linearBg
import org.jetbrains.kotlinx.dataframe.api.max
import org.jetbrains.kotlinx.dataframe.api.min
import org.jetbrains.kotlinx.dataframe.api.notNull
import org.jetbrains.kotlinx.dataframe.api.perRowCol
import org.jetbrains.kotlinx.dataframe.api.with
import org.jetbrains.kotlinx.dataframe.samples.DataFrameSampleHelper
import org.junit.Test
/**
 * Formatting samples registered under sample name `operations` in sub-folder `modify`.
 *
 * NOTE(review): the code between `// SampleStart` and `// SampleEnd` is presumably copied verbatim
 * into the documentation by Korro — keep explanatory comments outside of those markers.
 */
@Suppress("ktlint:standard:argument-list-wrapping")
class Modify : DataFrameSampleHelper("operations", "modify") {
// People frame provided by TestBase; used by the two `formatExample_*` samples.
val df = peopleDf
// Frame with ten Int columns `col1`..`col10` and ten rows of values; used by [formatExampleNumbers].
private val df2 = dataFrameOf(
"col1", "col2", "col3", "col4", "col5", "col6", "col7", "col8", "col9", "col10",
)(
45, 12, 78, 34, 90, 23, 67, 89, 56, 43,
87, 34, 56, 78, 12, 45, 90, 23, 67, 89,
23, 67, 89, 45, 78, 90, 12, 56, 34, 78,
90, 45, 23, 67, 34, 78, 89, 12, 56, 23,
12, 89, 45, 90, 56, 34, 78, 67, 23, 90,
78, 56, 12, 23, 89, 67, 34, 90, 45, 12,
34, 90, 67, 12, 45, 23, 56, 78, 89, 67,
56, 23, 34, 89, 67, 12, 45, 34, 78, 90,
89, 78, 90, 56, 23, 89, 67, 45, 12, 34,
67, 45, 78, 12, 90, 56, 23, 89, 34, 78,
)
/**
 * Formatting sample that addresses columns by name strings and casts values manually
 * (hence the `UNCHECKED_CAST` suppression); the formatted frame is saved as an HTML sample.
 */
@Suppress("UNCHECKED_CAST")
@Test
fun formatExample_strings() {
// SampleStart
val ageMin = df.min { "age"<Int>() }
val ageMax = df.max { "age"<Int>() }
df
.format().with { bold and textColor(black) and background(white) }
.format("name").with { underline }
.format { "name"["lastName"] }.with { italic }
.format("isHappy").with {
background(if (it as Boolean) green else red)
}
.format("weight").notNull().with { linearBg(it as Int, 50 to blue, 90 to red) }
.format("age").perRowCol { row, col ->
col as DataColumn<Int>
textColor(
linear(value = col[row], from = ageMin to blue, to = ageMax to green),
)
}
// SampleEnd
.saveDfHtmlSample()
}
/**
 * Same formatting chain as [formatExample_strings], written with typed column properties
 * instead of name strings; the formatted frame is saved as an HTML sample.
 */
@Test
fun formatExample_properties() {
// SampleStart
val ageMin = df.age.min()
val ageMax = df.age.max()
df
.format().with { bold and textColor(black) and background(white) }
.format { name }.with { underline }
.format { name.lastName }.with { italic }
.format { isHappy }.with { background(if (it) green else red) }
.format { weight }.notNull().linearBg(50 to FormattingDsl.blue, 90 to FormattingDsl.red)
.format { age }.perRowCol { row, col ->
textColor(
linear(value = col[row], from = ageMin to blue, to = ageMax to green),
)
}
// SampleEnd
.saveDfHtmlSample()
}
/**
 * Formats [df2] per cell: cells where `(rowIndex - colIndex) % 3 == 0` get a dark gray background
 * with white text, all others a white background with black text. The result is saved as an HTML sample.
 */
@Test
fun formatExampleNumbers() {
// SampleStart
df2.format().perRowCol { row, col ->
val rowIndex = row.index()
val colIndex = row.df().getColumnIndex(col)
if ((rowIndex - colIndex) % 3 == 0) {
background(darkGray) and textColor(white)
} else {
background(white) and textColor(black)
}
}
// SampleEnd
.saveDfHtmlSample()
}
}
@@ -0,0 +1,156 @@
@file:Suppress("ktlint")
package org.jetbrains.kotlinx.dataframe.samples.api
import io.deephaven.csv.parsers.Parsers
import io.kotest.matchers.shouldBe
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.DataRow
import org.jetbrains.kotlinx.dataframe.api.ParserOptions
import org.jetbrains.kotlinx.dataframe.api.columnNames
import org.jetbrains.kotlinx.dataframe.api.columnTypes
import org.jetbrains.kotlinx.dataframe.io.ColType
import org.jetbrains.kotlinx.dataframe.io.StringColumns
import org.jetbrains.kotlinx.dataframe.io.readArrowFeather
import org.jetbrains.kotlinx.dataframe.io.readCsv
import org.jetbrains.kotlinx.dataframe.io.readExcel
import org.jetbrains.kotlinx.dataframe.io.readJson
import org.jetbrains.kotlinx.dataframe.testArrowFeather
import org.jetbrains.kotlinx.dataframe.testCsv
import org.jetbrains.kotlinx.dataframe.testJson
import org.junit.Ignore
import org.junit.Test
import java.time.format.DateTimeFormatter
import java.util.Locale
import kotlin.reflect.typeOf
/**
 * Samples for reading DataFrames from CSV, JSON, Excel and Arrow Feather inputs.
 *
 * NOTE(review): the code between `// SampleStart` and `// SampleEnd` is presumably copied verbatim
 * into the documentation by Korro — keep explanatory comments outside of those markers.
 */
class Read {
/** Reads the `syntheticSample` CSV with a custom delimiter, header and null strings, then checks shape and types. */
@Test
fun readCsvCustom() {
val file = testCsv("syntheticSample")
// SampleStart
val df = DataFrame.readCsv(
file,
delimiter = '|',
header = listOf("A", "B", "C", "D"),
parserOptions = ParserOptions(nullStrings = setOf("not assigned")),
)
// SampleEnd
df.rowsCount() shouldBe 3
df.columnNames() shouldBe listOf("A", "B", "C", "D")
df["A"].type() shouldBe typeOf<Int>()
df["D"].type() shouldBe typeOf<Boolean?>()
}
/** Reads the `synthetic` JSON as a DataFrame, then checks row count, column names and some column types. */
@Test
fun readJson() {
val file = testJson("synthetic")
// SampleStart
val df = DataFrame.readJson(file)
// SampleEnd
df.rowsCount() shouldBe 4
df.columnNames() shouldBe listOf("A", "B", "C", "D")
df["A"].type() shouldBe typeOf<String>()
df["B"].type() shouldBe typeOf<Int>()
df["D"].type() shouldBe typeOf<Boolean?>()
}
/** Reads the `syntheticObj` JSON as a single DataRow, then checks column names and types. */
@Test
fun readJsonRow() {
val file = testJson("syntheticObj")
// SampleStart
val row = DataRow.readJson(file)
// SampleEnd
row.columnNames() shouldBe listOf("A", "B", "C", "D")
row.columnTypes() shouldBe listOf(typeOf<String>(), typeOf<Int>(), typeOf<Float>(), typeOf<Boolean>())
}
/** Ignored sample: reads `mixed_column.xlsx` with column "A" passed as `stringColumns`. */
@Test
@Ignore
fun fixMixedColumn() {
// SampleStart
val df = DataFrame.readExcel("mixed_column.xlsx", stringColumns = StringColumns("A"))
// SampleEnd
}
/** Reads an Arrow Feather test file, then checks row and column counts. */
@Test
fun readArrowFeather() {
val file = testArrowFeather("data-arrow_2.0.0_uncompressed")
// SampleStart
val df = DataFrame.readArrowFeather(file)
// SampleEnd
df.rowsCount() shouldBe 1
df.columnsCount() shouldBe 4
}
/** Reads the `numbers` CSV with `Locale.UK` in the parser options; the result is not checked. */
@Test
fun readNumbersWithSpecificLocale() {
val file = testCsv("numbers")
// SampleStart
val df = DataFrame.readCsv(
file,
parserOptions = ParserOptions(locale = Locale.UK),
)
// SampleEnd
}
/** Reads the `numbers` CSV with column "colName" mapped to `ColType.String`; the result is not checked. */
@Test
fun readNumbersWithColType() {
val file = testCsv("numbers")
// SampleStart
val df = DataFrame.readCsv(
file,
colTypes = mapOf("colName" to ColType.String),
)
// SampleEnd
}
/** Reads the `dates` CSV with a date-time pattern string in the parser options; the result is not checked. */
@Test
fun readDatesWithSpecificDateTimePattern() {
val file = testCsv("dates")
// SampleStart
val df = DataFrame.readCsv(
file,
parserOptions = ParserOptions(dateTimePattern = "dd/MMM/yy h:mm a")
)
// SampleEnd
}
/** Reads the `dates` CSV with a `DateTimeFormatter` in the parser options; the result is not checked. */
@Test
fun readDatesWithSpecificDateTimeFormatter() {
val file = testCsv("dates")
// SampleStart
val df = DataFrame.readCsv(
file,
parserOptions = ParserOptions(dateTimeFormatter = DateTimeFormatter.ofPattern("dd/MMM/yy h:mm a"))
)
// SampleEnd
}
/** Reads the `dates` CSV with `ColType.DEFAULT` mapped to `ColType.String`; the result is not checked. */
@Test
fun readDatesWithDefaultType() {
val file = testCsv("dates")
// SampleStart
val df = DataFrame.readCsv(
file,
colTypes = mapOf(ColType.DEFAULT to ColType.String),
)
// SampleEnd
}
/**
 * Reads the `dates` CSV from an input stream, registering `Parsers.DATETIME` for the "date" column.
 *
 * NOTE(review): every `Exception` thrown by the sample is swallowed, so this test cannot fail on a
 * read error — confirm this is intentional. The stream from `openStream()` is also not closed here;
 * whether `readCsv` closes it is not visible in this file.
 */
@Test
fun readDatesWithDeephavenDateTimeParser() {
val file = testCsv("dates")
try {
// SampleStart
val df = DataFrame.readCsv(
inputStream = file.openStream(),
adjustCsvSpecs = { // it: CsvSpecs.Builder
it.putParserForName("date", Parsers.DATETIME)
},
)
// SampleEnd
} catch (_: Exception) {
}
}
}
@@ -0,0 +1,42 @@
@file:Suppress("PropertyName", "unused", "ktlint")
package org.jetbrains.kotlinx.dataframe.samples.api
import org.jetbrains.kotlinx.dataframe.api.column
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
import org.jetbrains.kotlinx.dataframe.api.rename
import org.jetbrains.kotlinx.dataframe.api.renameToCamelCase
import org.jetbrains.kotlinx.dataframe.api.toCamelCase
import org.jetbrains.kotlinx.dataframe.samples.DataFrameSampleHelper
import org.junit.Test
/**
 * Camel-case renaming samples registered under sample name `rename` in sub-folder `api`.
 *
 * NOTE(review): the code between `// SampleStart` and `// SampleEnd` is presumably copied verbatim
 * into the documentation by Korro — keep explanatory comments outside of those markers.
 */
class RenameToCamelCase : DataFrameSampleHelper("rename", "api") {
// Two rows with three differently-cased column names: "ColumnA", "column_b" and "COLUMN-C".
private val df = dataFrameOf("ColumnA", "column_b", "COLUMN-C")(1, "a", true, 2, "b", false)
// Column accessors used by the rename sample below.
// NOTE(review): both are declared as column<String>(), while the values supplied above for these
// columns are Int (1, 2) and Boolean (true, false) — confirm the declared types are intentional.
val ColumnA by column<String>()
val `COLUMN-C` by column<String>()
/** Saves the unmodified [df] as an HTML sample. */
@Test
fun notebook_test_rename_3() {
// SampleStart
df
// SampleEnd
.saveDfHtmlSample()
}
/** Renames only the columns selected via the two accessors with `toCamelCase()` and saves the result. */
@Test
fun notebook_test_rename_4() {
// SampleStart
df.rename { ColumnA and `COLUMN-C` }.toCamelCase()
// SampleEnd
.saveDfHtmlSample()
}
/** Applies `renameToCamelCase()` to the whole frame and saves the result. */
@Test
fun notebook_test_rename_5() {
// SampleStart
df.renameToCamelCase()
// SampleEnd
.saveDfHtmlSample()
}
}
@@ -0,0 +1,273 @@
package org.jetbrains.kotlinx.dataframe.samples.api
import org.jetbrains.kotlinx.dataframe.AnyFrame
import org.jetbrains.kotlinx.dataframe.api.chunked
import org.jetbrains.kotlinx.dataframe.api.columnOf
import org.jetbrains.kotlinx.dataframe.api.convert
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
import org.jetbrains.kotlinx.dataframe.api.duplicateRows
import org.jetbrains.kotlinx.dataframe.api.forEachIndexed
import org.jetbrains.kotlinx.dataframe.api.reorderColumnsByName
import org.jetbrains.kotlinx.dataframe.api.sortBy
import org.jetbrains.kotlinx.dataframe.api.sortByDesc
import org.jetbrains.kotlinx.dataframe.api.with
import org.jetbrains.kotlinx.dataframe.dataTypes.IFRAME
import org.jetbrains.kotlinx.dataframe.dataTypes.IMG
import org.jetbrains.kotlinx.dataframe.indices
import org.jetbrains.kotlinx.dataframe.io.DataFrameHtmlData
import org.jetbrains.kotlinx.dataframe.io.DisplayConfiguration
import org.jetbrains.kotlinx.dataframe.io.toHtml
import org.jetbrains.kotlinx.dataframe.io.toStandaloneHtml
import org.jetbrains.kotlinx.dataframe.jupyter.ChainedCellRenderer
import org.jetbrains.kotlinx.dataframe.jupyter.DefaultCellRenderer
import org.jetbrains.kotlinx.dataframe.jupyter.RenderedContent
import org.jetbrains.kotlinx.dataframe.samples.DataFrameSampleHelper
import org.junit.Ignore
import org.junit.Test
import java.io.File
import java.net.URI
import kotlin.io.path.Path
/**
 * HTML rendering samples registered under sample name `toHTML` in sub-folder `api`.
 *
 * NOTE(review): the code between `// SampleStart` and `// SampleEnd` is presumably copied verbatim
 * into the documentation by Korro — keep explanatory comments outside of those markers.
 */
class Render : DataFrameSampleHelper("toHTML", "api") {
// Untyped people frame (nested `name` group plus age, city, weight, isHappy) shared by several samples.
private val df: AnyFrame = dataFrameOf(
"name" to columnOf(
"firstName" to columnOf("Alice", "Bob", "Charlie", "Charlie", "Bob", "Alice", "Charlie"),
"lastName" to columnOf("Cooper", "Dylan", "Daniels", "Chaplin", "Marley", "Wolf", "Byrd"),
),
"age" to columnOf(15, 45, 20, 40, 30, 20, 30),
"city" to columnOf("London", "Dubai", "Moscow", "Milan", "Tokyo", null, "Moscow"),
"weight" to columnOf(54, 87, null, null, 68, 55, 90),
"isHappy" to columnOf(true, true, false, true, true, false, true),
)
/** Ignored sample: opens the standalone HTML in a browser and writes it to placeholder paths. */
@Test
@Ignore
fun useRenderingResult() {
// SampleStart
val configuration = DisplayConfiguration(rowsLimit = null)
df.toStandaloneHtml(configuration).openInBrowser()
df.toStandaloneHtml(configuration).writeHtml(File("/path/to/file"))
df.toStandaloneHtml(configuration).writeHtml(Path("/path/to/file"))
// SampleEnd
}
/** Combines the HTML of three differently ordered frames into one output, addressing columns by name strings. */
@Test
fun composeTables_strings() {
// Local frame that shadows the class-level `df` for this sample.
val df = dataFrameOf(
"name" to columnOf(
"firstName" to columnOf("Alice", "Bob", "Charlie", "Charlie", "Bob", "Alice", "Charlie"),
"lastName" to columnOf("Cooper", "Dylan", "Daniels", "Chaplin", "Marley", "Wolf", "Byrd"),
),
"age" to columnOf(15, 45, 20, 40, 30, 20, 30),
"city" to columnOf("London", "Dubai", "Moscow", "Milan", "Tokyo", null, "Moscow"),
"weight" to columnOf(54, 87, null, null, 68, 55, 90),
"isHappy" to columnOf(true, true, false, true, true, false, true),
)
// SampleStart
val df1 = df.reorderColumnsByName()
val df2 = df.sortBy("age")
val df3 = df.sortByDesc("age")
listOf(df1, df2, df3).fold(DataFrameHtmlData.tableDefinitions()) { acc, df ->
acc + df.toHtml()
}
// SampleEnd
}
/** Same as [composeTables_strings], addressing the `age` column as a typed property. */
@Test
fun composeTables_properties() {
// Local frame that shadows the class-level `df` for this sample.
val df = dataFrameOf(
"name" to columnOf(
"firstName" to columnOf("Alice", "Bob", "Charlie", "Charlie", "Bob", "Alice", "Charlie"),
"lastName" to columnOf("Cooper", "Dylan", "Daniels", "Chaplin", "Marley", "Wolf", "Byrd"),
),
"age" to columnOf(15, 45, 20, 40, 30, 20, 30),
"city" to columnOf("London", "Dubai", "Moscow", "Milan", "Tokyo", null, "Moscow"),
"weight" to columnOf(54, 87, null, null, 68, 55, 90),
"isHappy" to columnOf(true, true, false, true, true, false, true),
)
// SampleStart
val df1 = df.reorderColumnsByName()
val df2 = df.sortBy { age }
val df3 = df.sortByDesc { age }
listOf(df1, df2, df3).fold(DataFrameHtmlData.tableDefinitions()) { acc, df -> acc + df.toHtml() }
// SampleEnd
}
/** Renders [df] with `cellContentLimit = -1`; the result is not checked or saved. */
@Test
fun configureCellOutput() {
// SampleStart
df.toHtml(DisplayConfiguration(cellContentLimit = -1))
// SampleEnd
}
/** Builds standalone HTML for a frame holding one `IMG` value; opening it in a browser is commented out. */
@Test
fun displayImg() {
// SampleStart
val htmlData = dataFrameOf(
"kotlinLogo" to columnOf(
IMG("https://kotlin.github.io/dataframe/images/kotlin-logo.svg"),
),
).toStandaloneHtml()
// SampleEnd
// .openInBrowser()
}
/** Builds standalone HTML for a frame holding one `IFRAME` value; opening it in a browser is commented out. */
@Test
fun displayIFrame() {
// SampleStart
val htmlData = dataFrameOf(
"documentationPages" to columnOf(
IFRAME(
src = "https://kotlin.github.io/dataframe/tohtml.html",
width = 850,
height = 500,
),
),
).toStandaloneHtml()
// SampleEnd
// .openInBrowser()
}
/** Builds standalone HTML for a frame holding three `URL` values; opening it in a browser is commented out. */
@Test
fun displayURL() {
// SampleStart
val htmlData = dataFrameOf(
"documentationPages" to columnOf(
URI("https://kotlin.github.io/dataframe/format.html").toURL(),
URI("https://kotlin.github.io/dataframe/tohtml.html").toURL(),
URI("https://kotlin.github.io/dataframe/jupyterrendering.html").toURL(),
),
).toStandaloneHtml()
// SampleEnd
// .openInBrowser()
}
/** Converts URL strings into `RenderedContent.media` anchor tags, addressing the column by name string. */
@Test
fun displayMediaContent_strings() {
// SampleStart
val htmlData = dataFrameOf(
"documentationPages" to columnOf(
"https://kotlin.github.io/dataframe/format.html",
"https://kotlin.github.io/dataframe/tohtml.html",
"https://kotlin.github.io/dataframe/jupyterrendering.html",
),
)
.convert { "documentationPages"<String>() }.with {
val uri = URI(it)
RenderedContent.media("""<a href='$uri'>${uri.path}</a>""")
}
.toStandaloneHtml()
// SampleEnd
// .openInBrowser()
}
/** Same as [displayMediaContent_strings], addressing the column as a typed property. */
@Test
fun displayMediaContent_properties() {
// SampleStart
val htmlData = dataFrameOf(
"documentationPages" to columnOf(
"https://kotlin.github.io/dataframe/format.html",
"https://kotlin.github.io/dataframe/tohtml.html",
"https://kotlin.github.io/dataframe/jupyterrendering.html",
),
)
.convert { documentationPages }.with {
val uri = URI(it)
RenderedContent.media("""<a href='$uri'>${uri.path}</a>""")
}
.toStandaloneHtml()
// SampleEnd
// .openInBrowser()
}
/**
 * Renders [df] with a custom cell renderer chained in front of `DefaultCellRenderer`.
 *
 * NOTE(review): both branches of the Boolean rendering below produce the same empty string, so
 * `true` and `false` render identically. The two literals look like glyphs lost in transit
 * (e.g. a check mark and a cross) — confirm against the intended sample text.
 */
@Test
fun cellRenderer() {
// SampleStart
class CustomArrayCellRenderer : ChainedCellRenderer(DefaultCellRenderer) {
override fun maybeContent(value: Any?, configuration: DisplayConfiguration): RenderedContent? {
if (value is Boolean) {
return RenderedContent.text(if (value) "" else "")
}
// return null to delegate work to parent renderer: DefaultCellRenderer
return null
}
override fun maybeTooltip(value: Any?, configuration: DisplayConfiguration): String? {
// return null to delegate work to parent renderer: DefaultCellRenderer
return null
}
}
val htmlData = df.toStandaloneHtml(cellRenderer = CustomArrayCellRenderer())
// SampleEnd
// .openInBrowser()
}
/** Sample showing the construction of the people frame used throughout these samples. */
@Test
fun df() {
// SampleStart
val df = dataFrameOf(
"name" to columnOf(
"firstName" to columnOf("Alice", "Bob", "Charlie", "Charlie", "Bob", "Alice", "Charlie"),
"lastName" to columnOf("Cooper", "Dylan", "Daniels", "Chaplin", "Marley", "Wolf", "Byrd"),
),
"age" to columnOf(15, 45, 20, 40, 30, 20, 30),
"city" to columnOf("London", "Dubai", "Moscow", "Milan", "Tokyo", null, "Moscow"),
"weight" to columnOf(54, 87, null, null, 68, 55, 90),
"isHappy" to columnOf(true, true, false, true, true, false, true),
)
// SampleEnd
}
/**
 * Splits a row-duplicated [df] into chunks of 20 rows and appends a navigation bar to each page's HTML.
 * Nothing is written to disk: the `writeHtml` call is commented out, so `output` is unused here.
 */
@Test
fun appendCustomHtml() {
// SampleStart
val pages = df.duplicateRows(10).chunked(20)
val files = pages.indices.map { i -> File("page$i.html") }
val navLinks = files.mapIndexed { i, file ->
"""<a href="${file.name}">Page ${i + 1}</a>"""
}.joinToString(" | ")
pages.forEachIndexed { i, page ->
val output = files[i]
page.toStandaloneHtml().plus(DataFrameHtmlData(body = navLinks))
// uncomment
// .writeHtml(output)
}
// SampleEnd
}
/** Appends custom CSS and a click-handler script to the standalone HTML of [df]; opening it is commented out. */
@Test
fun interactiveJs() {
// SampleStart
val selectCellInteraction = DataFrameHtmlData(
style =
"""
td:hover {
background-color: rgba(0, 123, 255, 0.15);
cursor: pointer;
}
""".trimIndent(),
script =
"""
(function() {
let cells = document.querySelectorAll('td');
cells.forEach(function(cell) {
cell.addEventListener('click', function(e) {
let content = cell.textContent;
alert(content);
});
});
})();
""".trimIndent(),
)
// keep in mind JS script initialization order.
val htmlData = df.toStandaloneHtml().plus(selectCellInteraction)
// SampleEnd
// .openInBrowser()
}
}
@@ -0,0 +1,47 @@
@file:Suppress("ktlint")
package org.jetbrains.kotlinx.dataframe.samples.api
import io.kotest.matchers.shouldBe
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.DataRow
import org.jetbrains.kotlinx.dataframe.annotations.DataSchema
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
import org.jetbrains.kotlinx.dataframe.api.group
import org.jetbrains.kotlinx.dataframe.api.into
/**
 * Shared fixture for sample tests: provides the [peopleDf] frame, its [Person] / [Name] schemas
 * and the [willBe] assertion helper.
 */
@Suppress("ktlint:standard:argument-list-wrapping")
interface TestBase {
/**
 * People frame with seven rows. `firstName` and `lastName` are grouped into a `name` column group
 * and the frame is cast to [Person] with `verify = false`.
 * This is a getter, so the frame is rebuilt on every access.
 */
val peopleDf: DataFrame<Person>
get() = run {
dataFrameOf("firstName", "lastName", "age", "city", "weight", "isHappy")(
"Alice", "Cooper", 15, "London", 54, true,
"Bob", "Dylan", 45, "Dubai", 87, true,
"Charlie", "Daniels", 20, "Moscow", null, false,
"Charlie", "Chaplin", 40, "Milan", null, true,
"Bob", "Marley", 30, "Tokyo", 68, true,
"Alice", "Wolf", 20, null, 55, false,
"Charlie", "Byrd", 30, "Moscow", 90, true,
).group { firstName and lastName }.into("name")
.cast<Person>(verify = false)
}
/** Schema of the nested `name` column group. */
@DataSchema
interface Name {
val firstName: String
val lastName: String
}
/** Schema of one row of [peopleDf]; `city` and `weight` are nullable. */
@DataSchema
interface Person {
val age: Int
val city: String?
val name: DataRow<Name> // TODO Requires https://code.jetbrains.team/p/kt/repositories/kotlin/reviews/23694 to be merged
val weight: Int?
val isHappy: Boolean
}
/** Infix alias for kotest's `shouldBe` that restricts [expected] to a (nullable) subtype of the receiver type. */
infix fun <T, U : T> T.willBe(expected: U?) = shouldBe(expected)
}
@@ -0,0 +1,250 @@
@file:Suppress("ktlint")
package org.jetbrains.kotlinx.dataframe.samples.api
import io.kotest.matchers.string.shouldStartWith
import org.apache.arrow.vector.types.pojo.Schema
import org.apache.poi.ss.usermodel.Sheet
import org.apache.poi.ss.usermodel.WorkbookFactory
import org.jetbrains.kotlinx.dataframe.api.filter
import org.jetbrains.kotlinx.dataframe.api.remove
import org.jetbrains.kotlinx.dataframe.io.ArrowWriter
import org.jetbrains.kotlinx.dataframe.io.arrowWriter
import org.jetbrains.kotlinx.dataframe.io.saveArrowFeatherToByteArray
import org.jetbrains.kotlinx.dataframe.io.saveArrowIPCToByteArray
import org.jetbrains.kotlinx.dataframe.io.toCsvStr
import org.jetbrains.kotlinx.dataframe.io.toJson
import org.jetbrains.kotlinx.dataframe.io.writeArrowFeather
import org.jetbrains.kotlinx.dataframe.io.writeArrowIPC
import org.jetbrains.kotlinx.dataframe.io.writeCsv
import org.jetbrains.kotlinx.dataframe.io.writeExcel
import org.jetbrains.kotlinx.dataframe.io.writeJson
import org.jetbrains.kotlinx.dataframe.io.writeMismatchMessage
import org.junit.Test
import java.io.File
import kotlin.io.path.deleteExisting
class Write : TestBase {
val df = peopleDf
@Test
fun writeCsv() {
useTempFile { file ->
// SampleStart
df.writeCsv(file)
// SampleEnd
}
}
@Test
fun writeJson() {
useTempFile { file ->
// SampleStart
df.writeJson(file)
// SampleEnd
}
}
@Test
fun writeCsvStr() {
// SampleStart
val csvStr = df.toCsvStr(delimiter = ';', recordSeparator = System.lineSeparator())
// SampleEnd
csvStr shouldStartWith """
name;age;city;weight;isHappy
"{""firstName"":""Alice"",""lastName"":""Cooper""}";15;London;54;true
""".trimIndent().lines().joinToString(System.lineSeparator())
}
@Test
fun writeJsonStr() {
// SampleStart
val jsonStr = df.toJson(prettyPrint = true)
// SampleEnd
jsonStr shouldStartWith """
[
{
"name": {
"firstName": "Alice",
"lastName": "Cooper"
},
"age": 15,
"city": "London",
"weight": 54,
"isHappy": true
""".trimIndent()
}
@Test
fun writeXls() {
useTempFile { file ->
// SampleStart
df.writeExcel(file)
// SampleEnd
}
}
@Test
fun writeXlsAppendAndPostProcessing() {
useTempFile { file ->
// SampleStart
/**
* Do something with generated sheets. Here we set bold style for headers and italic style for first data column
*/
fun setStyles(sheet: Sheet) {
val headerFont = sheet.workbook.createFont()
headerFont.bold = true
val headerStyle = sheet.workbook.createCellStyle()
headerStyle.setFont(headerFont)
val indexFont = sheet.workbook.createFont()
indexFont.italic = true
val indexStyle = sheet.workbook.createCellStyle()
indexStyle.setFont(indexFont)
sheet.forEachIndexed { index, row ->
if (index == 0) {
for (cell in row) {
cell.cellStyle = headerStyle
}
} else {
row.first().cellStyle = indexStyle
}
}
}
// Create a workbook (or use existing)
val wb = WorkbookFactory.create(true)
// Create different sheets from different dataframes in the workbook
val allPersonsSheet = df.writeExcel(wb, sheetName = "allPersons")
val happyPersonsSheet =
df.filter { person -> person.isHappy }.remove("isHappy").writeExcel(wb, sheetName = "happyPersons")
val unhappyPersonsSheet =
df.filter { person -> !person.isHappy }.remove("isHappy").writeExcel(wb, sheetName = "unhappyPersons")
// Do anything you want by POI
listOf(happyPersonsSheet, unhappyPersonsSheet).forEach { setStyles(it) }
// Save the result
file.outputStream().use { wb.write(it) }
wb.close()
// SampleEnd
}
}
@Test
fun writeArrowFile() {
useTempFile { file ->
// SampleStart
df.writeArrowIPC(file)
// or
df.writeArrowFeather(file)
// SampleEnd
}
}
@Test
fun writeArrowByteArray() {
// SampleStart
val ipcByteArray: ByteArray = df.saveArrowIPCToByteArray()
// or
val featherByteArray: ByteArray = df.saveArrowFeatherToByteArray()
// SampleEnd
}
@Test
fun writeArrowPerSchema() {
useTempFile { file ->
val schemaJson =
"""{
"fields" : [ {
"name" : "name",
"nullable" : true,
"type" : {
"name" : "utf8"
},
"children" : [ ]
}, {
"name" : "age",
"nullable" : false,
"type" : {
"name" : "int",
"bitWidth" : 32,
"isSigned" : true
},
"children" : [ ]
}, {
"name" : "city",
"nullable" : false,
"type" : {
"name" : "utf8"
},
"children" : [ ]
}, {
"name" : "weight",
"nullable" : true,
"type" : {
"name" : "floatingpoint",
"precision" : "DOUBLE"
},
"children" : [ ]
} ]
}
"""
// SampleStart
// Get schema from anywhere you want. It can be deserialized from JSON, generated from another dataset
// (including the DataFrame.columns().toArrowSchema() method), created manually, and so on.
val schema = Schema.fromJSON(schemaJson)
df.arrowWriter(
// Specify your schema
targetSchema = schema,
// Specify desired behavior mode
mode = ArrowWriter.Mode(
restrictWidening = true,
restrictNarrowing = true,
strictType = true,
strictNullable = false,
),
// Specify mismatch subscriber
mismatchSubscriber = writeMismatchMessage,
).use { writer: ArrowWriter ->
// Save to any format and sink, like in the previous example
writer.writeArrowFeather(file)
}
// SampleEnd
}
}
/**
 * Sample: building an Excel workbook with several sheets by calling `writeExcel` repeatedly on the
 * same temporary file, using `keepFile = true` for every call after the first.
 */
@Test
fun writeXlsWithMultipleSheets() {
useTempFile { file ->
// SampleStart
// Create a new Excel workbook with a single sheet called "allPersons", replacing the file if it already exists -> Current sheets: allPersons
df.writeExcel(file, sheetName = "allPersons")
// Add a new sheet to the previous file without replacing it, by setting keepFile = true -> Current sheets: allPersons, happyPersons
df.filter { person -> person.isHappy }.remove("isHappy")
.writeExcel(file, sheetName = "happyPersons", keepFile = true)
// Add a new sheet to the previous file without replacing it, by setting keepFile = true -> Current sheets: allPersons, happyPersons, unhappyPersons
df.filter { person -> !person.isHappy }.remove("isHappy")
.writeExcel(file, sheetName = "unhappyPersons", keepFile = true)
// SampleEnd
}
}
companion object {
    /**
     * Creates a temporary file, hands it to [action] as a [File], and deletes it afterwards.
     *
     * The deletion happens in a `finally` block, so the temporary file is also removed when
     * [action] throws (e.g. a failing write test) instead of being left behind.
     *
     * @param action the code that uses the temporary file.
     */
    private fun useTempFile(action: (File) -> Unit) {
        val file = kotlin.io.path.createTempFile("dataframeWriteTest")
        try {
            action(file.toFile())
        } finally {
            // Runs on both the success and the failure path.
            file.deleteExisting()
        }
    }
}
}
@@ -0,0 +1,28 @@
package org.jetbrains.kotlinx.dataframe.samples.api.collectionsInterop
import org.jetbrains.kotlinx.dataframe.api.associateBy
import org.jetbrains.kotlinx.dataframe.samples.DataFrameSampleHelper
import org.jetbrains.kotlinx.dataframe.samples.api.firstName
import org.jetbrains.kotlinx.dataframe.samples.api.lastName
import org.jetbrains.kotlinx.dataframe.samples.api.name
import org.junit.Test
/**
 * Documentation samples for `associateBy`, registered with [DataFrameSampleHelper] under
 * "associateBy" / "api/collectionsInterop".
 *
 * Only the code between the `SampleStart` / `SampleEnd` markers is sample text (presumably the part
 * extracted into the docs); calls chained after `SampleEnd` only store the rendered result.
 */
class AssociateBySamples : DataFrameSampleHelper("associateBy", "api/collectionsInterop") {

// Shared input frame provided by the helper base class.
private val df = peopleDf

/** Shows the input frame and saves it via `saveDfHtmlSample()`. */
@Test
fun notebook_test_associateBy_1() {
// SampleStart
df
// SampleEnd
.saveDfHtmlSample()
}

/** Keys rows by "firstName lastName"; the result is not saved, the test only checks that the sample runs. */
@Test
fun notebook_test_associateBy_2() {
// SampleStart
df.associateBy { "${name.firstName} ${name.lastName}" }
// SampleEnd
}
}
@@ -0,0 +1,29 @@
package org.jetbrains.kotlinx.dataframe.samples.api.collectionsInterop
import org.jetbrains.kotlinx.dataframe.api.associate
import org.jetbrains.kotlinx.dataframe.samples.DataFrameSampleHelper
import org.jetbrains.kotlinx.dataframe.samples.api.age
import org.jetbrains.kotlinx.dataframe.samples.api.firstName
import org.jetbrains.kotlinx.dataframe.samples.api.lastName
import org.jetbrains.kotlinx.dataframe.samples.api.name
import org.junit.Test
/**
 * Documentation samples for `associate`, registered with [DataFrameSampleHelper] under
 * "associate" / "api/collectionsInterop".
 *
 * Only the code between the `SampleStart` / `SampleEnd` markers is sample text (presumably the part
 * extracted into the docs); calls chained after `SampleEnd` only store the rendered result.
 */
class AssociateSamples : DataFrameSampleHelper("associate", "api/collectionsInterop") {

// Shared input frame provided by the helper base class.
private val df = peopleDf

/** Shows the input frame and saves it via `saveDfHtmlSample()`. */
@Test
fun notebook_test_associate_1() {
// SampleStart
df
// SampleEnd
.saveDfHtmlSample()
}

/** Builds "firstName lastName" to age pairs per row; the result is not saved, the test only checks that the sample runs. */
@Test
fun notebook_test_associate_2() {
// SampleStart
df.associate { "${name.firstName} ${name.lastName}" to age }
// SampleEnd
}
}
@@ -0,0 +1,46 @@
package org.jetbrains.kotlinx.dataframe.samples.api.column
import org.jetbrains.kotlinx.dataframe.annotations.DataSchema
import org.jetbrains.kotlinx.dataframe.api.between
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
import org.jetbrains.kotlinx.dataframe.samples.DataFrameSampleHelper
import org.junit.Test
/**
 * Documentation samples for the column operation `between`, registered with
 * [DataFrameSampleHelper] under "between" / "api".
 *
 * Only the code between the `SampleStart` / `SampleEnd` markers is sample text; the
 * `saveDfHtmlSample()` call chained after `SampleEnd` stores the rendered result.
 */
class BetweenSamples : DataFrameSampleHelper("between", "api") {

/** Schema of the small demo frame: a name and an age per row. */
@DataSchema
interface SimplePerson {
val name: String
val age: Int
}

// Four-row demo frame; ages 15, 20, 25, 30 fall below, inside and above the 18..25 range used below.
private val df = dataFrameOf(
"name" to listOf("Alice", "Bob", "Charlie", "Diana"),
"age" to listOf(15, 20, 25, 30),
).cast<SimplePerson>()

/** Shows the input frame. */
@Test
fun notebook_test_between_1() {
// SampleStart
df
// SampleEnd
.saveDfHtmlSample()
}

/** `between` with the default `includeBoundaries` setting. */
@Test
fun notebook_test_between_2() {
// SampleStart
df.age.between(left = 18, right = 25)
// SampleEnd
.saveDfHtmlSample()
}

/** `between` with `includeBoundaries = false`. */
@Test
fun notebook_test_between_3() {
// SampleStart
df.age.between(left = 18, right = 25, includeBoundaries = false)
// SampleEnd
.saveDfHtmlSample()
}
}
@@ -0,0 +1,34 @@
package org.jetbrains.kotlinx.dataframe.samples.api.info
import org.jetbrains.kotlinx.dataframe.api.tail
import org.jetbrains.kotlinx.dataframe.samples.DataFrameSampleHelper
import org.junit.Test
/**
 * Documentation samples for `tail`, registered with [DataFrameSampleHelper] under "tail" / "api".
 *
 * Only the code between the `SampleStart` / `SampleEnd` markers is sample text; the
 * `saveDfHtmlSample()` call chained after `SampleEnd` stores the rendered result.
 */
class TailSamples : DataFrameSampleHelper("tail", "api") {

// Shared input frame provided by the helper base class.
private val df = peopleDf

/** Shows the input frame. */
@Test
fun notebook_test_tail_1() {
// SampleStart
df
// SampleEnd
.saveDfHtmlSample()
}

/** `tail()` with its default row count. */
@Test
fun notebook_test_tail_2() {
// SampleStart
df.tail()
// SampleEnd
.saveDfHtmlSample()
}

/** `tail` with an explicit `numRows = 2`. */
@Test
fun notebook_test_tail_3() {
// SampleStart
df.tail(numRows = 2)
// SampleEnd
.saveDfHtmlSample()
}
}
@@ -0,0 +1,315 @@
package org.jetbrains.kotlinx.dataframe.samples.api.multiple
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.annotations.DataSchema
import org.jetbrains.kotlinx.dataframe.api.RgbColor
import org.jetbrains.kotlinx.dataframe.api.and
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
import org.jetbrains.kotlinx.dataframe.api.excludeJoin
import org.jetbrains.kotlinx.dataframe.api.filterJoin
import org.jetbrains.kotlinx.dataframe.api.format
import org.jetbrains.kotlinx.dataframe.api.fullJoin
import org.jetbrains.kotlinx.dataframe.api.innerJoin
import org.jetbrains.kotlinx.dataframe.api.join
import org.jetbrains.kotlinx.dataframe.api.leftJoin
import org.jetbrains.kotlinx.dataframe.api.perRowCol
import org.jetbrains.kotlinx.dataframe.api.rightJoin
import org.jetbrains.kotlinx.dataframe.api.with
import org.jetbrains.kotlinx.dataframe.samples.DataFrameSampleHelper
import org.jetbrains.kotlinx.dataframe.util.defaultHeaderFormatting
import org.junit.Test
/**
 * Documentation samples for the `join` family (`join`, `innerJoin`, `filterJoin`, `leftJoin`,
 * `rightJoin`, `fullJoin`, `excludeJoin`), registered with [DataFrameSampleHelper] under
 * "join" / "api".
 *
 * Only the code between the `SampleStart` / `SampleEnd` markers is sample text. Everything chained
 * after `SampleEnd` (cell colouring, header formatting, `saveDfHtmlSample()`) only affects how the
 * saved output looks.
 */
class JoinSamples : DataFrameSampleHelper("join", "api") {

/** Schema of [dfAges]: the join key here is called `firstName`. */
@DataSchema
interface DfAges {
val age: Int
val firstName: String
}

// Left side for the "differently named keys" sample.
private val dfAges = dataFrameOf(
"firstName" to listOf("Alice", "Bob", "Charlie"),
"age" to listOf(14, 45, 20),
).cast<DfAges>()

/** Schema of [dfCities]: the join key here is called `name`. */
@DataSchema
interface DfCities {
val city: String
val name: String
}

// Right side for the "differently named keys" sample; same three names in a different order.
private val dfCities = dataFrameOf(
"name" to listOf("Bob", "Alice", "Charlie"),
"city" to listOf("London", "Dubai", "Moscow"),
).cast<DfCities>()

/** Common shape of [DfLeft] and [DfRight]: both have `name` and a (possibly null) `city`. */
@DataSchema
interface DfWithNameAndCity {
val name: String
val city: String?
}

/** Schema of [dfLeft]; narrows `city` to non-null and adds `age`. */
@DataSchema
interface DfLeft : DfWithNameAndCity {
val age: Int
override val city: String
override val name: String
}

// Left frame: "Charlie" appears twice, with different cities.
private val dfLeft = dataFrameOf(
"name" to listOf("Alice", "Bob", "Charlie", "Charlie"),
"age" to listOf(15, 45, 20, 40),
"city" to listOf("London", "Dubai", "Moscow", "Tokyo"),
).cast<DfLeft>()

/** Schema of [dfRight]; keeps `city` nullable and adds `isBusy`. */
@DataSchema
interface DfRight : DfWithNameAndCity {
override val city: String?
val isBusy: Boolean
override val name: String
}

// Right frame: "Alice" appears twice, once with a null city.
private val dfRight = dataFrameOf(
"name" to listOf("Alice", "Bob", "Alice", "Charlie"),
"isBusy" to listOf(true, false, true, true),
"city" to listOf("London", "Tokyo", null, "Moscow"),
).cast<DfRight>()

/** Maps a person name to a fixed background colour; any other name maps to white (255, 255, 255). */
private fun nameToColor(name: String): RgbColor =
when (name) {
"Alice" -> RgbColor(189, 206, 233)
"Bob" -> RgbColor(198, 224, 198)
"Charlie" -> RgbColor(219, 198, 230)
else -> RgbColor(255, 255, 255)
}

/** Maps a (name, city) pair to a fixed background colour; any other pair maps to white (255, 255, 255). */
private fun nameAndCityToColor(name: String, city: String?): RgbColor =
when (name to city) {
"Alice" to "London" -> RgbColor(242, 210, 189)
"Bob" to "Dubai" -> RgbColor(245, 226, 191)
"Charlie" to "Moscow" -> RgbColor(210, 229, 199)
"Charlie" to "Tokyo" -> RgbColor(191, 223, 232)
"Bob" to "Tokyo" -> RgbColor(200, 200, 232)
"Alice" to null -> RgbColor(233, 199, 220)
else -> RgbColor(255, 255, 255)
}

/**
 * Formats every cell with the background colour of its row's ("name", "city") pair and black text.
 * The columns are read by string name, so the frame must contain "name" and "city".
 */
private fun <T> DataFrame<T>.colorized() =
format().perRowCol { row, _ ->
val color = nameAndCityToColor(row["name"] as String, row["city"] as String?)
background(color) and textColor(black)
}

/** Shows [dfAges], coloured per `firstName`. */
@Test
fun notebook_test_join_3() {
// SampleStart
dfAges
// SampleEnd
.format().perRowCol { row, _ ->
val color = nameToColor(row.firstName)
background(color) and textColor(black)
}
.defaultHeaderFormatting { firstName }
.saveDfHtmlSample()
}

/** Shows [dfCities], coloured per `name`. */
@Test
fun notebook_test_join_5() {
// SampleStart
dfCities
// SampleEnd
.format().perRowCol { row, _ ->
val color = nameToColor(row.name)
background(color) and textColor(black)
}
.defaultHeaderFormatting { name }
.saveDfHtmlSample()
}

/** Joins [dfAges] with [dfCities] on differently named key columns. */
@Test
fun notebook_test_join_6() {
// SampleStart
// INNER JOIN on differently named keys:
// Merge a row when dfAges.firstName == dfCities.name.
// With the given data all 3 names match → all rows merge.
dfAges.join(dfCities) { firstName match right.name }
// SampleEnd
.format().perRowCol { row, _ ->
val color = nameToColor(row.firstName)
background(color) and textColor(black)
}
.defaultHeaderFormatting { firstName }
.saveDfHtmlSample()
}

/** Shows [dfLeft] with only the `name` header highlighted. */
@Test
fun notebook_test_join_8() {
// SampleStart
dfLeft
// SampleEnd
.colorized()
.defaultHeaderFormatting { name }
.saveDfHtmlSample()
}

/** Shows [dfRight] with only the `name` header highlighted. */
@Test
fun notebook_test_join_10() {
// SampleStart
dfRight
// SampleEnd
.colorized()
.defaultHeaderFormatting { name }
.saveDfHtmlSample()
}

/** Joins [dfLeft] with [dfRight] on `name` only. */
@Test
fun notebook_test_join_11() {
// SampleStart
// INNER JOIN on "name" only:
// Merge when left.name == right.name.
// Duplicate keys produce multiple merged rows (one per pairing).
dfLeft.join(dfRight) { name }
// SampleEnd
.colorized()
.defaultHeaderFormatting { name }
.saveDfHtmlSample()
}

/** Shows [dfLeft] with both `name` and `city` headers highlighted. */
@Test
fun dfLeftImplicit() {
// SampleStart
dfLeft
// SampleEnd
.colorized()
.defaultHeaderFormatting { name and city }
.saveDfHtmlSample()
}

/** Shows [dfRight] with both `name` and `city` headers highlighted. */
@Test
fun dfRightImplicit() {
// SampleStart
dfRight
// SampleEnd
.colorized()
.defaultHeaderFormatting { name and city }
.saveDfHtmlSample()
}

/** Joins without an explicit selector; the headers are selected by string name after `SampleEnd`. */
@Test
fun notebook_test_join_12() {
// SampleStart
// INNER JOIN on all same-named columns ("name" and "city"):
// Merge when BOTH name AND city are equal; otherwise the row is dropped.
dfLeft.join(dfRight)
// SampleEnd
.colorized()
.defaultHeaderFormatting { "name" and "city" }
.saveDfHtmlSample()
}

/** Same body as [dfLeftImplicit], saved under this function's own name. */
@Test
fun notebook_test_join_13() {
// SampleStart
dfLeft
// SampleEnd
.colorized()
.defaultHeaderFormatting { name and city }
.saveDfHtmlSample()
}

/** Same body as [dfRightImplicit], saved under this function's own name. */
@Test
fun notebook_test_join_14() {
// SampleStart
dfRight
// SampleEnd
.colorized()
.defaultHeaderFormatting { name and city }
.saveDfHtmlSample()
}

/** `innerJoin` on (`name`, `city`). */
@Test
fun notebook_test_join_15() {
// SampleStart
// INNER JOIN:
// Combines columns from the left and right dataframes
// and keeps only rows where (name, city) matches on both sides.
dfLeft.innerJoin(dfRight) { name and city }
// SampleEnd
.colorized()
.defaultHeaderFormatting { name and city }
.saveDfHtmlSample()
}

/** `filterJoin` on (`name`, `city`). */
@Test
fun notebook_test_join_16() {
// SampleStart
// FILTER JOIN:
// Keep ONLY left rows that have ANY match on (name, city).
// No right-side columns are added.
dfLeft.filterJoin(dfRight) { name and city }
// SampleEnd
.colorized()
.defaultHeaderFormatting { name and city }
.saveDfHtmlSample()
}

/** `leftJoin` on (`name`, `city`); after `SampleEnd`, null cells outside the key columns are rendered bold. */
@Test
fun notebook_test_join_17() {
// SampleStart
// LEFT JOIN:
// Keep ALL left rows and add columns from the right dataframe.
// If (name, city) matches, attach right columns values from
// the corresponding row in the right dataframe;
// if not (e.g. ("Bob", "Dubai") row), fill them with `null`.
dfLeft.leftJoin(dfRight) { name and city }
// SampleEnd
.colorized()
.format { all() except (name and city) }.with { if (it == null) bold else null }
.defaultHeaderFormatting { name and city }
.saveDfHtmlSample()
}

/** `rightJoin` on (`name`, `city`); after `SampleEnd`, null cells outside the key columns are rendered bold. */
@Test
fun notebook_test_join_18() {
// SampleStart
// RIGHT JOIN:
// Keep ALL right rows and add columns from the left dataframe.
// If (name, city) matches, attach left columns values from
// the corresponding row in the left dataframe;
// if not (e.g. ("Bob", "Tokyo") row), fill them with `null`.
dfLeft.rightJoin(dfRight) { name and city }
// SampleEnd
.colorized()
.format { all() except (name and city) }.with { if (it == null) bold else null }
.defaultHeaderFormatting { name and city }
.saveDfHtmlSample()
}

/** `fullJoin` on (`name`, `city`); after `SampleEnd`, null cells outside the key columns are rendered bold. */
@Test
fun notebook_test_join_19() {
// SampleStart
// FULL JOIN:
// Keep ALL rows from both sides. Where there's no match on (name, city),
// the other side is filled with nulls.
dfLeft.fullJoin(dfRight) { name and city }
// SampleEnd
.colorized()
.format { all() except (name and city) }.with { if (it == null) bold else null }
.defaultHeaderFormatting { name and city }
.saveDfHtmlSample()
}

/** `excludeJoin` on (`name`, `city`). */
@Test
fun notebook_test_join_20() {
// SampleStart
// EXCLUDE JOIN:
// Keep ONLY left rows that have NO match on (name, city).
// Useful to find "unpaired" left rows.
dfLeft.excludeJoin(dfRight) { name and city }
// SampleEnd
.colorized()
.defaultHeaderFormatting { name and city }
.saveDfHtmlSample()
}
}
@@ -0,0 +1,45 @@
package org.jetbrains.kotlinx.dataframe.samples.api.render
import org.jetbrains.kotlinx.dataframe.DataRow
import org.jetbrains.kotlinx.dataframe.annotations.DataSchema
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.colsOf
import org.jetbrains.kotlinx.dataframe.api.formatHeader
import org.jetbrains.kotlinx.dataframe.api.with
import org.jetbrains.kotlinx.dataframe.samples.DataFrameSampleHelper
import org.junit.Test
/**
 * Documentation sample for `formatHeader`, registered with [DataFrameSampleHelper] under
 * "format" / "api".
 *
 * Only the code between the `SampleStart` / `SampleEnd` markers is sample text; the
 * `saveDfHtmlSample()` call chained after `SampleEnd` stores the rendered result.
 */
class FormatHeaderSamples : DataFrameSampleHelper("format", "api") {

// The helper's people frame, cast to the typed schema declared below.
val df = peopleDf.cast<Person>()

/** Schema of the nested `name` column group. */
@DataSchema
interface Name {
val firstName: String
val lastName: String
}

/** Schema the people frame is cast to, so columns can be accessed as typed properties. */
@DataSchema
interface Person {
val age: Int
val city: String?
val name: DataRow<Name>
val weight: Int?
val isHappy: Boolean
}

/** Chains several `formatHeader` calls, from all headers down to a single nested column. */
@Test
fun formatHeader() {
// SampleStart
df
// Format all column headers with bold
.formatHeader().with { bold }
// Format the "name" column (including nested) header with red text
.formatHeader { name }.with { textColor(red) }
// Override "name"/"lastName" column formatting header with blue text
.formatHeader { name.lastName }.with { textColor(blue) }
// Format all numeric column headers with underlines
.formatHeader { colsOf<Number?>() }.with { underline }
// SampleEnd
.saveDfHtmlSample()
}
}
@@ -0,0 +1,37 @@
package org.jetbrains.kotlinx.dataframe.samples.api.utils
import org.jetbrains.kotlinx.dataframe.api.all
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
import org.jetbrains.kotlinx.dataframe.samples.DataFrameSampleHelper
import org.junit.Test
/**
 * Documentation samples for `all`, registered with [DataFrameSampleHelper] under "all" / "api".
 *
 * Only the code between the `SampleStart` / `SampleEnd` markers is sample text.
 */
class AllSamples : DataFrameSampleHelper("all", "api") {

// Two-row demo frame; reuses the `SimplePerson` schema declared in `AnySamples`.
private val df = dataFrameOf(
"name" to listOf("Alice", "Bob"),
"age" to listOf(15, 20),
).cast<AnySamples.SimplePerson>()

/** Shows the input frame and saves it via `saveDfHtmlSample()`. */
@Test
fun notebook_test_all_3() {
// SampleStart
df
// SampleEnd
.saveDfHtmlSample()
}

/** Predicate on `age` only; the result is not saved, the test only checks that the sample runs. */
@Test
fun notebook_test_all_4() {
// SampleStart
df.all { age > 21 }
// SampleEnd
}

/** Predicate combining `name` and `age`; the result is not saved. */
@Test
fun notebook_test_all_5() {
// SampleStart
df.all { name.first().isUpperCase() && age >= 15 }
// SampleEnd
}
}
@@ -0,0 +1,44 @@
package org.jetbrains.kotlinx.dataframe.samples.api.utils
import org.jetbrains.kotlinx.dataframe.annotations.DataSchema
import org.jetbrains.kotlinx.dataframe.api.any
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
import org.jetbrains.kotlinx.dataframe.samples.DataFrameSampleHelper
import org.junit.Test
/**
 * Documentation samples for `any`, registered with [DataFrameSampleHelper] under "any" / "api".
 *
 * Only the code between the `SampleStart` / `SampleEnd` markers is sample text.
 */
class AnySamples : DataFrameSampleHelper("any", "api") {

/** Schema of the small demo frame: a name and an age per row. */
@DataSchema
interface SimplePerson {
val name: String
val age: Int
}

// Two-row demo frame.
private val df = dataFrameOf(
"name" to listOf("Alice", "Bob"),
"age" to listOf(15, 20),
).cast<SimplePerson>()

/** Shows the input frame and saves it via `saveDfHtmlSample()`. */
@Test
fun notebook_test_any_3() {
// SampleStart
df
// SampleEnd
.saveDfHtmlSample()
}

/** Predicate on `age` only; the result is not saved, the test only checks that the sample runs. */
@Test
fun notebook_test_any_4() {
// SampleStart
df.any { age > 21 }
// SampleEnd
}

/** Predicate combining `age` and `name`; the result is not saved. */
@Test
fun notebook_test_any_5() {
// SampleStart
df.any { age == 15 && name == "Alice" }
// SampleEnd
}
}
@@ -0,0 +1,46 @@
package org.jetbrains.kotlinx.dataframe.samples.api.utils
import org.jetbrains.kotlinx.dataframe.annotations.DataSchema
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.chunked
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
import org.jetbrains.kotlinx.dataframe.samples.DataFrameSampleHelper
import org.junit.Test
/**
 * Documentation samples for `chunked`, registered with [DataFrameSampleHelper] under
 * "chunked" / "api".
 *
 * Only the code between the `SampleStart` / `SampleEnd` markers is sample text; the
 * `saveDfHtmlSample()` call chained after `SampleEnd` stores the rendered result.
 */
class ChunkedSamples : DataFrameSampleHelper("chunked", "api") {

/** Schema of the small demo frame: a name and an age per row. */
@DataSchema
interface SimplePerson {
val name: String
val age: Int
}

// Five-row demo frame.
private val df = dataFrameOf(
"name" to listOf("Alice", "Bob", "Charlie", "Diana", "Eve"),
"age" to listOf(15, 20, 25, 30, 35),
).cast<SimplePerson>()

/** Shows the input frame. */
@Test
fun notebook_test_chunked_1() {
// SampleStart
df
// SampleEnd
.saveDfHtmlSample()
}

/** `chunked` with a fixed chunk `size`. */
@Test
fun notebook_test_chunked_2() {
// SampleStart
df.chunked(size = 2)
// SampleEnd
.saveDfHtmlSample()
}

/** `chunked` with explicit `startIndices` and a custom `name`. */
@Test
fun notebook_test_chunked_3() {
// SampleStart
df.chunked(startIndices = listOf(0, 1, 3), name = "segments")
// SampleEnd
.saveDfHtmlSample()
}
}
@@ -0,0 +1,39 @@
package org.jetbrains.kotlinx.dataframe.samples.api.utils
import org.jetbrains.kotlinx.dataframe.annotations.DataSchema
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
import org.jetbrains.kotlinx.dataframe.api.shuffle
import org.jetbrains.kotlinx.dataframe.samples.DataFrameSampleHelper
import org.junit.Test
import kotlin.random.Random
/**
 * Documentation samples for `shuffle`, registered with [DataFrameSampleHelper] under
 * "shuffle" / "api".
 *
 * Only the code between the `SampleStart` / `SampleEnd` markers is sample text; the
 * `saveDfHtmlSample()` call chained after `SampleEnd` stores the rendered result.
 */
class ShuffleSamples : DataFrameSampleHelper("shuffle", "api") {

/** Schema of the small demo frame: a name and an age per row. */
@DataSchema
interface SimplePerson {
val name: String
val age: Int
}

// Five-row demo frame.
private val df = dataFrameOf(
"name" to listOf("Alice", "Bob", "Charlie", "Diana", "Eve"),
"age" to listOf(15, 20, 25, 30, 35),
).cast<SimplePerson>()

/** Shows the input frame. */
@Test
fun notebook_test_shuffle_1() {
// SampleStart
df
// SampleEnd
.saveDfHtmlSample()
}

/** Shuffles with a `Random` created from the fixed seed 42 (presumably so the saved output is stable between runs). */
@Test
fun notebook_test_shuffle_2() {
// SampleStart
df.shuffle(Random(42))
// SampleEnd
.saveDfHtmlSample()
}
}
@@ -0,0 +1,246 @@
@file:Suppress("PropertyName", "UNUSED_VARIABLE", "UNUSED_EXPRESSION", "UNCHECKED_CAST")
package org.jetbrains.kotlinx.dataframe.samples.guides
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.annotations.DataSchema
import org.jetbrains.kotlinx.dataframe.api.add
import org.jetbrains.kotlinx.dataframe.api.aggregate
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.convert
import org.jetbrains.kotlinx.dataframe.api.count
import org.jetbrains.kotlinx.dataframe.api.describe
import org.jetbrains.kotlinx.dataframe.api.filter
import org.jetbrains.kotlinx.dataframe.api.groupBy
import org.jetbrains.kotlinx.dataframe.api.into
import org.jetbrains.kotlinx.dataframe.api.maxOf
import org.jetbrains.kotlinx.dataframe.api.rename
import org.jetbrains.kotlinx.dataframe.api.select
import org.jetbrains.kotlinx.dataframe.api.sortByDesc
import org.jetbrains.kotlinx.dataframe.api.sumOf
import org.jetbrains.kotlinx.dataframe.api.take
import org.jetbrains.kotlinx.dataframe.api.update
import org.jetbrains.kotlinx.dataframe.api.with
import org.jetbrains.kotlinx.dataframe.io.readCsv
import org.jetbrains.kotlinx.dataframe.io.writeExcel
import org.jetbrains.kotlinx.dataframe.samples.DataFrameSampleHelper
import org.jetbrains.kotlinx.kandy.dsl.plot
import org.jetbrains.kotlinx.kandy.letsplot.feature.layout
import org.jetbrains.kotlinx.kandy.letsplot.layers.bars
import org.junit.Ignore
import org.junit.Test
import java.net.URL
/**
 * Documentation samples for the quickstart guide, registered with [DataFrameSampleHelper] under
 * "quickstart" / "guides".
 *
 * Every test shows one step of a pipeline over the JetBrains repositories CSV. Because each step
 * starts from the previous step's result, the private `getDf*()` helpers rebuild the pipeline up to
 * a given stage, so tests do not depend on each other. Only the code between the `SampleStart` /
 * `SampleEnd` markers is sample text.
 *
 * Note: `df` is read from a remote URL in a property initializer, so constructing this class
 * requires network access.
 */
class QuickStartGuide : DataFrameSampleHelper("quickstart", "guides") {

/** Columns of the CSV that the samples access as typed properties. */
@DataSchema
interface Repositories {
val html_url: URL
val watchers: Int
val full_name: String
val stargazers_count: Int
val topics: String
}

// Source data, downloaded when the class is instantiated.
private val df = DataFrame.readCsv(
"https://raw.githubusercontent.com/Kotlin/dataframe/master/data/jetbrains_repositories.csv",
).cast<Repositories>()

// Stage 1: keep only the three columns used by the later steps.
private fun getDfSelected() = df.select { full_name and stargazers_count and topics }

// Stage 2: keep rows with stargazers_count >= 1000.
private fun getDfFiltered() =
getDfSelected()
.filter { stargazers_count >= 1000 }

// Stage 3: rename the columns to "name" and "starsCount".
private fun getDfRenamed() =
getDfFiltered()
.rename { full_name }.into("name")
// And "stargazers_count" into "starsCount"
.rename { stargazers_count }.into("starsCount")

// Stage 4: shorten "name" and turn "topics" into a list.
private fun getDfUpdated() =
getDfRenamed()
// Update "name" values with only its second part (after '/')
.update { name }.with { it.split("/")[1] }
// Convert "topics" `String` values into `List<String>` by splitting:
.convert { topics }.with { it.removePrefix("[").removeSuffix("]").split(", ") }

// Stage 5: add the Boolean "isIntellij" column.
private fun getDfWithIsIntellij() =
getDfUpdated()
.add("isIntellij") {
name.contains("intellij") || "intellij" in topics
}

// Stage 6a: group by "isIntellij".
private fun getGroupedByIsIntellij() =
getDfWithIsIntellij()
.groupBy { isIntellij }

// Stage 6b: the ten rows with the highest "starsCount".
private fun getDfTop10() =
getDfWithIsIntellij()
// Sort by "starsCount" value descending
.sortByDesc { starsCount }.take(10)

/** Reading the CSV; the local `df` shadows the class property and its value is not used further. */
@Test
fun notebook_test_quickstart_2() {
// SampleStart
val df = DataFrame.readCsv(
"https://raw.githubusercontent.com/Kotlin/dataframe/master/data/jetbrains_repositories.csv",
)
// SampleEnd
}

/** Shows the loaded frame. */
@Test
fun notebook_test_quickstart_3() {
// SampleStart
df
// SampleEnd
.saveDfHtmlSample()
}

/** Shows `describe()` of the loaded frame. */
@Test
fun notebook_test_quickstart_4() {
// SampleStart
df.describe()
// SampleEnd
.saveDfHtmlSample()
}

/** Column selection step. */
@Test
fun notebook_test_quickstart_5() {
// SampleStart
// Select "full_name", "stargazers_count" and "topics" columns
val dfSelected = df.select { full_name and stargazers_count and topics }
dfSelected
// SampleEnd
.saveDfHtmlSample()
}

/** Row filtering step. */
@Test
fun notebook_test_quickstart_6() {
val dfSelected = getDfSelected()
// SampleStart
// Keep only rows where "stargazers_count" value is at least 1000
val dfFiltered = dfSelected.filter { stargazers_count >= 1000 }
dfFiltered
// SampleEnd
.saveDfHtmlSample()
}

/** Column renaming step. */
@Test
fun notebook_test_quickstart_7() {
val dfFiltered = getDfFiltered()
// SampleStart
// Rename "full_name" column into "name"
val dfRenamed = dfFiltered.rename { full_name }.into("name")
// And "stargazers_count" into "starsCount"
.rename { stargazers_count }.into("starsCount")
dfRenamed
// SampleEnd
.saveDfHtmlSample()
}

/** Value update / conversion step. */
@Test
fun notebook_test_quickstart_8() {
val dfRenamed = getDfRenamed()
// SampleStart
val dfUpdated = dfRenamed
// Update "name" values with only its second part (after '/')
.update { name }.with { it.split("/")[1] }
// Convert "topics" `String` values into `List<String>` by splitting:
.convert { topics }.with { it.removePrefix("[").removeSuffix("]").split(", ") }
dfUpdated
// SampleEnd
.saveDfHtmlSample()
}

/** Queries the type of the converted "topics" column; the result is discarded. */
@Test
fun notebook_test_quickstart_9() {
val dfUpdated = getDfUpdated()
// SampleStart
dfUpdated.topics.type()
// SampleEnd
}

/** Column adding step. */
@Test
fun notebook_test_quickstart_10() {
val dfUpdated = getDfUpdated()
// SampleStart
// Add a `Boolean` column indicating whether the `name` contains the "intellij" substring
// or the topics include "intellij".
val dfWithIsIntellij = dfUpdated.add("isIntellij") {
name.contains("intellij") || "intellij" in topics
}
dfWithIsIntellij
// SampleEnd
.saveDfHtmlSample()
}

/** Grouping step. */
@Test
fun notebook_test_quickstart_11() {
val dfWithIsIntellij = getDfWithIsIntellij()
// SampleStart
val groupedByIsIntellij = dfWithIsIntellij.groupBy { isIntellij }
groupedByIsIntellij
// SampleEnd
.saveDfHtmlSample()
}

/** Row count per group. */
@Test
fun notebook_test_quickstart_12() {
val groupedByIsIntellij = getGroupedByIsIntellij()
// SampleStart
groupedByIsIntellij.count()
// SampleEnd
.saveDfHtmlSample()
}

/** Custom aggregation per group. */
@Test
fun notebook_test_quickstart_13() {
val groupedByIsIntellij = getGroupedByIsIntellij()
// SampleStart
groupedByIsIntellij.aggregate {
// Compute sum and max of "starsCount" within each group into "sumStars" and "maxStars" columns
sumOf { starsCount } into "sumStars"
maxOf { starsCount } into "maxStars"
}
// SampleEnd
.saveDfHtmlSample()
}

/** Sorting and taking the top ten rows. */
@Test
fun notebook_test_quickstart_14() {
val dfWithIsIntellij = getDfWithIsIntellij()
// SampleStart
val dfTop10 = dfWithIsIntellij
// Sort by "starsCount" value descending
.sortByDesc { starsCount }.take(10)
dfTop10
// SampleEnd
.saveDfHtmlSample()
}

/** Bar plot of the top ten rows; stored with `savePlotSVGSample()` rather than as a frame. */
@Test
fun notebook_test_quickstart_16() {
val dfTop10 = getDfTop10()
// SampleStart
dfTop10.plot {
bars {
x(name)
y(starsCount)
}
layout.title = "Top 10 JetBrains repositories by stars count"
}
// SampleEnd
.savePlotSVGSample()
}

/** Excel export; ignored, presumably because it would write "jb_repos.xlsx" to the working directory. */
@Ignore
@Test
fun notebook_test_quickstart_17() {
val dfWithIsIntellij = getDfWithIsIntellij()
// SampleStart
dfWithIsIntellij.writeExcel("jb_repos.xlsx")
// SampleEnd
}
}
@@ -0,0 +1,48 @@
package org.jetbrains.kotlinx.dataframe.samples.io
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.io.DbConnectionConfig
import org.jetbrains.kotlinx.dataframe.io.readSqlQuery
import org.jetbrains.kotlinx.dataframe.io.readSqlTable
import org.junit.Ignore
import org.junit.Test
import java.sql.DriverManager
/**
 * Documentation samples for reading from DuckDB over JDBC.
 *
 * Both tests are annotated with `@Ignore`, so they are compiled but not executed
 * (presumably because they need an external database file / a downloadable DuckDB extension — confirm).
 * Only the code between the `SampleStart` / `SampleEnd` markers is sample text.
 */
class DuckDb {

/** Reads a whole table through a `DbConnectionConfig` built from URL, user name and password. */
@Ignore
@Test
fun readSqlTable() {
// SampleStart
val url = "jdbc:duckdb:/testDatabase"
val username = "duckdb"
val password = "password"

val dbConfig = DbConnectionConfig(url, username, password)

val tableName = "Customer"

val df = DataFrame.readSqlTable(dbConfig, tableName)
// SampleEnd
}

/**
 * Installs and loads the Iceberg extension on an in-memory DuckDB connection, then reads the
 * result of an `iceberg_scan` query. The connection is closed by `use` once the frame is read.
 */
// source: https://duckdb.org/docs/stable/core_extensions/iceberg/overview.html
@Ignore
@Test
fun readIcebergExtension() {
// SampleStart
// Creating an in-memory DuckDB database
val connection = DriverManager.getConnection("jdbc:duckdb:")
val df = connection.use { connection ->
// install and load Iceberg
connection.createStatement().execute("INSTALL iceberg; LOAD iceberg;")

// query a table from Iceberg using a specific SQL query
DataFrame.readSqlQuery(
connection = connection,
sqlQuery = "SELECT * FROM iceberg_scan('data/iceberg/lineitem_iceberg', allow_moved_paths = true);",
)
}
// SampleEnd
}
}
@@ -0,0 +1,78 @@
package org.jetbrains.kotlinx.dataframe.samples.io
import io.kotest.matchers.shouldBe
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.NullabilityOptions
import org.jetbrains.kotlinx.dataframe.io.readParquet
import org.jetbrains.kotlinx.dataframe.testParquet
import org.junit.Test
import java.io.File
import java.nio.file.Paths
/**
 * Documentation samples for `DataFrame.readParquet`, one per supported input kind.
 *
 * Each test resolves the "sales" Parquet test resource, reads it inside the sample region, and then
 * asserts the resulting shape (300 rows and 20 columns for a single read). Only the code between
 * the `SampleStart` / `SampleEnd` markers is sample text.
 */
class Parquet {

/** Reads from a `URL`. */
@Test
fun readParquetURL() {
val url = testParquet("sales")
// SampleStart
// Read from URLs
val df = DataFrame.readParquet(url)
// SampleEnd
df.rowsCount() shouldBe 300
df.columnsCount() shouldBe 20
}

/** Reads from a `java.nio.file.Path`. */
@Test
fun readParquetFilePath() {
val url = testParquet("sales")
val path = Paths.get(url.toURI())
// SampleStart
val df = DataFrame.readParquet(path)
// SampleEnd
df.rowsCount() shouldBe 300
df.columnsCount() shouldBe 20
}

/** Reads from a `java.io.File`. */
@Test
fun readParquetFile() {
val url = testParquet("sales")
val file = File(url.toURI())
// SampleStart
// Read from File objects
val df = DataFrame.readParquet(file)
// SampleEnd
df.rowsCount() shouldBe 300
df.columnsCount() shouldBe 20
}

/** Reads from a `File` with explicit `nullability` and `batchSize` arguments. */
@Test
fun readParquetFileWithParameters() {
val url = testParquet("sales")
val file = File(url.toURI())
// SampleStart
val df = DataFrame.readParquet(
file,
nullability = NullabilityOptions.Infer,
batchSize = 64L * 1024,
)
// SampleEnd
df.rowsCount() shouldBe 300
df.columnsCount() shouldBe 20
}

/** Reads three files in one call; all three point at the same resource, hence 3 x 300 = 900 rows. */
@Test
fun readMultipleParquetFiles() {
val url = testParquet("sales")
val file = File(url.toURI())
val file1 = File(url.toURI())
val file2 = File(url.toURI())
// SampleStart
val df = DataFrame.readParquet(file, file1, file2)
// SampleEnd
df.rowsCount() shouldBe 900
df.columnsCount() shouldBe 20
}
}
@@ -0,0 +1,13 @@
package org.jetbrains.kotlinx.dataframe
import java.net.URL
/**
 * Resolves [resourcePath] to a [URL] through the class loader of this file.
 *
 * @param resourcePath path of the resource, relative to the classpath root.
 * @return the URL of the resource.
 * @throws IllegalArgumentException if no resource exists at [resourcePath]; the message names the
 *   missing path, so a typo in a test fixture name is reported directly instead of surfacing as a
 *   message-less `NullPointerException`.
 */
fun testResource(resourcePath: String): URL =
    requireNotNull(object { }::class.java.classLoader.getResource(resourcePath)) {
        "Test resource not found on the classpath: $resourcePath"
    }
/** Resolves the test resource `<csvName>.csv` via [testResource]. */
fun testCsv(csvName: String): URL {
    val resourcePath = csvName + ".csv"
    return testResource(resourcePath)
}
/** Resolves the test resource `<jsonName>.json` via [testResource]. */
fun testJson(jsonName: String): URL {
    val resourcePath = jsonName + ".json"
    return testResource(resourcePath)
}
/** Resolves the test resource `<name>.feather` via [testResource]. */
fun testArrowFeather(name: String): URL {
    val resourcePath = name + ".feather"
    return testResource(resourcePath)
}
/** Resolves the test resource `<name>.parquet` via [testResource]. */
fun testParquet(name: String): URL {
    val resourcePath = name + ".parquet"
    return testResource(resourcePath)
}
@@ -0,0 +1,60 @@
package org.jetbrains.kotlinx.dataframe.util
import org.jetbrains.kotlinx.dataframe.ColumnsSelector
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.CellAttributes
import org.jetbrains.kotlinx.dataframe.api.FormattedFrame
import org.jetbrains.kotlinx.dataframe.api.FormattingDsl
import org.jetbrains.kotlinx.dataframe.api.and
import org.jetbrains.kotlinx.dataframe.api.formatHeader
import org.jetbrains.kotlinx.dataframe.api.getColumnsWithPaths
import org.jetbrains.kotlinx.dataframe.api.with
/**
 * Palette used for header text colours, in order of use.
 *
 * Its size (seven colours) is the upper bound on how many columns `defaultHeaderFormatting`
 * accepts, because each selected column gets the colour at its own index.
 */
internal val baseColorSet = listOf(
FormattingDsl.rgb(244, 67, 54), // red
FormattingDsl.rgb(33, 150, 243), // blue
FormattingDsl.rgb(76, 175, 80), // green
FormattingDsl.rgb(255, 152, 0), // orange
FormattingDsl.rgb(156, 39, 176), // purple
FormattingDsl.rgb(0, 150, 136), // teal
FormattingDsl.rgb(233, 30, 99), // pink/magenta
)
/** Cell attribute that sets the CSS `font-family` to a stack of monospace fonts. */
internal val FormattingDsl.monospace: CellAttributes
    get() {
        val cssProperty = "font-family"
        val fontStack =
            "ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, 'Liberation Mono', 'Courier New', monospace"
        return attr(cssProperty, fontStack)
    }
/**
 * Gives each column selected by [headers] a monospace header whose text colour is the
 * [baseColorSet] entry at the column's position in the selection.
 *
 * @param headers selector of the columns whose headers are highlighted.
 * @return the formatted frame.
 * @throws IllegalArgumentException if more columns are selected than [baseColorSet] has colours.
 */
internal fun <T> DataFrame<T>.defaultHeaderFormatting(headers: ColumnsSelector<T, *>): FormattedFrame<T> {
    val selected = getColumnsWithPaths(headers)
    require(selected.size <= baseColorSet.size) {
        "Too many headers: ${selected.size}. Max supported is ${baseColorSet.size}."
    }

    // Start from a header format that applies no attributes, then layer one format per column.
    var formatted: FormattedFrame<T> = formatHeader().with { null }
    for ((position, target) in selected.withIndex()) {
        formatted = formatted
            .formatHeader { target }
            .with { textColor(baseColorSet[position]) and monospace }
    }
    return formatted
}
/**
 * Same as the `DataFrame` overload, but layered on top of an already formatted frame: each column
 * selected by [headers] gets a monospace header whose text colour is the [baseColorSet] entry at
 * the column's position in the selection.
 *
 * @param headers selector of the columns whose headers are highlighted, resolved against `df`.
 * @return the formatted frame.
 * @throws IllegalArgumentException if more columns are selected than [baseColorSet] has colours.
 */
@Suppress("INVISIBLE_REFERENCE")
internal fun <T> FormattedFrame<T>.defaultHeaderFormatting(headers: ColumnsSelector<T, *>): FormattedFrame<T> {
    val selected = df.getColumnsWithPaths(headers)
    require(selected.size <= baseColorSet.size) {
        "Too many headers: ${selected.size}. Max supported is ${baseColorSet.size}."
    }

    // Start from a header format that applies no attributes, then layer one format per column.
    var formatted: FormattedFrame<T> = formatHeader().with { null }
    for ((position, target) in selected.withIndex()) {
        formatted = formatted
            .formatHeader { target }
            .with { textColor(baseColorSet[position]) and monospace }
    }
    return formatted
}
+3
View File
@@ -0,0 +1,3 @@
date
13/Jan/23 11:49 AM
14/Mar/23 5:35 PM
1 date
2 13/Jan/23 11:49 AM
3 14/Mar/23 5:35 PM
+3
View File
@@ -0,0 +1,3 @@
colName
"48,000"
"47,302"
1 colName
2 48,000
3 47,302
Binary file not shown.
+6
View File
@@ -0,0 +1,6 @@
[
{ "A": "1", "B": 1, "C": 1.0, "D": true },
{ "A": "2", "B": 2, "C": 1.1, "D": null },
{ "A": "3", "B": 3, "C": 1, "D": false },
{ "A": "4", "B": 4, "C": 1.3, "D": true }
]
@@ -0,0 +1 @@
{ "A": "1", "B": 1, "C": 1.0, "D": true }
@@ -0,0 +1,3 @@
12|tuv|0.12|true
41|xyz|3.6|not assigned
89|abc|7.1|false
1 12 tuv 0.12 true
2 41 xyz 3.6 not assigned
3 89 abc 7.1 false