init research

This commit is contained in:
2026-02-08 11:20:43 -10:00
commit bdf064f54d
3041 changed files with 1592200 additions and 0 deletions
+14
View File
@@ -0,0 +1,14 @@
## :dataframe-json
This module, published as `dataframe-json`, contains all logic and tests that allow DataFrame to work with
JSON data sources: both [reading](https://kotlin.github.io/dataframe/read.html#read-from-json)
and [writing](https://kotlin.github.io/dataframe/write.html#writing-to-json).
It's based on [Kotlinx Serialization](https://github.com/Kotlin/kotlinx.serialization).
It also contains some logic specific to encoding dataframes as JSON objects with metadata for
the [custom table component in Kotlin Notebook](https://kotlin.github.io/dataframe/usage-with-kotlin-notebook-plugin.html).
See [serialization_format](../docs/serialization_format.md) for more information about the format.
This module is optional but is included by default by the `dataframe` module, `dataframe-jupyter`,
`dataframe-csv`, and `dataframe-excel`.
If you want to use DataFrame without JSON support, you can exclude this module from the dependency.
+91
View File
@@ -0,0 +1,91 @@
public final class org/jetbrains/kotlinx/dataframe/io/Base64ImageEncodingOptions {
public static final field ALL_OFF I
public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/Base64ImageEncodingOptions$Companion;
public static final field GZIP_ON I
public static final field LIMIT_SIZE_ON I
public fun <init> ()V
public fun <init> (II)V
public synthetic fun <init> (IIILkotlin/jvm/internal/DefaultConstructorMarker;)V
public final fun getImageSizeLimit ()I
public final fun isGzipOn ()Z
public final fun isLimitSizeOn ()Z
}
public final class org/jetbrains/kotlinx/dataframe/io/Base64ImageEncodingOptions$Companion {
}
public abstract interface class org/jetbrains/kotlinx/dataframe/io/CustomEncoder {
public abstract fun canEncode (Ljava/lang/Object;)Z
public abstract fun encode (Ljava/lang/Object;)Lkotlinx/serialization/json/JsonElement;
}
public final class org/jetbrains/kotlinx/dataframe/io/JSON : org/jetbrains/kotlinx/dataframe/io/SupportedDataFrameFormat {
public fun <init> ()V
public fun <init> (Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Ljava/util/List;Z)V
public synthetic fun <init> (Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Ljava/util/List;ZILkotlin/jvm/internal/DefaultConstructorMarker;)V
public fun acceptsExtension (Ljava/lang/String;)Z
public fun acceptsSample (Lorg/jetbrains/kotlinx/dataframe/io/SupportedFormatSample;)Z
public fun createDefaultReadMethod (Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/codeGen/DefaultReadDfMethod;
public fun getTestOrder ()I
public fun readDataFrame (Ljava/io/File;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public fun readDataFrame (Ljava/io/InputStream;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public fun readDataFrame (Ljava/nio/file/Path;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
}
public final class org/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic : java/lang/Enum {
public static final field ANY_COLUMNS Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;
public static final field ARRAY_AND_VALUE_COLUMNS Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;
public static fun getEntries ()Lkotlin/enums/EnumEntries;
public static fun valueOf (Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;
public static fun values ()[Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;
}
public final class org/jetbrains/kotlinx/dataframe/io/JsonKt {
public static final fun readJson (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/File;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Z)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readJson (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/InputStream;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Z)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readJson (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/lang/String;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Z)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readJson (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/net/URL;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Z)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readJson (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/nio/file/Path;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Z)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readJson (Lorg/jetbrains/kotlinx/dataframe/DataRow$Companion;Ljava/io/File;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Z)Lorg/jetbrains/kotlinx/dataframe/DataRow;
public static final fun readJson (Lorg/jetbrains/kotlinx/dataframe/DataRow$Companion;Ljava/io/InputStream;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Z)Lorg/jetbrains/kotlinx/dataframe/DataRow;
public static final fun readJson (Lorg/jetbrains/kotlinx/dataframe/DataRow$Companion;Ljava/lang/String;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Z)Lorg/jetbrains/kotlinx/dataframe/DataRow;
public static final fun readJson (Lorg/jetbrains/kotlinx/dataframe/DataRow$Companion;Ljava/net/URL;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Z)Lorg/jetbrains/kotlinx/dataframe/DataRow;
public static final fun readJson (Lorg/jetbrains/kotlinx/dataframe/DataRow$Companion;Ljava/nio/file/Path;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Z)Lorg/jetbrains/kotlinx/dataframe/DataRow;
public static synthetic fun readJson$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/File;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readJson$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/InputStream;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readJson$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/lang/String;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readJson$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/net/URL;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readJson$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/nio/file/Path;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readJson$default (Lorg/jetbrains/kotlinx/dataframe/DataRow$Companion;Ljava/io/File;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataRow;
public static synthetic fun readJson$default (Lorg/jetbrains/kotlinx/dataframe/DataRow$Companion;Ljava/io/InputStream;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataRow;
public static synthetic fun readJson$default (Lorg/jetbrains/kotlinx/dataframe/DataRow$Companion;Ljava/lang/String;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataRow;
public static synthetic fun readJson$default (Lorg/jetbrains/kotlinx/dataframe/DataRow$Companion;Ljava/net/URL;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataRow;
public static synthetic fun readJson$default (Lorg/jetbrains/kotlinx/dataframe/DataRow$Companion;Ljava/nio/file/Path;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataRow;
public static final fun readJsonStr (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/lang/String;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Z)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readJsonStr (Lorg/jetbrains/kotlinx/dataframe/DataRow$Companion;Ljava/lang/String;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Z)Lorg/jetbrains/kotlinx/dataframe/DataRow;
public static synthetic fun readJsonStr$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/lang/String;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readJsonStr$default (Lorg/jetbrains/kotlinx/dataframe/DataRow$Companion;Ljava/lang/String;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataRow;
public static final fun toJson (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Z)Ljava/lang/String;
public static final fun toJson (Lorg/jetbrains/kotlinx/dataframe/DataRow;Z)Ljava/lang/String;
public static synthetic fun toJson$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame;ZILjava/lang/Object;)Ljava/lang/String;
public static synthetic fun toJson$default (Lorg/jetbrains/kotlinx/dataframe/DataRow;ZILjava/lang/Object;)Ljava/lang/String;
public static final fun toJsonWithMetadata (Lorg/jetbrains/kotlinx/dataframe/DataFrame;ILjava/lang/Integer;ZLjava/util/List;Z)Ljava/lang/String;
public static synthetic fun toJsonWithMetadata$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame;ILjava/lang/Integer;ZLjava/util/List;ZILjava/lang/Object;)Ljava/lang/String;
public static final fun writeJson (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/io/File;Z)V
public static final fun writeJson (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/lang/Appendable;Z)V
public static final fun writeJson (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/lang/String;Z)V
public static final fun writeJson (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/nio/file/Path;Z)V
public static final fun writeJson (Lorg/jetbrains/kotlinx/dataframe/DataRow;Ljava/io/File;Z)V
public static final fun writeJson (Lorg/jetbrains/kotlinx/dataframe/DataRow;Ljava/lang/Appendable;Z)V
public static final fun writeJson (Lorg/jetbrains/kotlinx/dataframe/DataRow;Ljava/lang/String;Z)V
public static final fun writeJson (Lorg/jetbrains/kotlinx/dataframe/DataRow;Ljava/nio/file/Path;Z)V
public static synthetic fun writeJson$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/io/File;ZILjava/lang/Object;)V
public static synthetic fun writeJson$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/lang/Appendable;ZILjava/lang/Object;)V
public static synthetic fun writeJson$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/lang/String;ZILjava/lang/Object;)V
public static synthetic fun writeJson$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/nio/file/Path;ZILjava/lang/Object;)V
public static synthetic fun writeJson$default (Lorg/jetbrains/kotlinx/dataframe/DataRow;Ljava/io/File;ZILjava/lang/Object;)V
public static synthetic fun writeJson$default (Lorg/jetbrains/kotlinx/dataframe/DataRow;Ljava/lang/Appendable;ZILjava/lang/Object;)V
public static synthetic fun writeJson$default (Lorg/jetbrains/kotlinx/dataframe/DataRow;Ljava/lang/String;ZILjava/lang/Object;)V
public static synthetic fun writeJson$default (Lorg/jetbrains/kotlinx/dataframe/DataRow;Ljava/nio/file/Path;ZILjava/lang/Object;)V
}
+60
View File
@@ -0,0 +1,60 @@
import org.jetbrains.kotlin.gradle.tasks.KotlinCompile
// Gradle build script for the JSON module: applies the shared convention/catalog plugins,
// declares dependencies, tweaks compile/test tasks and configures the published artifact.
plugins {
// Convention plugin supplied by the build itself (accessed through `convention.plugins`).
with(convention.plugins) {
alias(kotlinJvm8)
}
// Plugins resolved from the `libs` version catalog.
with(libs.plugins) {
alias(publisher)
alias(serialization)
alias(binary.compatibility.validator)
}
}
group = "org.jetbrains.kotlinx"
dependencies {
// `core` is exposed as `api`, so consumers of this module see it transitively.
api(projects.core)
implementation(libs.kotlin.stdlib)
implementation(libs.serialization.core)
implementation(libs.serialization.json)
// NOTE(review): catalog alias is spelled `sl4j`; presumably SLF4J — confirm against the version catalog.
implementation(libs.sl4j)
// Use Kotlin test integration for JUnit 5 to satisfy variant 'kotlin-test-framework-junit5'
testImplementation(libs.kotlin.test.junit5)
testImplementation(libs.junit.jupiter)
testImplementation(libs.junit.jupiter.engine)
testImplementation(libs.junit.jupiter.params)
// kotlin-stdlib-jdk8 is excluded from the transitive dependencies of the Kotest assertions.
testImplementation(libs.kotestAssertions) {
exclude("org.jetbrains.kotlin", "kotlin-stdlib-jdk8")
}
testImplementation(libs.sl4jsimple)
}
// Registers the `core` project directory as a friend path of every Kotlin compilation.
// NOTE(review): presumably so this module can use `internal` declarations of `core` — confirm.
tasks.withType<KotlinCompile> {
friendPaths.from(project(projects.core.path).projectDir)
}
// Javadoc tasks are switched off for this module.
tasks.withType<Javadoc> {
enabled = false
}
// Tests run on the JUnit Platform (JUnit 5).
tasks.test {
useJUnitPlatform()
}
// Adds src/main/kotlin to the `java` source directories of the main source set.
sourceSets {
main {
java.srcDirs("src/main/kotlin")
}
}
// Publication metadata; the artifact id (and package name) follow the Gradle project name.
kotlinPublications {
publication {
publicationName = "dataframeJson"
artifactId = project.name
description = "Kotlin DataFrame JSON integration"
packageName = artifactId
}
}
@@ -0,0 +1,5 @@
package org.jetbrains.kotlinx.dataframe.impl.io
import java.util.Base64
/**
 * Encodes this byte array with the basic (RFC 4648) Base64 alphabet.
 *
 * @return the Base64 text, including `=` padding where required.
 */
internal fun ByteArray.toBase64(): String {
    val encoder = Base64.getEncoder()
    return encoder.encodeToString(this)
}
@@ -0,0 +1,11 @@
package org.jetbrains.kotlinx.dataframe.impl.io
import java.io.ByteArrayOutputStream
import java.util.zip.GZIPOutputStream
/**
 * Compresses this byte array into the GZIP format.
 *
 * The GZIP stream is closed before the buffer is read, so the returned bytes
 * contain the complete stream including its trailer.
 *
 * @return the GZIP-compressed representation of the receiver.
 */
internal fun ByteArray.encodeGzip(): ByteArray {
    val raw = this
    return ByteArrayOutputStream()
        .also { sink ->
            GZIPOutputStream(sink).use { gzip -> gzip.write(raw) }
        }
        .toByteArray()
}
@@ -0,0 +1,795 @@
package org.jetbrains.kotlinx.dataframe.impl.io
import kotlinx.serialization.json.JsonArray
import kotlinx.serialization.json.JsonNull
import kotlinx.serialization.json.JsonObject
import kotlinx.serialization.json.JsonPrimitive
import kotlinx.serialization.json.boolean
import kotlinx.serialization.json.booleanOrNull
import kotlinx.serialization.json.double
import kotlinx.serialization.json.doubleOrNull
import kotlinx.serialization.json.float
import kotlinx.serialization.json.floatOrNull
import kotlinx.serialization.json.int
import kotlinx.serialization.json.intOrNull
import kotlinx.serialization.json.jsonArray
import kotlinx.serialization.json.long
import kotlinx.serialization.json.longOrNull
import org.jetbrains.kotlinx.dataframe.AnyCol
import org.jetbrains.kotlinx.dataframe.AnyFrame
import org.jetbrains.kotlinx.dataframe.DataColumn
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.DataRow
import org.jetbrains.kotlinx.dataframe.api.JsonPath
import org.jetbrains.kotlinx.dataframe.api.NameValueProperty
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.chunked
import org.jetbrains.kotlinx.dataframe.api.columnOf
import org.jetbrains.kotlinx.dataframe.api.concat
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
import org.jetbrains.kotlinx.dataframe.api.firstOrNull
import org.jetbrains.kotlinx.dataframe.api.getColumn
import org.jetbrains.kotlinx.dataframe.api.mapIndexed
import org.jetbrains.kotlinx.dataframe.api.named
import org.jetbrains.kotlinx.dataframe.api.schema
import org.jetbrains.kotlinx.dataframe.api.splitInto
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
import org.jetbrains.kotlinx.dataframe.columns.FrameColumn
import org.jetbrains.kotlinx.dataframe.columns.TypeSuggestion
import org.jetbrains.kotlinx.dataframe.documentation.UnifyingNumbers
import org.jetbrains.kotlinx.dataframe.impl.ColumnNameGenerator
import org.jetbrains.kotlinx.dataframe.impl.DataCollector
import org.jetbrains.kotlinx.dataframe.impl.asList
import org.jetbrains.kotlinx.dataframe.impl.schema.DataFrameSchemaImpl
import org.jetbrains.kotlinx.dataframe.io.ARRAY_COLUMN_NAME
import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic
import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.ANY_COLUMNS
import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.ARRAY_AND_VALUE_COLUMNS
import org.jetbrains.kotlinx.dataframe.io.VALUE_COLUMN_NAME
import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema
import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema
import org.jetbrains.kotlinx.dataframe.type
import org.jetbrains.kotlinx.dataframe.typeClass
import org.jetbrains.kotlinx.dataframe.values
import kotlin.reflect.KType
import kotlin.reflect.KTypeProjection
import kotlin.reflect.full.createType
import kotlin.reflect.typeOf
/**
 * Rebuilds this frame with every [UnnamedColumn] wrapper replaced by the column it wraps;
 * all other columns are kept as they are.
 */
private fun DataFrame<Any?>.unwrapUnnamedColumns() =
    columns()
        .map { column -> column.unwrapUnnamedColumn() }
        .let { unwrapped -> dataFrameOf(unwrapped) }
/** Returns the wrapped column when the receiver is an [UnnamedColumn], otherwise the receiver itself. */
private fun AnyCol.unwrapUnnamedColumn() =
    when (this) {
        is UnnamedColumn -> col
        else -> this
    }
/**
 * Classification of a list of JSON records used by [fromJsonListAnyColumns] to decide
 * which kind of column(s) to build.
 */
private enum class AnyColType {
// Fallback: the records are not exclusively arrays and not exclusively objects (nulls aside).
ANY,
// At least one record is an array and none is an object or a non-null primitive.
ARRAYS,
// At least one record is an object and none is an array or a non-null primitive.
OBJECTS,
}
/**
 * [NameValueProperty] with its value type fixed to `Any?`.
 *
 * The JSON readers in this file use it as the row type of the empty frame
 * (`DataFrame.emptyOf<AnyNameValueProperty>()`) that stands in for a `null` record
 * inside a key/value frame column.
 */
internal interface AnyNameValueProperty : NameValueProperty<Any?> {
override val value: Any?
}
/**
 * Converts an already parsed JSON value into a [DataFrame].
 *
 * A top-level [JsonArray] is used as the list of records directly, and only in that case is [header]
 * forwarded to the converter. Any other value is wrapped in a single-element list and converted
 * without a header.
 *
 * @param parsed the parsed JSON value.
 * @param unifyNumbers forwarded to the converter; see [UnifyingNumbers].
 * @param header column names forwarded to the converter when [parsed] is a [JsonArray].
 * @param keyValuePaths paths that are to be read as key/value frame columns.
 * @param typeClashTactic selects between [fromJsonListArrayAndValueColumns] and [fromJsonListAnyColumns].
 * @return the converted frame with all [UnnamedColumn] wrappers removed from its top-level columns.
 */
internal fun readJsonImpl(
    parsed: Any?,
    unifyNumbers: Boolean,
    header: List<String>,
    keyValuePaths: List<JsonPath> = emptyList(),
    typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS,
): DataFrame<*> {
    val records: List<*>
    val effectiveHeader: List<String>
    if (parsed is JsonArray) {
        records = parsed
        effectiveHeader = header
    } else {
        records = listOf(parsed)
        // same as the converters' default for `header`
        effectiveHeader = emptyList()
    }

    val converted: AnyFrame = when (typeClashTactic) {
        ARRAY_AND_VALUE_COLUMNS -> fromJsonListArrayAndValueColumns(
            records = records,
            unifyNumbers = unifyNumbers,
            keyValuePaths = keyValuePaths,
            header = effectiveHeader,
        )

        ANY_COLUMNS -> fromJsonListAnyColumns(
            records = records,
            unifyNumbers = unifyNumbers,
            keyValuePaths = keyValuePaths,
            header = effectiveHeader,
        )
    }
    return converted.unwrapUnnamedColumns()
}
/**
 * Json to DataFrame converter that creates [Any] columns.
 * A.k.a. [TypeClashTactic.ANY_COLUMNS].
 *
 * The records are first classified: only arrays, only objects, or anything else (nulls are ignored
 * for the classification). Depending on that classification the function builds
 * - a single value column (mixed content or primitives),
 * - a single list/frame column (only arrays),
 * - a single key/value frame column (only objects, and [jsonPath] matches one of [keyValuePaths]), or
 * - one column per distinct object key (only objects).
 *
 * Single-column results are wrapped in [UnnamedColumn] so that recursive callers can recognise them.
 *
 * @param records List of json elements to be converted to a [DataFrame].
 * @param unifyNumbers Whether to [unify the numbers that are read][UnifyingNumbers].
 * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a [FrameColumn]<[NameValueProperty]>
 *     will be created.
 * @param header Optional list of column names. If given, [records] will be read like an object with [header] being the keys.
 * @param jsonPath Path of [records] inside the document; it is matched against [keyValuePaths]
 *     and extended on every recursive call.
 * @return [DataFrame] from the given [records].
 * @throws IllegalStateException if [jsonPath] matches a key/value path but the records are not all objects,
 *     or if a primitive cannot be interpreted as string, boolean or number.
 */
internal fun fromJsonListAnyColumns(
    records: List<*>,
    unifyNumbers: Boolean,
    keyValuePaths: List<JsonPath> = emptyList(),
    header: List<String> = emptyList(),
    jsonPath: JsonPath = JsonPath(),
): AnyFrame {
    var hasPrimitive = false
    var hasArray = false
    var hasObject = false

    // list element type can be JsonObject, JsonArray or primitive
    val nameGenerator = ColumnNameGenerator()
    records.forEach { record ->
        when (record) {
            is JsonObject -> {
                hasObject = true
                record.entries.forEach { nameGenerator.addIfAbsent(it.key) }
            }

            is JsonArray -> hasArray = true

            is JsonNull, null -> Unit

            is JsonPrimitive -> hasPrimitive = true
        }
    }

    val colType = when {
        hasArray && !hasPrimitive && !hasObject -> AnyColType.ARRAYS
        hasObject && !hasPrimitive && !hasArray -> AnyColType.OBJECTS
        else -> AnyColType.ANY
    }
    val justPrimitives = hasPrimitive && !hasArray && !hasObject
    val isKeyValue = keyValuePaths.any { jsonPath.matches(it) }

    if (isKeyValue && colType != AnyColType.OBJECTS) {
        error("Key value path $jsonPath does not match objects.")
    }

    @Suppress("KotlinConstantConditions")
    val columns: List<AnyCol> = when {
        // Create one column of type Any? (or guessed primitive type) from all the records
        colType == AnyColType.ANY -> {
            val collector: DataCollector<Any?> =
                if (justPrimitives) {
                    createDataCollector(records.size) // guess the type
                } else {
                    createDataCollector(records.size, typeOf<Any?>()) // use Any?
                }

            // Positions of "NaN" values. They are collected as null first (so they do not influence
            // type guessing) and written back once the column type is known.
            val nanIndices = mutableListOf<Int>()
            records.forEachIndexed { i, v ->
                when (v) {
                    is JsonObject -> {
                        val parsed =
                            fromJsonListAnyColumns(
                                records = listOf(v),
                                unifyNumbers = unifyNumbers,
                                keyValuePaths = keyValuePaths,
                                jsonPath = jsonPath.replaceLastWildcardWithIndex(i),
                            )
                        collector.add(
                            if (parsed.isSingleUnnamedColumn()) {
                                (parsed.getColumn(0) as UnnamedColumn).col.values.first()
                            } else {
                                parsed.firstOrNull() ?: DataRow.empty
                            },
                        )
                    }

                    is JsonArray -> {
                        val parsed = fromJsonListAnyColumns(
                            records = v,
                            unifyNumbers = unifyNumbers,
                            keyValuePaths = keyValuePaths,
                            jsonPath = jsonPath.replaceLastWildcardWithIndex(i).appendArrayWithWildcard(),
                        )
                        collector.add(
                            if (parsed.isSingleUnnamedColumn()) {
                                (parsed.getColumn(0) as UnnamedColumn).col.values.asList()
                            } else {
                                parsed.unwrapUnnamedColumns()
                            },
                        )
                    }

                    is JsonNull -> collector.add(null)

                    is JsonPrimitive -> {
                        when {
                            v.content == "NaN" -> {
                                nanIndices.add(i)
                                collector.add(null)
                            }

                            v.isString -> collector.add(v.content)

                            v.booleanOrNull != null -> collector.add(v.boolean)

                            v.intOrNull != null -> collector.add(v.int)

                            v.longOrNull != null -> collector.add(v.long)

                            // Double must be tried before Float: every numeric literal also parses as a
                            // Float, so testing Float first would silently truncate the precision of
                            // all fractional numbers and make the Double branch unreachable.
                            v.doubleOrNull != null -> collector.add(v.double)

                            // Defensive fallback only; not expected to be hit after the Double check.
                            v.floatOrNull != null -> collector.add(v.float)

                            else -> error("Malformed JSON element ${v::class}: $v")
                        }
                    }

                    else -> collector.add(v)
                }
            }
            val column = createColumnGuessingType(VALUE_COLUMN_NAME, collector.data, unifyNumbers = unifyNumbers)
            val res = if (nanIndices.isNotEmpty()) {
                // nanIndices is ascending (filled in iteration order), so one forward cursor is enough.
                fun <C> DataColumn<C>.updateNaNs(nanValue: C): DataColumn<C> {
                    var j = 0
                    var nextNanIndex = nanIndices[j]
                    return mapIndexed(column.type) { i, v ->
                        if (i == nextNanIndex) {
                            j++
                            nextNanIndex = if (j < nanIndices.size) nanIndices[j] else -1
                            nanValue
                        } else {
                            v
                        }
                    }
                }
                // Restore the NaN placeholders in a representation fitting the guessed column type;
                // for any other type they stay null.
                when (column.typeClass) {
                    Double::class -> column.cast<Double?>().updateNaNs(Double.NaN)
                    Float::class -> column.cast<Float?>().updateNaNs(Float.NaN)
                    String::class -> column.cast<String?>().updateNaNs("NaN")
                    else -> column
                }
            } else {
                column
            }
            listOf(UnnamedColumn(res))
        }

        // Create one column of type FrameColumn, or List<> from all the records if they are all arrays
        colType == AnyColType.ARRAYS -> {
            // All array elements are flattened into `values`; `startIndices` remembers where the
            // elements of each record begin so the parsed result can be split up again.
            val values = mutableListOf<Any?>()
            val startIndices = ArrayList<Int>()
            records.forEach {
                startIndices.add(values.size)
                when (it) {
                    is JsonArray -> values.addAll(it)
                    is JsonNull, null -> Unit
                    else -> error("Expected JsonArray, got $it")
                }
            }
            val parsed = fromJsonListAnyColumns(
                records = values,
                unifyNumbers = unifyNumbers,
                keyValuePaths = keyValuePaths,
                jsonPath = jsonPath.appendArrayWithWildcard(),
            )

            val res = when {
                parsed.isSingleUnnamedColumn() -> {
                    val col = (parsed.getColumn(0) as UnnamedColumn).col
                    val elementType = col.type
                    val columnValues = col.values
                        .asList()
                        .splitByIndices(startIndices.asSequence())
                        .toList()
                    DataColumn.createValueColumn(
                        name = ARRAY_COLUMN_NAME,
                        values = columnValues,
                        type = List::class.createType(listOf(KTypeProjection.invariant(elementType))),
                    )
                }

                else ->
                    parsed.unwrapUnnamedColumns()
                        .chunked(
                            startIndices = startIndices,
                            name = ARRAY_COLUMN_NAME, // will be erased
                        )
            }
            listOf(UnnamedColumn(res))
        }

        // Create one column of type FrameColumn<KeyValueProperty>
        colType == AnyColType.OBJECTS && isKeyValue -> {
            // collect the value types to make sure Value columns with lists and other values aren't all turned into lists
            val valueTypes = mutableSetOf<KType>()
            val dataFrames = records.map { record ->
                when (record) {
                    is JsonObject -> {
                        val map = record.mapValues { (key, value) ->
                            val parsed = fromJsonListAnyColumns(
                                records = listOf(value),
                                unifyNumbers = unifyNumbers,
                                keyValuePaths = keyValuePaths,
                                jsonPath = jsonPath.append(key),
                            )
                            if (parsed.isSingleUnnamedColumn()) {
                                (parsed.getColumn(0) as UnnamedColumn).col.values.first()
                            } else {
                                parsed.unwrapUnnamedColumns().firstOrNull()
                            }
                        }
                        val valueType = map.values.map {
                            guessValueType(sequenceOf(it), unifyNumbers = unifyNumbers)
                        }.commonType()

                        valueTypes += valueType

                        dataFrameOf(
                            columnOf(*map.keys.toTypedArray()).named(NameValueProperty<*>::name.name),
                            createColumnGuessingType(
                                values = map.values,
                                suggestedType = TypeSuggestion.Use(valueType),
                                unifyNumbers = unifyNumbers,
                            ).named(NameValueProperty<*>::value.name),
                        )
                    }

                    is JsonNull, null -> DataFrame.emptyOf<AnyNameValueProperty>()

                    else -> error("Expected JsonObject, got $record")
                }
            }

            val valueColumns = dataFrames.map { it[NameValueProperty<*>::value.name] }
            val valueColumnSchema = when {
                // in these cases we can safely combine the columns to get a single column schema
                valueColumns.all { it is ColumnGroup<*> } || valueColumns.all { it is FrameColumn<*> } ->
                    valueColumns.concat().toDataFrame().schema().columns.values.single()

                // to avoid listification, we create the value columns schema ourselves (https://github.com/Kotlin/dataframe/issues/184)
                else -> ColumnSchema.Value(valueTypes.commonType())
            }

            listOf(
                UnnamedColumn(
                    DataColumn.createFrameColumn(
                        name = VALUE_COLUMN_NAME, // will be erased unless at top-level
                        groups = dataFrames,
                        schema = lazy {
                            DataFrameSchemaImpl(
                                columns = mapOf(
                                    NameValueProperty<*>::name.name to ColumnSchema.Value(typeOf<String>()),
                                    NameValueProperty<*>::value.name to valueColumnSchema,
                                ),
                            )
                        },
                    ),
                ),
            )
        }

        // Create multiple columns from all the records if they are all objects, merging the objects in essence
        colType == AnyColType.OBJECTS && !isKeyValue -> {
            nameGenerator.names.map { colName ->
                // Gather this key's value from every record; records without the key contribute null.
                val values = ArrayList<Any?>(records.size)

                records.forEach {
                    when (it) {
                        is JsonObject -> values.add(it[colName])
                        is JsonNull, null -> values.add(null)
                        else -> error("Expected JsonObject, got $it")
                    }
                }

                val parsed = fromJsonListAnyColumns(
                    records = values,
                    unifyNumbers = unifyNumbers,
                    keyValuePaths = keyValuePaths,
                    jsonPath = jsonPath.append(colName),
                )
                when {
                    parsed.columnsCount() == 0 ->
                        DataColumn.createValueColumn(
                            name = colName,
                            values = arrayOfNulls<Any?>(values.size).toList(),
                            type = typeOf<Any?>(),
                        )

                    parsed.isSingleUnnamedColumn() ->
                        (parsed.getColumn(0) as UnnamedColumn).col.rename(colName)

                    else ->
                        DataColumn.createColumnGroup(colName, parsed.unwrapUnnamedColumns()) as AnyCol
                }
            }
        }

        // All combinations of colType/isKeyValue are covered above; kept only to make `when` exhaustive.
        else -> error("Unexpected state while reading JSON at $jsonPath: colType=$colType, isKeyValue=$isKeyValue")
    }

    return when {
        columns.isEmpty() -> DataFrame.empty(records.size)

        // A single list column read from arrays is split into the columns named by `header`.
        columns.size == 1 && hasArray && header.isNotEmpty() && columns[0].typeClass == List::class ->
            columns[0]
                .cast<List<*>>()
                .splitInto(*header.toTypedArray())

        else -> columns.toDataFrame()
    }
}
/** Whether this frame consists of exactly one column and that column is an [UnnamedColumn]. */
private fun AnyFrame.isSingleUnnamedColumn(): Boolean {
    if (columnsCount() != 1) return false
    return getColumn(0) is UnnamedColumn
}
/**
 * Json to DataFrame converter that creates `value` and `array` accessors
 * instead of [Any] columns.
 * A.k.a. [TypeClashTactic.ARRAY_AND_VALUE_COLUMNS].
 *
 * Object keys found in [records] become columns. If primitives occur (or all records are null),
 * an extra `value` column is added; if arrays occur, an extra `array` column is added.
 * Single `value`/`array` columns are wrapped in [UnnamedColumn] so that recursive callers can recognise them.
 *
 * @param records List of json elements to be converted to a [DataFrame].
 * @param unifyNumbers Whether to [unify the numbers that are read][UnifyingNumbers].
 * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a [FrameColumn]<[NameValueProperty]>
 *     will be created.
 * @param header Optional list of column names. If given, [records] will be read like an object with [header] being the keys.
 * @param jsonPath Path of [records] inside the document; it is matched against [keyValuePaths]
 *     and extended on every recursive call.
 * @return [DataFrame] from the given [records].
 * @throws IllegalStateException if [jsonPath] matches a key/value path but primitives or arrays are present,
 *     or if a primitive cannot be interpreted as string, boolean or number.
 */
internal fun fromJsonListArrayAndValueColumns(
    records: List<*>,
    unifyNumbers: Boolean,
    keyValuePaths: List<JsonPath> = emptyList(),
    header: List<String> = emptyList(),
    jsonPath: JsonPath = JsonPath(),
): AnyFrame {
    var hasPrimitive = false
    var hasArray = false
    val isKeyValue = keyValuePaths.any { jsonPath.matches(it) }

    // list element type can be JsonObject, JsonArray or primitive
    // So first, we gather all properties of objects to merge including "array" and "value" if needed
    // so the resulting type of a property with instances 123, ["abc"], and { "a": 1, "b": 2 } will be
    // { array: List<String>, value: Int?, a: Int?, b: Int? }
    // and instances will look like
    // { "array": [], "value": 123, "a": null, "b": null }

    val nameGenerator = ColumnNameGenerator()
    records.forEach { record ->
        when (record) {
            is JsonObject -> record.entries.forEach {
                nameGenerator.addIfAbsent(it.key)
            }

            is JsonArray -> hasArray = true

            is JsonNull, null -> Unit

            is JsonPrimitive -> hasPrimitive = true
        }
    }
    // Records that are all null (this includes an empty list) are treated as primitives,
    // so that a `value` column is still produced for them.
    if (records.all { it == null || it is JsonNull }) hasPrimitive = true

    // Add a value column to the collected names if needed
    val valueColumn = if (hasPrimitive || records.isEmpty()) {
        nameGenerator.addUnique(VALUE_COLUMN_NAME)
    } else {
        null
    }

    // Add an array column to the collected names if needed
    val arrayColumn = if (hasArray) {
        nameGenerator.addUnique(ARRAY_COLUMN_NAME)
    } else {
        null
    }

    // only properties that consist of just objects (or are empty) can be merged to key/value FrameColumns
    if (isKeyValue && (hasPrimitive || hasArray)) {
        error("Key value path $jsonPath does not match objects.")
    }

    // Create columns from the collected names
    val columns: List<AnyCol> = when {
        // instead of using the names, generate a single key/value frame column
        isKeyValue -> {
            val dataFrames = records.map { record ->
                when (record) {
                    is JsonObject -> {
                        val map = record.mapValues { (key, value) ->
                            val parsed = fromJsonListArrayAndValueColumns(
                                records = listOf(value),
                                unifyNumbers = unifyNumbers,
                                keyValuePaths = keyValuePaths,
                                jsonPath = jsonPath.append(key),
                            )
                            if (parsed.isSingleUnnamedColumn()) {
                                (parsed.getColumn(0) as UnnamedColumn).col.values.first()
                            } else {
                                parsed.unwrapUnnamedColumns().firstOrNull()
                            }
                        }
                        val valueType =
                            map.values
                                .map { guessValueType(sequenceOf(it), unifyNumbers = unifyNumbers) }
                                .commonType()

                        dataFrameOf(
                            columnOf(*map.keys.toTypedArray()).named(NameValueProperty<*>::name.name),
                            createColumnGuessingType(
                                values = map.values,
                                suggestedType = TypeSuggestion.Use(valueType),
                                unifyNumbers = unifyNumbers,
                            ).named(NameValueProperty<*>::value.name),
                        )
                    }

                    is JsonNull, null -> DataFrame.emptyOf<AnyNameValueProperty>()

                    else -> error("Expected JsonObject, got $record")
                }
            }

            listOf(
                UnnamedColumn(
                    DataColumn.createFrameColumn(
                        name = VALUE_COLUMN_NAME, // will be erased unless at top-level
                        groups = dataFrames,
                        schema = lazy {
                            // frames without rows are left out of the schema intersection
                            dataFrames.mapNotNull { it.takeIf { it.rowsCount() > 0 }?.schema() }.intersectSchemas()
                        },
                    ),
                ),
            )
        }

        // generate columns using the collected names
        else ->
            nameGenerator.names.map { colName ->
                when {
                    // Collect primitive values from records into the `value` column if needed
                    colName == valueColumn && (hasPrimitive || records.isEmpty()) -> {
                        val collector: DataCollector<Any?> = createDataCollector(records.size)

                        // Positions of "NaN" values. They are collected as null first (so they do not
                        // influence type guessing) and written back once the column type is known.
                        val nanIndices = mutableListOf<Int>()
                        records.forEachIndexed { i, v ->
                            when (v) {
                                is JsonObject -> collector.add(null)

                                is JsonArray -> collector.add(null)

                                is JsonNull -> collector.add(null)

                                is JsonPrimitive -> {
                                    when {
                                        v.content == "NaN" -> {
                                            nanIndices.add(i)
                                            collector.add(null)
                                        }

                                        v.isString -> collector.add(v.content)

                                        v.booleanOrNull != null -> collector.add(v.boolean)

                                        v.intOrNull != null -> collector.add(v.int)

                                        v.longOrNull != null -> collector.add(v.long)

                                        // Double must be tried before Float: every numeric literal also
                                        // parses as a Float, so testing Float first would silently truncate
                                        // the precision of all fractional numbers and make the Double
                                        // branch unreachable.
                                        v.doubleOrNull != null -> collector.add(v.double)

                                        // Defensive fallback only; not expected to be hit after the Double check.
                                        v.floatOrNull != null -> collector.add(v.float)

                                        else -> error("Malformed JSON element ${v::class}: $v")
                                    }
                                }

                                else -> collector.add(v)
                            }
                        }
                        val column = createColumnGuessingType(colName, collector.data, unifyNumbers = unifyNumbers)
                        val res = if (nanIndices.isNotEmpty()) {
                            // nanIndices is ascending (filled in iteration order), so one forward cursor is enough.
                            fun <C> DataColumn<C>.updateNaNs(nanValue: C): DataColumn<C> {
                                var j = 0
                                var nextNanIndex = nanIndices[j]
                                return mapIndexed(column.type) { i, v ->
                                    if (i == nextNanIndex) {
                                        j++
                                        nextNanIndex = if (j < nanIndices.size) nanIndices[j] else -1
                                        nanValue
                                    } else {
                                        v
                                    }
                                }
                            }
                            // Restore the NaN placeholders in a representation fitting the guessed
                            // column type; for any other type they stay null.
                            when (column.typeClass) {
                                Double::class -> column.cast<Double?>().updateNaNs(Double.NaN)
                                Float::class -> column.cast<Float?>().updateNaNs(Float.NaN)
                                String::class -> column.cast<String?>().updateNaNs("NaN")
                                else -> column
                            }
                        } else {
                            column
                        }
                        UnnamedColumn(res)
                    }

                    // Collect arrays from records into the `array` column if needed
                    colName == arrayColumn && hasArray -> {
                        // All array elements are flattened into `values`; `startIndices` remembers where
                        // the elements of each record begin (non-array records contribute no elements).
                        val values = mutableListOf<Any?>()
                        val startIndices = ArrayList<Int>()
                        records.forEach {
                            startIndices.add(values.size)
                            if (it is JsonArray) values.addAll(it.jsonArray)
                        }
                        val parsed = fromJsonListArrayAndValueColumns(
                            records = values,
                            unifyNumbers = unifyNumbers,
                            keyValuePaths = keyValuePaths,
                            jsonPath = jsonPath.appendArrayWithWildcard(),
                        )

                        val res = when {
                            parsed.isSingleUnnamedColumn() -> {
                                val col = (parsed.getColumn(0) as UnnamedColumn).col
                                val elementType = col.type
                                val columnValues =
                                    col.values
                                        .asList()
                                        .splitByIndices(startIndices.asSequence())
                                        .toList()
                                DataColumn.createValueColumn(
                                    name = colName,
                                    values = columnValues,
                                    type = List::class.createType(listOf(KTypeProjection.invariant(elementType))),
                                )
                            }

                            else -> parsed.unwrapUnnamedColumns().chunked(startIndices, colName)
                        }
                        UnnamedColumn(res)
                    }

                    // Collect the current column name as property from the objects in records
                    else -> {
                        // Records that are not objects contribute null for this property.
                        val values = ArrayList<Any?>(records.size)
                        records.forEach {
                            when (it) {
                                is JsonObject -> values.add(it[colName])
                                else -> values.add(null)
                            }
                        }

                        val parsed = fromJsonListArrayAndValueColumns(
                            records = values,
                            unifyNumbers = unifyNumbers,
                            keyValuePaths = keyValuePaths,
                            jsonPath = jsonPath.append(colName),
                        )
                        when {
                            parsed.columnsCount() == 0 ->
                                DataColumn.createValueColumn(
                                    name = colName,
                                    values = arrayOfNulls<Any?>(values.size).toList(),
                                    type = typeOf<Any?>(),
                                )

                            parsed.isSingleUnnamedColumn() ->
                                (parsed.getColumn(0) as UnnamedColumn).col.rename(colName)

                            else ->
                                DataColumn.createColumnGroup(colName, parsed.unwrapUnnamedColumns()) as AnyCol
                        }
                    }
                }
            }
    }

    return when {
        columns.isEmpty() ->
            DataFrame.empty(records.size)

        // A single list column read from arrays is split into the columns named by `header`.
        columns.size == 1 && hasArray && header.isNotEmpty() && columns[0].typeClass == List::class ->
            columns[0]
                .cast<List<*>>()
                .splitInto(*header.toTypedArray())

        else ->
            columns.toDataFrame()
    }
}
// Marker wrapper used to recognise, on the result of a recursive parse, a column that did not
// originate from a named field of a record.
// A named column comes from a field of a record, e.g. [{"value": 1}, {"value": 2}].
// An unnamed column holds either the filtered primitive values
//   [1, { ... }, []] -> [1, null, null]
// or the filtered arrays
//   [1, { ... }, []] -> [null, null, []]
// All DataColumn behaviour is delegated to the wrapped [col].
private class UnnamedColumn(val col: DataColumn<Any?>) : DataColumn<Any?> by col
// region friend module error suppression
// Forwards to `org.jetbrains.kotlinx.dataframe.impl.createDataCollector(initCapacity)`.
// INVISIBLE_REFERENCE is suppressed because the target is not normally visible from this file
// (see the enclosing "friend module error suppression" region).
@Suppress("INVISIBLE_REFERENCE")
private fun createDataCollector(initCapacity: Int = 0) =
org.jetbrains.kotlinx.dataframe.impl.createDataCollector(initCapacity)
// Typed variant: forwards to `org.jetbrains.kotlinx.dataframe.impl.createDataCollector<T>(initCapacity, type)`.
// INVISIBLE_REFERENCE is suppressed because the target is not normally visible from this file.
@Suppress("INVISIBLE_REFERENCE")
private fun <T> createDataCollector(initCapacity: Int = 0, type: KType) =
org.jetbrains.kotlinx.dataframe.impl.createDataCollector<T>(initCapacity, type)
// Named-column variant: forwards every argument unchanged to
// `org.jetbrains.kotlinx.dataframe.impl.columns.createColumnGuessingType`.
// The defaults declared here are the ones applied when a caller in this file omits an argument.
// INVISIBLE_REFERENCE is suppressed because the target is not normally visible from this file.
@Suppress("INVISIBLE_REFERENCE")
private fun <T> createColumnGuessingType(
name: String,
values: Iterable<T>,
suggestedType: TypeSuggestion = TypeSuggestion.Infer,
defaultValue: T? = null,
nullable: Boolean? = null,
listifyValues: Boolean = false,
allColsMakesColGroup: Boolean = false,
unifyNumbers: Boolean = false,
) = org.jetbrains.kotlinx.dataframe.impl.columns.createColumnGuessingType(
name = name,
values = values,
suggestedType = suggestedType,
defaultValue = defaultValue,
nullable = nullable,
listifyValues = listifyValues,
allColsMakesColGroup = allColsMakesColGroup,
unifyNumbers = unifyNumbers,
)
// Variant without a `name` parameter: forwards every argument unchanged to
// `org.jetbrains.kotlinx.dataframe.impl.columns.createColumnGuessingType`.
// INVISIBLE_REFERENCE is suppressed because the target is not normally visible from this file.
@Suppress("INVISIBLE_REFERENCE")
private fun <T> createColumnGuessingType(
values: Iterable<T>,
suggestedType: TypeSuggestion = TypeSuggestion.Infer,
defaultValue: T? = null,
nullable: Boolean? = null,
listifyValues: Boolean = false,
allColsMakesColGroup: Boolean = false,
unifyNumbers: Boolean = false,
) = org.jetbrains.kotlinx.dataframe.impl.columns.createColumnGuessingType(
values = values,
suggestedType = suggestedType,
defaultValue = defaultValue,
nullable = nullable,
listifyValues = listifyValues,
allColsMakesColGroup = allColsMakesColGroup,
unifyNumbers = unifyNumbers,
)
// Forwards every argument unchanged to `org.jetbrains.kotlinx.dataframe.impl.guessValueType`.
// INVISIBLE_REFERENCE is suppressed because the target is not normally visible from this file.
@Suppress("INVISIBLE_REFERENCE")
private fun guessValueType(
values: Sequence<Any?>,
upperBound: KType? = null,
listifyValues: Boolean = false,
allColsMakesRow: Boolean = false,
unifyNumbers: Boolean = false,
) = org.jetbrains.kotlinx.dataframe.impl.guessValueType(
values = values,
upperBound = upperBound,
listifyValues = listifyValues,
allColsMakesRow = allColsMakesRow,
unifyNumbers = unifyNumbers,
)
// Extension-style alias: passes the receiver as `list` to
// `org.jetbrains.kotlinx.dataframe.impl.splitByIndices`, together with [startIndices].
// INVISIBLE_REFERENCE is suppressed because the target is not normally visible from this file.
@Suppress("INVISIBLE_REFERENCE")
private fun <T> List<T>.splitByIndices(startIndices: Sequence<Int>) =
org.jetbrains.kotlinx.dataframe.impl.splitByIndices(list = this, startIndices = startIndices)
// Extension-style alias: passes the receiver as `types` to
// `org.jetbrains.kotlinx.dataframe.impl.commonType`, together with [useStar] (default `true` here).
// INVISIBLE_REFERENCE is suppressed because the target is not normally visible from this file.
@Suppress("INVISIBLE_REFERENCE")
private fun Iterable<KType?>.commonType(useStar: Boolean = true) =
org.jetbrains.kotlinx.dataframe.impl.commonType(types = this, useStar)
// Extension-style alias: passes the receiver as `schemas` to
// `org.jetbrains.kotlinx.dataframe.impl.schema.intersectSchemas`.
// INVISIBLE_REFERENCE is suppressed because the target is not normally visible from this file.
@Suppress("INVISIBLE_REFERENCE")
private fun Iterable<DataFrameSchema>.intersectSchemas() =
org.jetbrains.kotlinx.dataframe.impl.schema.intersectSchemas(schemas = this)
// endregion
@@ -0,0 +1,460 @@
@file:OptIn(ExperimentalSerializationApi::class)
package org.jetbrains.kotlinx.dataframe.impl.io
import kotlinx.serialization.ExperimentalSerializationApi
import kotlinx.serialization.json.Json
import kotlinx.serialization.json.JsonArray
import kotlinx.serialization.json.JsonElement
import kotlinx.serialization.json.JsonObject
import kotlinx.serialization.json.JsonPrimitive
import kotlinx.serialization.json.addAll
import kotlinx.serialization.json.buildJsonArray
import kotlinx.serialization.json.buildJsonObject
import kotlinx.serialization.json.encodeToJsonElement
import kotlinx.serialization.json.put
import kotlinx.serialization.json.putJsonArray
import kotlinx.serialization.json.putJsonObject
import org.jetbrains.kotlinx.dataframe.AnyCol
import org.jetbrains.kotlinx.dataframe.AnyFrame
import org.jetbrains.kotlinx.dataframe.ColumnsContainer
import org.jetbrains.kotlinx.dataframe.DataColumn
import org.jetbrains.kotlinx.dataframe.api.FormattedFrame
import org.jetbrains.kotlinx.dataframe.api.indices
import org.jetbrains.kotlinx.dataframe.api.isList
import org.jetbrains.kotlinx.dataframe.api.rows
import org.jetbrains.kotlinx.dataframe.api.schema
import org.jetbrains.kotlinx.dataframe.api.take
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
import org.jetbrains.kotlinx.dataframe.columns.ColumnKind
import org.jetbrains.kotlinx.dataframe.columns.FrameColumn
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.COLUMNS
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.DATA
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.IS_FORMATTED
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KIND
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KOTLIN_DATAFRAME
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.METADATA
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NCOL
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.NROW
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.TYPE
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.TYPES
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.VERSION
import org.jetbrains.kotlinx.dataframe.io.ARRAY_COLUMN_NAME
import org.jetbrains.kotlinx.dataframe.io.Base64ImageEncodingOptions
import org.jetbrains.kotlinx.dataframe.io.CustomEncoder
import org.jetbrains.kotlinx.dataframe.io.VALUE_COLUMN_NAME
import org.jetbrains.kotlinx.dataframe.jupyter.KotlinNotebookPluginUtils
import org.jetbrains.kotlinx.dataframe.name
import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema
import org.jetbrains.kotlinx.dataframe.typeClass
import java.awt.RenderingHints
import java.awt.image.BufferedImage
import java.awt.image.ImageObserver
import java.io.IOException
// Version string of the metadata-carrying JSON format; it is written under the
// `SerializationKeys.VERSION` key by `encodeDataFrameWithMetadata`.
// See docs/serialization_format.md for a description of
// serialization versions and format.
internal const val SERIALIZATION_VERSION = "2.2.0"
// JSON object keys used by the metadata-carrying encoders in this file.
internal object SerializationKeys {
// Payload of a wrapped nested value (column group row, nested frame, convertable object).
const val DATA = "data"
// Descriptor object that accompanies DATA, or describes the top-level frame.
const val METADATA = "metadata"
const val KIND = "kind"
const val NCOL = "ncol"
const val NROW = "nrow"
// The `$` is escaped so the key is literally "$version" rather than a string template.
const val VERSION = "\$version"
const val COLUMNS = "columns"
// Key under which the encoded rows of the top-level frame are stored.
const val KOTLIN_DATAFRAME = "kotlin_dataframe"
const val TYPE = "type"
const val TYPES = "types"
const val IS_FORMATTED = "is_formatted"
}
// Column type classes whose cells are passed to `convert` as-is (JSON boolean / number) by
// `encodeRow` and `encodeValue`; cells of any other non-list column are encoded via `toString()`.
private val valueTypes =
setOf(Boolean::class, Double::class, Int::class, Float::class, Long::class, Short::class, Byte::class)
/**
 * Turns an arbitrary cell value into a [JsonElement].
 *
 * - `null` becomes a JSON null,
 * - an existing [JsonElement] is returned untouched,
 * - [Boolean], [Number] and [String] become the matching JSON primitive,
 * - everything else (including [Char]) is encoded as a JSON string of its `toString()`.
 */
@OptIn(ExperimentalSerializationApi::class)
private fun convert(value: Any?): JsonElement {
    if (value == null) return JsonPrimitive(null)
    return when (value) {
        is JsonElement -> value
        is Boolean -> JsonPrimitive(value)
        is Number -> JsonPrimitive(value)
        is String -> JsonPrimitive(value)
        // Char and any other type share the same string representation path.
        else -> JsonPrimitive(value.toString())
    }
}
/**
 * Encodes row [index] of [frame] as a plain JSON object keyed by column name.
 *
 * Column groups are encoded recursively as nested objects, frame columns as arrays via
 * [encodeFrame], list cells as JSON arrays (or JSON null for a null cell), cells of a type in
 * `valueTypes` as primitives, and every other cell as its `toString()` (or JSON null).
 * A container without columns yields an empty JSON object.
 */
internal fun encodeRow(frame: ColumnsContainer<*>, index: Int): JsonObject {
    val encoded = LinkedHashMap<String, JsonElement>()
    for (column in frame.columns()) {
        encoded[column.name] = when (column) {
            is ColumnGroup<*> -> encodeRow(column, index)

            is FrameColumn<*> -> encodeFrame(column[index])

            else -> when {
                column.isList() -> {
                    val items = column[index] as List<*>?
                    if (items == null) JsonPrimitive(null) else JsonArray(items.map(::convert))
                }

                column.typeClass in valueTypes -> convert(column[index])

                else -> JsonPrimitive(column[index]?.toString())
            }
        }
    }
    return JsonObject(encoded)
}
/**
 * Encodes row [index] of [frame] as a JSON object in which nested structures carry metadata.
 *
 * - A column group becomes `{ data, metadata }` where `data` is the recursively encoded row
 *   (JSON null when the group has no columns) and `metadata` lists kind, column names and types.
 * - A frame column cell becomes `{ data, metadata }` where `data` holds at most [rowLimit] rows
 *   (all rows when [rowLimit] is `null`), while `ncol` / `nrow` in `metadata` describe the
 *   untruncated nested frame.
 * - Any other column is encoded through [encodeValue] with [customEncoders].
 *
 * @return the encoded row, or `null` when [frame] has no columns.
 */
internal fun encodeRowWithMetadata(
    frame: ColumnsContainer<*>,
    index: Int,
    rowLimit: Int? = null,
    customEncoders: List<CustomEncoder> = emptyList(),
): JsonElement? {
    val columns = frame.columns()
    if (columns.isEmpty()) return null

    val encoded = LinkedHashMap<String, JsonElement>()
    for (column in columns) {
        encoded[column.name] = when (column) {
            is ColumnGroup<*> -> {
                val groupSchema = column.schema()
                buildJsonObject {
                    val groupRow = encodeRowWithMetadata(column, index, rowLimit, customEncoders)
                    put(DATA, groupRow ?: JsonPrimitive(null))
                    putJsonObject(METADATA) {
                        put(KIND, JsonPrimitive(ColumnKind.Group.toString()))
                        put(COLUMNS, Json.encodeToJsonElement(groupSchema.columns.keys))
                        putJsonArray(TYPES) {
                            addAll(groupSchema.columns.values.map { createJsonTypeDescriptor(it) })
                        }
                    }
                }
            }

            is FrameColumn<*> -> {
                val nestedFrame = column[index]
                // Only the payload is truncated; the metadata below reports the full size.
                val visibleRows = if (rowLimit == null) nestedFrame else nestedFrame.take(rowLimit)
                val data = encodeFrameWithMetadata(visibleRows, rowLimit, customEncoders)
                val frameSchema = column.schema.value
                buildJsonObject {
                    put(DATA, data)
                    putJsonObject(METADATA) {
                        put(KIND, JsonPrimitive(ColumnKind.Frame.toString()))
                        put(COLUMNS, Json.encodeToJsonElement(frameSchema.columns.keys))
                        putJsonArray(TYPES) {
                            addAll(frameSchema.columns.values.map { createJsonTypeDescriptor(it) })
                        }
                        put(NCOL, JsonPrimitive(nestedFrame.columnsCount()))
                        put(NROW, JsonPrimitive(nestedFrame.rowsCount()))
                    }
                }
            }

            else -> encodeValue(column, index, customEncoders)
        }
    }
    return JsonObject(encoded)
}
/**
 * Encodes the cell at [index] of the (non-group, non-frame) column [col].
 *
 * The first encoder in [customEncoders] that accepts the cell wins. Otherwise list cells become
 * JSON arrays (an empty array for a null cell), cells of a type in `valueTypes` become JSON
 * primitives, and anything else is encoded as its `toString()` (JSON null for a null cell).
 */
internal fun encodeValue(col: AnyCol, index: Int, customEncoders: List<CustomEncoder> = emptyList()): JsonElement {
    val cell = col[index]

    val customEncoder = customEncoders.firstOrNull { it.canEncode(cell) }
    if (customEncoder != null) return customEncoder.encode(cell)

    if (col.isList()) {
        val items = (cell as List<*>?) ?: return JsonArray(emptyList())
        return JsonArray(items.map(::convert))
    }

    return if (col.typeClass in valueTypes) convert(cell) else JsonPrimitive(cell?.toString())
}
/**
 * [CustomEncoder] for cells that `isDataframeConvertable` accepts.
 *
 * Such a cell is converted with `KotlinNotebookPluginUtils.convertToDataFrame`, encoded via
 * [encodeFrameWithMetadata] using [rowLimit] and the nested [encoders], and wrapped as
 * `{ data, metadata }` where the metadata records the cell kind and whether the input was a
 * [FormattedFrame]. A `null` input is encoded as JSON null.
 */
internal class DataframeConvertableEncoder(
    private val encoders: List<CustomEncoder>,
    private val rowLimit: Int? = null,
) : CustomEncoder {

    override fun canEncode(input: Any?): Boolean = isDataframeConvertable(input)

    override fun encode(input: Any?): JsonElement {
        if (input == null) return JsonPrimitive(null)

        val frameJson = encodeFrameWithMetadata(
            KotlinNotebookPluginUtils.convertToDataFrame(input),
            rowLimit,
            encoders,
        )
        val formatted = input is FormattedFrame<*>

        return buildJsonObject {
            put(DATA, frameJson)
            putJsonObject(METADATA) {
                put(KIND, JsonPrimitive(CellKind.DataFrameConvertable.toString()))
                put(IS_FORMATTED, JsonPrimitive(formatted))
            }
        }
    }
}
/**
 * [CustomEncoder] that turns [BufferedImage] cells into a Base64 JSON string.
 *
 * Depending on [options] the image is first resized (keeping its aspect ratio) and/or its bytes
 * are gzip-compressed before Base64 encoding. A `null` input, or an [IOException] while
 * encoding, produces an empty string.
 */
internal class BufferedImageEncoder(private val options: Base64ImageEncodingOptions) : CustomEncoder {

    override fun canEncode(input: Any?): Boolean = input is BufferedImage

    override fun encode(input: Any?): JsonElement {
        val base64 = if (input == null) "" else encodeBufferedImageAsBase64(input as BufferedImage, options)
        return JsonPrimitive(base64)
    }

    private fun encodeBufferedImageAsBase64(
        image: BufferedImage,
        imageEncodingOptions: Base64ImageEncodingOptions = Base64ImageEncodingOptions(),
    ): String {
        return try {
            val source = when {
                imageEncodingOptions.isLimitSizeOn ->
                    image.resizeKeepingAspectRatio(imageEncodingOptions.imageSizeLimit)

                else -> image
            }
            val rawBytes = source.toByteArray()
            val payload = if (imageEncodingOptions.isGzipOn) rawBytes.encodeGzip() else rawBytes
            payload.toBase64()
        } catch (_: IOException) {
            // Best effort: an image that cannot be serialized is rendered as an empty string.
            ""
        }
    }
}
/**
 * Builds the type descriptor of a column for the `types` metadata array: always its `kind`,
 * plus its `type` (the schema type's `toString()`) when the column is a value column.
 */
private fun createJsonTypeDescriptor(columnSchema: ColumnSchema): JsonObject =
    buildJsonObject {
        put(KIND, JsonPrimitive(columnSchema.kind.toString()))
        if (columnSchema.kind == ColumnKind.Value) {
            put(TYPE, JsonPrimitive(columnSchema.type.toString()))
        }
    }
/**
 * Encodes every row of [frame] into a JSON array, with metadata attached to nested structures.
 *
 * For each row the first applicable representation is used:
 * 1. the non-null cell of the unnamed value column (see [extractValueColumn]),
 * 2. the recursively encoded nested frame of the unnamed array column, when that column is a
 *    frame column (see [extractArrayColumn]),
 * 3. the whole row via [encodeRowWithMetadata] (JSON null when that returns `null`).
 *
 * [rowLimit] and [customEncoders] are passed on to the nested encoders.
 */
internal fun encodeFrameWithMetadata(
    frame: AnyFrame,
    rowLimit: Int? = null,
    customEncoders: List<CustomEncoder> = emptyList(),
): JsonArray {
    val valueColumn = frame.extractValueColumn()
    val arrayColumn = frame.extractArrayColumn()
    val arraysAreFrames = arrayColumn?.kind() == ColumnKind.Frame

    val encodedRows = frame.indices().map { rowIndex ->
        val unnamedValue = valueColumn?.get(rowIndex)
        val cell: Any? = if (unnamedValue != null) {
            unnamedValue
        } else {
            // Only frame-typed array cells are unwrapped; list cells go through the row encoder.
            val nestedFrame = arrayColumn?.get(rowIndex).takeIf { arraysAreFrames }
            if (nestedFrame != null) {
                encodeFrameWithMetadata(nestedFrame as AnyFrame, rowLimit, customEncoders)
            } else {
                encodeRowWithMetadata(frame, rowIndex, rowLimit, customEncoders)
            }
        }
        convert(cell)
    }
    return buildJsonArray { addAll(encodedRows) }
}
/**
 * Finds the column that holds "unnamed" primitive values, if this frame has one.
 *
 * Among the columns whose name starts with [VALUE_COLUMN_NAME], the one with the greatest name
 * is the candidate. It is returned only when
 * - the frame does not consist of exactly one column (see [isPossibleToFindUnnamedColumns]),
 * - the candidate is a value column, and
 * - in every row where the candidate is non-null, all other columns are null.
 *
 * @return the value column, or `null` when no column qualifies.
 */
internal fun AnyFrame.extractValueColumn(): DataColumn<*>? {
    if (!isPossibleToFindUnnamedColumns) return null

    val allColumns = columns()
    val candidate = allColumns
        .filter { it.name.startsWith(VALUE_COLUMN_NAME) }
        .maxByOrNull { it.name }
        ?: return null
    if (candidate.kind() != ColumnKind.Value) return null

    val otherColumns = allColumns.filter { it.name != candidate.name }
    // A value may only be present in rows where every other column is empty.
    val isExclusive = rows().all { row ->
        candidate[row] == null || otherColumns.all { it[row] == null }
    }
    return candidate.takeIf { isExclusive }
}
/**
 * `false` only for frames with exactly one column.
 *
 * With a single column the "all other columns are null" validation of an unnamed column is
 * trivially true. A frame such as `dataFrameOf("value")(1, 2, 3)` was created by the user
 * though, so its only column must not be treated as an unnamed column.
 */
internal val AnyFrame.isPossibleToFindUnnamedColumns: Boolean
    get() {
        val columnCount = columns().size
        return columnCount != 1
    }
/**
 * Finds the column that holds "unnamed" arrays, if this frame has one.
 *
 * Among the columns whose name starts with [ARRAY_COLUMN_NAME], the one with the greatest name
 * is the candidate. It is returned only when
 * - the frame does not consist of exactly one column (see [isPossibleToFindUnnamedColumns]),
 * - the candidate is not a column group, and
 * - in every row where the candidate is non-null, all other columns are null.
 *
 * @return the array column, or `null` when no column qualifies.
 */
internal fun AnyFrame.extractArrayColumn(): DataColumn<*>? {
    if (!isPossibleToFindUnnamedColumns) return null

    val allColumns = columns()
    val candidate = allColumns
        .filter { it.name.startsWith(ARRAY_COLUMN_NAME) }
        .maxByOrNull { it.name }
        ?: return null
    if (candidate.kind() == ColumnKind.Group) return null

    val otherColumns = allColumns.filter { it.name != candidate.name }
    // An array may only be present in rows where every other column is empty.
    val isExclusive = rows().all { row ->
        candidate[row] == null || otherColumns.all { it[row] == null }
    }
    return candidate.takeIf { isExclusive }
}
/**
 * Encodes every row of [frame] into a plain JSON array (no metadata).
 *
 * For each row the first applicable representation is used:
 * 1. the non-null cell of the unnamed value column (see [extractValueColumn]),
 * 2. the recursively encoded nested frame of the unnamed array column, when that column is a
 *    frame column (see [extractArrayColumn]),
 * 3. the whole row as a JSON object via [encodeRow].
 *
 * The fall-through is per row, mirroring [encodeFrameWithMetadata]. The mere existence of an
 * unnamed value/array column must not suppress rows in which that column is null (or is a plain
 * list) while other columns carry data; such rows used to be serialized as JSON null.
 */
internal fun encodeFrame(frame: AnyFrame): JsonArray {
    val valueColumn = frame.extractValueColumn()
    val arrayColumn = frame.extractArrayColumn()
    val arraysAreFrames = arrayColumn?.kind() == ColumnKind.Frame

    val data = frame.indices().map { rowIndex ->
        valueColumn?.get(rowIndex)
            ?: arrayColumn?.get(rowIndex)?.let {
                // Only frame-typed array cells are unwrapped; list cells go through encodeRow.
                if (arraysAreFrames) encodeFrame(it as AnyFrame) else null
            }
            ?: encodeRow(frame, rowIndex)
    }
    return buildJsonArray { addAll(data.map { convert(it) }) }
}
/**
 * Produces the top-level metadata-carrying JSON document for [frame].
 *
 * The document contains the format version, a `metadata` object (column names, column type
 * descriptors, row and column counts of the full frame, and [isFormatted]) and the encoded rows.
 *
 * @param rowLimit number of top-level rows to encode; the reported `nrow` is not truncated.
 * @param nestedRowLimit row limit passed to [encodeFrameWithMetadata] for nested frames.
 * @param customEncoders encoders consulted for individual cells.
 */
internal fun encodeDataFrameWithMetadata(
    frame: AnyFrame,
    rowLimit: Int,
    nestedRowLimit: Int? = null,
    customEncoders: List<CustomEncoder> = emptyList(),
    isFormatted: Boolean = false,
): JsonObject {
    val columnNames = frame.columnNames().map { JsonPrimitive(it) }
    val typeDescriptors = frame.schema().columns.values.map { createJsonTypeDescriptor(it) }
    val encodedRows = encodeFrameWithMetadata(
        frame = frame.take(rowLimit),
        rowLimit = nestedRowLimit,
        customEncoders = customEncoders,
    )

    return buildJsonObject {
        put(VERSION, JsonPrimitive(SERIALIZATION_VERSION))
        putJsonObject(METADATA) {
            putJsonArray(COLUMNS) { addAll(columnNames) }
            putJsonArray(TYPES) { addAll(typeDescriptors) }
            put(NROW, JsonPrimitive(frame.rowsCount()))
            put(NCOL, JsonPrimitive(frame.columnsCount()))
            put(IS_FORMATTED, JsonPrimitive(isFormatted))
        }
        put(KOTLIN_DATAFRAME, encodedRows)
    }
}
/**
 * Produces a flat JSON document for [df]: row count, column count and column names of the full
 * frame, the [isFormatted] flag, and the first [limit] rows encoded with [encodeFrame]
 * (i.e. without per-cell metadata).
 */
@OptIn(ExperimentalSerializationApi::class)
internal fun encodeFrameNoDynamicNestedTables(df: AnyFrame, limit: Int, isFormatted: Boolean): JsonObject {
    val encodedRows = encodeFrame(df.take(limit))
    return buildJsonObject {
        put(NROW, df.rowsCount())
        put(NCOL, df.columnsCount())
        putJsonArray(COLUMNS) { addAll(df.columnNames()) }
        put(IS_FORMATTED, JsonPrimitive(isFormatted))
        put(KOTLIN_DATAFRAME, encodedRows)
    }
}
// region friend module error suppression
// Local stand-in that re-exposes `org.jetbrains.kotlinx.dataframe.columns.CellKind.DataFrameConvertable`
// under the short name `CellKind.DataFrameConvertable` used by `DataframeConvertableEncoder`.
// INVISIBLE_REFERENCE is suppressed because the target is not normally visible from this file.
@Suppress("INVISIBLE_REFERENCE")
private object CellKind {
val DataFrameConvertable = org.jetbrains.kotlinx.dataframe.columns.CellKind.DataFrameConvertable
}
// Forwards to `KotlinNotebookPluginUtils.isDataframeConvertable`.
// INVISIBLE_REFERENCE is suppressed because the target is not normally visible from this file.
@Suppress("INVISIBLE_REFERENCE")
private fun isDataframeConvertable(dataframeLike: Any?) =
KotlinNotebookPluginUtils.isDataframeConvertable(dataframeLike = dataframeLike)
// Extension-style alias: passes the receiver as `image`, plus all other arguments unchanged, to
// `org.jetbrains.kotlinx.dataframe.impl.io.resizeKeepingAspectRatio`.
// The defaults declared here apply when a caller in this module omits an argument.
// INVISIBLE_REFERENCE is suppressed because the target is not normally visible from this file.
@Suppress("INVISIBLE_REFERENCE")
internal fun BufferedImage.resizeKeepingAspectRatio(
maxSize: Int,
resultImageType: Int = BufferedImage.TYPE_INT_ARGB,
interpolation: Any = RenderingHints.VALUE_INTERPOLATION_NEAREST_NEIGHBOR,
renderingQuality: Any = RenderingHints.VALUE_RENDER_QUALITY,
antialiasing: Any = RenderingHints.VALUE_ANTIALIAS_ON,
observer: ImageObserver? = null,
) = org.jetbrains.kotlinx.dataframe.impl.io.resizeKeepingAspectRatio(
image = this,
maxSize = maxSize,
resultImageType = resultImageType,
interpolation = interpolation,
renderingQuality = renderingQuality,
antialiasing = antialiasing,
observer = observer,
)
// Image format name used by `toByteArray` when the caller does not specify one.
private const val DEFAULT_IMG_FORMAT: String = "png"
// Extension-style alias: passes the receiver as `image` to
// `org.jetbrains.kotlinx.dataframe.impl.io.toByteArray`, together with [format].
// INVISIBLE_REFERENCE is suppressed because the target is not normally visible from this file.
@Suppress("INVISIBLE_REFERENCE")
private fun BufferedImage.toByteArray(format: String = DEFAULT_IMG_FORMAT) =
org.jetbrains.kotlinx.dataframe.impl.io.toByteArray(image = this, format = format)
// endregion
@@ -0,0 +1,19 @@
package org.jetbrains.kotlinx.dataframe.io;
import org.jetbrains.kotlinx.dataframe.DataFrame;
import org.jetbrains.kotlinx.dataframe.annotations.RequiredByIntellijPlugin;
import java.util.Collections;
/**
 * Java entry point to the JSON-with-metadata serialization, intended to be called from the debugger.
 */
class JsonFacadeForDebugger {
/**
 * Utility for rendering a dataframe as an interactive table in the debugger, which needs a JSON model.
 * It is a Java class because that is easier to discover from the debugger.
 * <p>
 * Delegates to {@code JsonKt.toJsonWithMetadata}, passing {@code df}, {@code rowLimit} and
 * {@code nestedRowLimit} through, an empty list, and {@code false} for both boolean arguments.
 * NOTE(review): the meaning of the two boolean flags and of the list is defined by
 * {@code toJsonWithMetadata}, which is not visible here; confirm against its signature.
 * <p>
 * DO NOT BREAK THE ABI OF THIS METHOD!
 * Keep it for backward compatibility; create a new method if the signature must change.
 */
@RequiredByIntellijPlugin
static String convertToJson(DataFrame<?> df, int rowLimit, Integer nestedRowLimit) {
return JsonKt.toJsonWithMetadata(df, rowLimit, nestedRowLimit, false, Collections.emptyList(), false);
}
}
@@ -0,0 +1,484 @@
package org.jetbrains.kotlinx.dataframe.io
import kotlinx.serialization.ExperimentalSerializationApi
import kotlinx.serialization.json.Json
import kotlinx.serialization.json.JsonElement
import kotlinx.serialization.json.decodeFromStream
import org.intellij.lang.annotations.Language
import org.jetbrains.kotlinx.dataframe.AnyFrame
import org.jetbrains.kotlinx.dataframe.AnyRow
import org.jetbrains.kotlinx.dataframe.DataColumn
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.DataRow
import org.jetbrains.kotlinx.dataframe.api.FormattedFrame
import org.jetbrains.kotlinx.dataframe.api.JsonPath
import org.jetbrains.kotlinx.dataframe.api.NameValueProperty
import org.jetbrains.kotlinx.dataframe.api.single
import org.jetbrains.kotlinx.dataframe.codeGen.AbstractDefaultReadMethod
import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
import org.jetbrains.kotlinx.dataframe.columns.FrameColumn
import org.jetbrains.kotlinx.dataframe.documentation.UnifyingNumbers
import org.jetbrains.kotlinx.dataframe.impl.io.encodeDataFrameWithMetadata
import org.jetbrains.kotlinx.dataframe.impl.io.encodeFrame
import org.jetbrains.kotlinx.dataframe.impl.io.encodeRow
import org.jetbrains.kotlinx.dataframe.impl.io.readJsonImpl
import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic
import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.ANY_COLUMNS
import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.ARRAY_AND_VALUE_COLUMNS
import java.io.File
import java.io.InputStream
import java.net.URL
import java.nio.file.Path
import kotlin.io.path.writeText
import kotlin.reflect.typeOf
/**
 * [SupportedDataFrameFormat] implementation for JSON sources.
 *
 * The constructor arguments are the reading options that are passed to `DataFrame.readJson`
 * and that are rendered into the generated default read method.
 */
public class JSON(
    private val typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS,
    private val keyValuePaths: List<JsonPath> = emptyList(),
    private val unifyNumbers: Boolean = true,
) : SupportedDataFrameFormat {

    override fun readDataFrame(stream: InputStream, header: List<String>): AnyFrame =
        DataFrame.readJson(stream, header, keyValuePaths, typeClashTactic, unifyNumbers)

    override fun readDataFrame(path: Path, header: List<String>): AnyFrame =
        DataFrame.readJson(path, header, keyValuePaths, typeClashTactic, unifyNumbers)

    override fun acceptsExtension(ext: String): Boolean = "json" == ext

    // The extension check is considered sufficient, so any sample is accepted.
    override fun acceptsSample(sample: SupportedFormatSample): Boolean = true

    override val testOrder: Int = 10_000

    override fun createDefaultReadMethod(pathRepresentation: String?): DefaultReadDfMethod {
        // Source-code rendering of the configured key/value paths, e.g. `listOf(JsonPath("""$.a"""))`.
        val keyValuePathsCode = keyValuePaths.joinToString(prefix = "listOf(", postfix = ")") {
            "org.jetbrains.kotlinx.dataframe.api.JsonPath(\"\"\"${it.path}\"\"\")"
        }
        val typeClashTacticCode = "org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.${typeClashTactic.name}"

        val arguments = MethodArguments()
            .add("keyValuePaths", typeOf<List<JsonPath>>(), keyValuePathsCode)
            .add("typeClashTactic", typeOf<TypeClashTactic>(), typeClashTacticCode)
            .add("unifyNumbers", typeOf<Boolean>(), unifyNumbers.toString())

        return DefaultReadJsonMethod(path = pathRepresentation, arguments = arguments)
    }

    /**
     * Selects how type clashes are handled when reading a JSON file, for input such as:
     * ```json
     * [
     *     { "a": "text" },
     *     { "a": { "b": 2 } },
     *     { "a": [6, 7, 8] }
     * ]
     * ```
     *
     * [ARRAY_AND_VALUE_COLUMNS] (the default) produces a [DataFrame] like the following
     * (including the `null` and `[]` values):
     * ```
     * ⌌----------------------------------------------⌍
     * |  | a:{b:Int?, value:String?, array:List<Int>}|
     * |--|-------------------------------------------|
     * | 0|         { b:null, value:"text", array:[] }|
     * | 1|              { b:2, value:null, array:[] }|
     * | 2|    { b:null, value:null, array:[6, 7, 8] }|
     * ⌎----------------------------------------------⌏
     * ```
     * That is, the clashing property becomes a [ColumnGroup] with the properties `value` and
     * `array`, plus the unwrapped properties of the objects the property can hold.
     *
     * [ANY_COLUMNS] produces a [DataFrame] like:
     * ```
     * ⌌-------------⌍
     * |  |     a:Any|
     * |--|----------|
     * | 0|    "text"|
     * | 1|   { b:2 }|
     * | 2| [6, 7, 8]|
     * ⌎-------------⌏
     * ```
     */
    public enum class TypeClashTactic {
        ARRAY_AND_VALUE_COLUMNS,
        ANY_COLUMNS,
    }
}
// Base names of the "unnamed" columns: the writer (`extractArrayColumn` / `extractValueColumn`)
// looks for columns whose name starts with these prefixes.
internal const val ARRAY_COLUMN_NAME: String = "array"
internal const val VALUE_COLUMN_NAME: String = "value"
/**
 * Reads the JSON stored in [file] into a [DataFrame]. Delegates to the [Path] overload.
 *
 * @param file The file containing the JSON to convert.
 * @param header Optional list of column names. If given, the file will be read like an object
 *   with [header] being the keys.
 * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a
 *   [FrameColumn]<[NameValueProperty]> will be created.
 * @param typeClashTactic How to handle type clashes when reading a JSON file.
 * @param unifyNumbers Whether to [unify the numbers that are read][UnifyingNumbers]. `true` by default.
 * @return [DataFrame] from the given [file].
 */
public fun DataFrame.Companion.readJson(
    file: File,
    header: List<String> = emptyList(),
    keyValuePaths: List<JsonPath> = emptyList(),
    typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS,
    unifyNumbers: Boolean = true,
): AnyFrame =
    DataFrame.readJson(
        path = file.toPath(),
        header = header,
        keyValuePaths = keyValuePaths,
        typeClashTactic = typeClashTactic,
        unifyNumbers = unifyNumbers,
    )
/**
 * Reads the JSON located at [path] into a [DataFrame]. The path is converted to a [URL] and
 * read through the URL overload.
 *
 * @param path The location of the JSON to convert.
 * @param header Optional list of column names. If given, the file will be read like an object
 *   with [header] being the keys.
 * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a
 *   [FrameColumn]<[NameValueProperty]> will be created.
 * @param typeClashTactic How to handle type clashes when reading a JSON file.
 * @param unifyNumbers Whether to [unify the numbers that are read][UnifyingNumbers]. `true` by default.
 * @return [DataFrame] from the given [path].
 */
public fun DataFrame.Companion.readJson(
    path: Path,
    header: List<String> = emptyList(),
    keyValuePaths: List<JsonPath> = emptyList(),
    typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS,
    unifyNumbers: Boolean = true,
): AnyFrame =
    DataFrame.readJson(
        url = path.toUri().toURL(),
        header = header,
        keyValuePaths = keyValuePaths,
        typeClashTactic = typeClashTactic,
        unifyNumbers = unifyNumbers,
    )
/**
 * Reads the JSON stored in [file] into a single [DataRow]. The JSON is read as a [DataFrame]
 * first, and its single row is returned.
 *
 * @param file The file containing the JSON to convert.
 * @param header Optional list of column names. If given, the file will be read like an object
 *   with [header] being the keys.
 * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a
 *   [FrameColumn]<[NameValueProperty]> will be created.
 * @param typeClashTactic How to handle type clashes when reading a JSON file.
 * @param unifyNumbers Whether to [unify the numbers that are read][UnifyingNumbers]. `true` by default.
 * @return [DataRow] from the given [file].
 */
public fun DataRow.Companion.readJson(
    file: File,
    header: List<String> = emptyList(),
    keyValuePaths: List<JsonPath> = emptyList(),
    typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS,
    unifyNumbers: Boolean = true,
): AnyRow =
    DataFrame.readJson(
        path = file.toPath(),
        header = header,
        keyValuePaths = keyValuePaths,
        typeClashTactic = typeClashTactic,
        unifyNumbers = unifyNumbers,
    ).single()
/**
 * Reads the JSON located at [path] into a single [DataRow]. The JSON is read as a [DataFrame]
 * first, and its single row is returned.
 *
 * @param path The location of the JSON to convert.
 * @param header Optional list of column names. If given, the file will be read like an object
 *   with [header] being the keys.
 * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a
 *   [FrameColumn]<[NameValueProperty]> will be created.
 * @param typeClashTactic How to handle type clashes when reading a JSON file.
 * @param unifyNumbers Whether to [unify the numbers that are read][UnifyingNumbers]. `true` by default.
 * @return [DataRow] from the given [path].
 */
public fun DataRow.Companion.readJson(
    path: Path,
    header: List<String> = emptyList(),
    keyValuePaths: List<JsonPath> = emptyList(),
    typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS,
    unifyNumbers: Boolean = true,
): AnyRow =
    DataFrame.readJson(
        path = path,
        header = header,
        keyValuePaths = keyValuePaths,
        typeClashTactic = typeClashTactic,
        unifyNumbers = unifyNumbers,
    ).single()
/**
 * Reads the JSON found at [path] into a [DataFrame]. The string is resolved with `asUrl` and
 * the result is read through the matching overload.
 *
 * @param path URL or file path from where to fetch the JSON to convert.
 * @param header Optional list of column names. If given, the stream will be read like an object
 *   with [header] being the keys.
 * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a
 *   [FrameColumn]<[NameValueProperty]> will be created.
 * @param typeClashTactic How to handle type clashes when reading a JSON file.
 * @param unifyNumbers Whether to [unify the numbers that are read][UnifyingNumbers]. `true` by default.
 * @return [DataFrame] from the given [path].
 */
public fun DataFrame.Companion.readJson(
    path: String,
    header: List<String> = emptyList(),
    keyValuePaths: List<JsonPath> = emptyList(),
    typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS,
    unifyNumbers: Boolean = true,
): AnyFrame {
    val resolved = asUrl(path)
    return DataFrame.readJson(
        resolved,
        header = header,
        keyValuePaths = keyValuePaths,
        typeClashTactic = typeClashTactic,
        unifyNumbers = unifyNumbers,
    )
}
/**
 * Reads the JSON found at [path] into a single [DataRow]. The JSON is read as a [DataFrame]
 * first, and its single row is returned.
 *
 * @param path URL or file path from where to fetch the JSON to convert.
 * @param header Optional list of column names. If given, the stream will be read like an object
 *   with [header] being the keys.
 * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a
 *   [FrameColumn]<[NameValueProperty]> will be created.
 * @param typeClashTactic How to handle type clashes when reading a JSON file.
 * @param unifyNumbers Whether to [unify the numbers that are read][UnifyingNumbers]. `true` by default.
 * @return [DataRow] from the given [path].
 */
public fun DataRow.Companion.readJson(
    path: String,
    header: List<String> = emptyList(),
    keyValuePaths: List<JsonPath> = emptyList(),
    typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS,
    unifyNumbers: Boolean = true,
): AnyRow =
    DataFrame.readJson(
        path = path,
        header = header,
        keyValuePaths = keyValuePaths,
        typeClashTactic = typeClashTactic,
        unifyNumbers = unifyNumbers,
    ).single()
/**
 * Reads the JSON served at [url] into a [DataFrame]. The URL is opened through
 * `catchHttpResponse`, and the value it hands to the callback is read with the matching overload.
 *
 * @param url Where to fetch the JSON to convert.
 * @param header Optional list of column names. If given, the stream will be read like an object
 *   with [header] being the keys.
 * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a
 *   [FrameColumn]<[NameValueProperty]> will be created.
 * @param typeClashTactic How to handle type clashes when reading a JSON file.
 * @param unifyNumbers Whether to [unify the numbers that are read][UnifyingNumbers]. `true` by default.
 * @return [DataFrame] from the given [url].
 */
public fun DataFrame.Companion.readJson(
    url: URL,
    header: List<String> = emptyList(),
    keyValuePaths: List<JsonPath> = emptyList(),
    typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS,
    unifyNumbers: Boolean = true,
): AnyFrame =
    catchHttpResponse(url) { response ->
        DataFrame.readJson(
            response,
            header = header,
            keyValuePaths = keyValuePaths,
            typeClashTactic = typeClashTactic,
            unifyNumbers = unifyNumbers,
        )
    }
/**
 * Reads the JSON served at [url] into a single [DataRow]. The JSON is read as a [DataFrame]
 * first, and its single row is returned.
 *
 * @param url Where to fetch the JSON to convert.
 * @param header Optional list of column names. If given, the stream will be read like an object
 *   with [header] being the keys.
 * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a
 *   [FrameColumn]<[NameValueProperty]> will be created.
 * @param typeClashTactic How to handle type clashes when reading a JSON file.
 * @param unifyNumbers Whether to [unify the numbers that are read][UnifyingNumbers]. `true` by default.
 * @return [DataRow] from the given [url].
 */
public fun DataRow.Companion.readJson(
    url: URL,
    header: List<String> = emptyList(),
    keyValuePaths: List<JsonPath> = emptyList(),
    typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS,
    unifyNumbers: Boolean = true,
): AnyRow =
    DataFrame.readJson(
        url = url,
        header = header,
        keyValuePaths = keyValuePaths,
        typeClashTactic = typeClashTactic,
        unifyNumbers = unifyNumbers,
    ).single()
/**
 * Reads JSON from [stream] into a [DataFrame]. The stream is decoded into a [JsonElement] tree,
 * which is then converted by `readJsonImpl`.
 *
 * @param stream JSON as [InputStream] to be converted to a [DataFrame].
 * @param header Optional list of column names. If given, [stream] will be read like an object
 *   with [header] being the keys.
 * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a
 *   [FrameColumn]<[NameValueProperty]> will be created.
 * @param typeClashTactic How to handle type clashes when reading a JSON file.
 * @param unifyNumbers Whether to [unify the numbers that are read][UnifyingNumbers]. `true` by default.
 * @return [DataFrame] from the given [stream].
 */
@OptIn(ExperimentalSerializationApi::class)
public fun DataFrame.Companion.readJson(
    stream: InputStream,
    header: List<String> = emptyList(),
    keyValuePaths: List<JsonPath> = emptyList(),
    typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS,
    unifyNumbers: Boolean = true,
): AnyFrame {
    val parsed = Json.decodeFromStream<JsonElement>(stream)
    return readJsonImpl(parsed, unifyNumbers, header, keyValuePaths, typeClashTactic)
}
/**
 * Reads JSON from [stream] into a single [DataRow]. The JSON is read as a [DataFrame] first,
 * and its single row is returned.
 *
 * @param stream JSON as [InputStream] to be converted to a [DataRow].
 * @param header Optional list of column names. If given, [stream] will be read like an object
 *   with [header] being the keys.
 * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a
 *   [FrameColumn]<[NameValueProperty]> will be created.
 * @param typeClashTactic How to handle type clashes when reading a JSON file.
 * @param unifyNumbers Whether to [unify the numbers that are read][UnifyingNumbers]. `true` by default.
 * @return [DataRow] from the given [stream].
 */
public fun DataRow.Companion.readJson(
    stream: InputStream,
    header: List<String> = emptyList(),
    keyValuePaths: List<JsonPath> = emptyList(),
    typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS,
    unifyNumbers: Boolean = true,
): AnyRow =
    DataFrame.readJson(
        stream = stream,
        header = header,
        keyValuePaths = keyValuePaths,
        typeClashTactic = typeClashTactic,
        unifyNumbers = unifyNumbers,
    ).single()
/**
 * Parses the JSON contained in [text] and converts it to a [DataFrame].
 *
 * @param text Json as [String] to be converted to a [DataFrame].
 * @param header Optional list of column names. If given, [text] will be read like an object with [header] being the keys.
 * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a [FrameColumn]<[NameValueProperty]>
 *     will be created.
 * @param typeClashTactic How to handle type clashes when reading a JSON file.
 * @param unifyNumbers Whether to [unify the numbers that are read][UnifyingNumbers]. `true` by default.
 * @return [DataFrame] from the given [text].
 */
public fun DataFrame.Companion.readJsonStr(
    @Language("json") text: String,
    header: List<String> = emptyList(),
    keyValuePaths: List<JsonPath> = emptyList(),
    typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS,
    unifyNumbers: Boolean = true,
): AnyFrame {
    // Step 1: parse the text into a generic JSON tree.
    val root = Json.parseToJsonElement(text)
    // Step 2: let the shared implementation turn that tree into columns.
    return readJsonImpl(root, unifyNumbers, header, keyValuePaths, typeClashTactic)
}
/**
 * Parses the JSON contained in [text] and converts it to a [DataRow].
 *
 * The text is first read as a [DataFrame]; the result is that frame's single row.
 *
 * @param text Json as [String] to be converted to a [DataRow].
 * @param header Optional list of column names. If given, [text] will be read like an object with [header] being the keys.
 * @param keyValuePaths List of [JsonPath]s where instead of a [ColumnGroup], a [FrameColumn]<[NameValueProperty]>
 *     will be created.
 * @param typeClashTactic How to handle type clashes when reading a JSON file.
 * @param unifyNumbers Whether to [unify the numbers that are read][UnifyingNumbers]. `true` by default.
 * @return [DataRow] from the given [text].
 */
public fun DataRow.Companion.readJsonStr(
    @Language("json") text: String,
    header: List<String> = emptyList(),
    keyValuePaths: List<JsonPath> = emptyList(),
    typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS,
    unifyNumbers: Boolean = true,
): AnyRow {
    val frame = DataFrame.readJsonStr(
        text = text,
        header = header,
        keyValuePaths = keyValuePaths,
        typeClashTactic = typeClashTactic,
        unifyNumbers = unifyNumbers,
    )
    return frame.single()
}
/**
 * Encodes this [DataFrame] as a JSON string.
 *
 * @param prettyPrint Whether the output should be formatted with indentation and line breaks.
 * @return The JSON text produced from this frame.
 */
public fun AnyFrame.toJson(prettyPrint: Boolean = false): String =
    Json {
        // `this` is the Json builder here; the right-hand side is the function parameter.
        this.prettyPrint = prettyPrint
        isLenient = true
        allowSpecialFloatingPointValues = true
    }.encodeToString(JsonElement.serializer(), encodeFrame(this))
/**
 * Serializes this DataFrame to a JSON string that carries additional metadata about the serialized data.
 * It is heavily used to implement some integration features in Kotlin Notebook IntelliJ IDEA plugin.
 *
 * @param rowLimit The maximum number of top-level dataframe rows to include in the output JSON.
 * @param nestedRowLimit The maximum number of nested frame rows to include in the output JSON.
 *     If null, all rows are included.
 *     Applied for each frame column recursively.
 * @param prettyPrint Specifies whether the output JSON should be formatted with indentation and line breaks.
 * @param customEncoders The options for encoding things like images.
 *     The default is empty list, which indicates that the image is not encoded as Base64.
 * @param isFormatted Specifies whether the DataFrame should be formatted,
 *     a.k.a. it comes from [FormattedFrame.df] or it contains a
 *     [DataColumn][DataColumn]`<`[FormattedFrame][FormattedFrame]`<*>>` at any depth.
 *     This is just a marker; formatting is applied by the renderer. Defaults to `false`.
 *
 * @return The DataFrame converted to a JSON string with metadata.
 */
public fun AnyFrame.toJsonWithMetadata(
    rowLimit: Int,
    nestedRowLimit: Int? = null,
    prettyPrint: Boolean = false,
    customEncoders: List<CustomEncoder> = emptyList(),
    isFormatted: Boolean = false,
): String {
    val format = Json {
        // `this` is the Json builder here; the right-hand side is the function parameter.
        this.prettyPrint = prettyPrint
        isLenient = true
        allowSpecialFloatingPointValues = true
    }

    // Build the JSON tree (rows plus metadata) before turning it into text.
    val payload = encodeDataFrameWithMetadata(
        frame = this,
        rowLimit = rowLimit,
        nestedRowLimit = nestedRowLimit,
        customEncoders = customEncoders,
        isFormatted = isFormatted,
    )

    return format.encodeToString(JsonElement.serializer(), payload)
}
/**
 * A custom encoder that can be applied to values during DataFrame JSON serialization.
 *
 * Instances are supplied to [toJsonWithMetadata] through its `customEncoders` parameter.
 *
 * NOTE(review): presumably [encode] is only invoked for inputs that [canEncode] accepted — confirm against
 * the serialization implementation.
 */
public interface CustomEncoder {
    /**
     * Determines whether this encoder can encode the given input.
     *
     * @param input The input object to be checked for suitability. May be `null`.
     * @return `true` if the input can be encoded, otherwise `false`.
     */
    public fun canEncode(input: Any?): Boolean
    /**
     * Encodes the provided input into a JSON element.
     *
     * @param input The input object to be encoded. May be `null`.
     * @return A [JsonElement] representing the encoded input.
     */
    public fun encode(input: Any?): JsonElement
}
/** Default value of [Base64ImageEncodingOptions.imageSizeLimit]. */
internal const val DEFAULT_IMG_SIZE = 600

/**
 * Options that control how images are encoded.
 *
 * @property imageSizeLimit The maximum size to which images should be resized. Defaults to [DEFAULT_IMG_SIZE].
 * @property options Bit mask built by combining [GZIP_ON] and [LIMIT_SIZE_ON] with `or`.
 *     Defaults to `GZIP_ON or LIMIT_SIZE_ON`, i.e. both flags set.
 */
public class Base64ImageEncodingOptions(
    public val imageSizeLimit: Int = DEFAULT_IMG_SIZE,
    private val options: Int = GZIP_ON or LIMIT_SIZE_ON,
) {
    /** `true` when the [GZIP_ON] bit is set in the options mask. */
    public val isGzipOn: Boolean
        get() = hasFlag(GZIP_ON)

    /** `true` when the [LIMIT_SIZE_ON] bit is set in the options mask. */
    public val isLimitSizeOn: Boolean
        get() = hasFlag(LIMIT_SIZE_ON)

    /** Checks that every bit of [flag] is present in the options mask. */
    private fun hasFlag(flag: Int): Boolean = (options and flag) == flag

    public companion object {
        /** No flags set. */
        public const val ALL_OFF: Int = 0

        /** Flag bit 0 (2^0). */
        public const val GZIP_ON: Int = 1

        /** Flag bit 1 (2^1). */
        public const val LIMIT_SIZE_ON: Int = 2
    }
}
/**
 * Encodes this [DataRow] as a JSON string.
 *
 * @param prettyPrint Whether the output should be formatted with indentation and line breaks.
 * @return The JSON text produced from this row.
 */
public fun AnyRow.toJson(prettyPrint: Boolean = false): String {
    val format = Json {
        // `this` is the Json builder here; the right-hand side is the function parameter.
        this.prettyPrint = prettyPrint
        isLenient = true
        allowSpecialFloatingPointValues = true
    }
    // The row is encoded through its owning frame and its index within that frame.
    val rowElement = encodeRow(df(), index())
    return format.encodeToString(JsonElement.serializer(), rowElement)
}
/**
 * Writes this [DataFrame] as JSON to [file]. Delegates to the [Path] overload.
 *
 * @param file Destination file.
 * @param prettyPrint Whether the output should be formatted with indentation and line breaks.
 */
public fun AnyFrame.writeJson(file: File, prettyPrint: Boolean = false): Unit =
    writeJson(file.toPath(), prettyPrint)
/**
 * Writes this [DataFrame] as JSON text to [path].
 *
 * @param path Destination path.
 * @param prettyPrint Whether the output should be formatted with indentation and line breaks.
 */
public fun AnyFrame.writeJson(path: Path, prettyPrint: Boolean = false) {
    val json = toJson(prettyPrint)
    path.writeText(json)
}
/**
 * Writes this [DataFrame] as JSON to the file located at [path]. Delegates to the [File] overload.
 *
 * @param path Destination file path.
 * @param prettyPrint Whether the output should be formatted with indentation and line breaks.
 */
public fun AnyFrame.writeJson(path: String, prettyPrint: Boolean = false) {
    val target = File(path)
    writeJson(target, prettyPrint)
}
/**
 * Appends the JSON form of this [DataFrame] to [writer].
 *
 * @param writer Target that receives the JSON text.
 * @param prettyPrint Whether the output should be formatted with indentation and line breaks.
 */
public fun AnyFrame.writeJson(writer: Appendable, prettyPrint: Boolean = false) {
    val json = toJson(prettyPrint)
    writer.append(json)
}
/**
 * Writes this [DataRow] as JSON to [file]. Delegates to the [Path] overload.
 *
 * @param file Destination file.
 * @param prettyPrint Whether the output should be formatted with indentation and line breaks.
 */
public fun AnyRow.writeJson(file: File, prettyPrint: Boolean = false): Unit =
    writeJson(file.toPath(), prettyPrint)
/**
 * Writes this [DataRow] as JSON text to [path].
 *
 * @param path Destination path.
 * @param prettyPrint Whether the output should be formatted with indentation and line breaks.
 */
public fun AnyRow.writeJson(path: Path, prettyPrint: Boolean = false): Unit =
    path.writeText(toJson(prettyPrint))
/**
 * Writes this [DataRow] as JSON to the file located at [path]. Delegates to the [File] overload.
 *
 * @param path Destination file path.
 * @param prettyPrint Whether the output should be formatted with indentation and line breaks.
 */
public fun AnyRow.writeJson(path: String, prettyPrint: Boolean = false): Unit =
    writeJson(File(path), prettyPrint)
/**
 * Appends the JSON form of this [DataRow] to [writer].
 *
 * @param writer Target that receives the JSON text.
 * @param prettyPrint Whether the output should be formatted with indentation and line breaks.
 */
public fun AnyRow.writeJson(writer: Appendable, prettyPrint: Boolean = false) {
    val json = toJson(prettyPrint)
    writer.append(json)
}
/** Method name passed to [AbstractDefaultReadMethod] by [DefaultReadJsonMethod]. */
private const val READ_JSON = "readJson"
/**
 * [AbstractDefaultReadMethod] specialised for JSON: [path] and [arguments] are forwarded unchanged and the
 * method name is fixed to [READ_JSON].
 *
 * NOTE(review): presumably describes a `DataFrame.readJson(...)` call for generated code — confirm against
 * [AbstractDefaultReadMethod].
 */
internal class DefaultReadJsonMethod(path: String?, arguments: MethodArguments) :
    AbstractDefaultReadMethod(
        path = path,
        arguments = arguments,
        methodName = READ_JSON,
    )
@@ -0,0 +1 @@
org.jetbrains.kotlinx.dataframe.io.JSON
@@ -0,0 +1,177 @@
package org.jetbrains.kotlinx.dataframe.io
import io.kotest.matchers.shouldBe
import io.kotest.matchers.string.shouldContain
import kotlinx.serialization.json.JsonObject
import kotlinx.serialization.json.jsonArray
import kotlinx.serialization.json.jsonObject
import kotlinx.serialization.json.jsonPrimitive
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
import org.jetbrains.kotlinx.dataframe.impl.io.BufferedImageEncoder
import org.jetbrains.kotlinx.dataframe.impl.io.SerializationKeys.KOTLIN_DATAFRAME
import org.jetbrains.kotlinx.dataframe.impl.io.resizeKeepingAspectRatio
import org.jetbrains.kotlinx.dataframe.io.Base64ImageEncodingOptions.Companion.ALL_OFF
import org.jetbrains.kotlinx.dataframe.io.Base64ImageEncodingOptions.Companion.GZIP_ON
import org.jetbrains.kotlinx.dataframe.io.Base64ImageEncodingOptions.Companion.LIMIT_SIZE_ON
import org.junit.jupiter.params.ParameterizedTest
import org.junit.jupiter.params.provider.MethodSource
import java.awt.image.BufferedImage
import java.io.ByteArrayInputStream
import java.io.ByteArrayOutputStream
import java.io.File
import java.util.Base64
import java.util.zip.GZIPInputStream
import javax.imageio.ImageIO
import kotlin.math.abs
/**
 * Round-trip tests for image serialization through [toJsonWithMetadata].
 *
 * Images from the `imgs` test resource directory are placed in a DataFrame column, serialized to JSON
 * (optionally through a [BufferedImageEncoder]), decoded back from the JSON and compared pixel by pixel
 * with the (optionally resized) originals.
 */
class ImageSerializationTests {
    @ParameterizedTest
    @MethodSource("imageEncodingOptionsToTest")
    fun `serialize images as base64`(encodingOptions: Base64ImageEncodingOptions?) {
        val images = readImagesFromResources()
        val json = encodeImagesAsJson(images, encodingOptions)

        // Without encoding options no custom encoder is registered, so only the textual form is checked.
        if (encodingOptions == DISABLED) {
            checkImagesEncodedAsToString(json, images.size)
            return
        }

        val decodedImages = decodeImagesFromJson(json, images.size, encodingOptions)

        for ((decodedImage, original) in decodedImages.zip(images)) {
            val expectedImage = resizeIfNeeded(original, encodingOptions)
            isImagesIdentical(decodedImage, expectedImage, 2) shouldBe true
        }
    }

    /**
     * Loads every file of the `imgs` test resource directory as a [BufferedImage].
     *
     * @return The decoded images, or an empty list when the directory cannot be listed.
     * @throws IllegalArgumentException if a file cannot be read or is not a decodable image.
     */
    private fun readImagesFromResources(): List<BufferedImage> {
        val dir = File(testResource("imgs").path)
        // listFiles() yields null when `dir` cannot be listed; the test then runs over zero images.
        val files = dir.listFiles() ?: return emptyList()
        return files.map { file ->
            val image: BufferedImage? = try {
                ImageIO.read(file)
            } catch (ex: Exception) {
                // Keep the original exception as the cause so its stack trace is not lost.
                throw IllegalArgumentException("Error reading ${file.name}: ${ex.message}", ex)
            }
            // ImageIO.read returns null instead of throwing when no registered reader can decode the file.
            requireNotNull(image) { "Error reading ${file.name}: no ImageIO reader can decode it" }
        }
    }

    /**
     * Puts [images] into a single `imgs` column and serializes the frame with [toJsonWithMetadata].
     *
     * A [BufferedImageEncoder] is registered only when [encodingOptions] is not null.
     */
    private fun encodeImagesAsJson(
        images: List<BufferedImage>,
        encodingOptions: Base64ImageEncodingOptions?,
    ): JsonObject {
        val df = dataFrameOf("imgs" to images)
        val jsonStr = df.toJsonWithMetadata(
            20,
            nestedRowLimit = 20,
            customEncoders = listOfNotNull(encodingOptions?.let { BufferedImageEncoder(encodingOptions) }),
        )

        return parseJsonStr(jsonStr)
    }

    /** Asserts that the `imgs` cell of each of the first [numImgs] rows contains the text "BufferedImage". */
    private fun checkImagesEncodedAsToString(json: JsonObject, numImgs: Int) {
        for (i in 0..<numImgs) {
            val row = json[KOTLIN_DATAFRAME]!!.jsonArray[i].jsonObject
            val img = row["imgs"]?.jsonPrimitive?.content

            img shouldContain "BufferedImage"
        }
    }

    /**
     * Reads the `imgs` cell of the first [imgsNum] rows stored under [KOTLIN_DATAFRAME] and decodes each
     * one back into a [BufferedImage] according to [encodingOptions].
     */
    private fun decodeImagesFromJson(
        json: JsonObject,
        imgsNum: Int,
        encodingOptions: Base64ImageEncodingOptions,
    ): List<BufferedImage> =
        (0..<imgsNum).map { i ->
            val row = json[KOTLIN_DATAFRAME]!!.jsonArray[i].jsonObject
            val imgString = row["imgs"]!!.jsonPrimitive.content

            val bytes = decodeBase64Image(imgString, encodingOptions)
            createImageFromBytes(bytes)
        }

    /** Base64-decodes [imgString] and additionally gunzips the result when gzip is enabled in [encodingOptions]. */
    private fun decodeBase64Image(imgString: String, encodingOptions: Base64ImageEncodingOptions): ByteArray =
        when {
            encodingOptions.isGzipOn -> decompressGzip(Base64.getDecoder().decode(imgString))
            else -> Base64.getDecoder().decode(imgString)
        }

    /** Returns the fully decompressed content of the gzip-compressed [input]. */
    private fun decompressGzip(input: ByteArray): ByteArray =
        ByteArrayOutputStream().use { byteArrayOutputStream ->
            GZIPInputStream(input.inputStream()).use { inputStream ->
                inputStream.copyTo(byteArrayOutputStream)
            }

            byteArrayOutputStream.toByteArray()
        }

    /** Applies the same size limit to [image] that [encodingOptions] requests, or returns it unchanged. */
    private fun resizeIfNeeded(image: BufferedImage, encodingOptions: Base64ImageEncodingOptions): BufferedImage =
        when {
            !encodingOptions.isLimitSizeOn -> image
            else -> image.resizeKeepingAspectRatio(encodingOptions.imageSizeLimit)
        }

    /**
     * Decodes [bytes] into a [BufferedImage].
     *
     * @throws IllegalStateException if no registered ImageIO reader can decode [bytes].
     */
    private fun createImageFromBytes(bytes: ByteArray): BufferedImage =
        // ImageIO.read returns null instead of throwing when the stream is not a decodable image.
        checkNotNull(ImageIO.read(ByteArrayInputStream(bytes))) {
            "Decoded bytes are not a readable image"
        }

    /**
     * Compares two images pixel by pixel on their red, green and blue channels.
     *
     * @param allowedDelta Maximum allowed sum of absolute per-channel differences for a single pixel.
     * @return `true` when both images have the same dimensions and no pixel exceeds [allowedDelta].
     */
    private fun isImagesIdentical(img1: BufferedImage, img2: BufferedImage, allowedDelta: Int): Boolean {
        // First check dimensions
        if (img1.width != img2.width || img1.height != img2.height) {
            return false
        }

        // Then check each pixel
        for (y in 0 until img1.height) {
            for (x in 0 until img1.width) {
                val rgb1 = img1.getRGB(x, y)
                val rgb2 = img2.getRGB(x, y)

                val r1 = (rgb1 shr 16) and 0xFF
                val g1 = (rgb1 shr 8) and 0xFF
                val b1 = rgb1 and 0xFF

                val r2 = (rgb2 shr 16) and 0xFF
                val g2 = (rgb2 shr 8) and 0xFF
                val b2 = rgb2 and 0xFF

                val diff = abs(r1 - r2) + abs(g1 - g2) + abs(b1 - b2)

                // If the difference in color components exceed our allowance return false
                if (diff > allowedDelta) {
                    return false
                }
            }
        }

        // If no exceeding difference was found, the images are identical within our allowedDelta
        return true
    }

    companion object {
        private val DEFAULT = Base64ImageEncodingOptions()
        private val GZIP_ON_RESIZE_OFF = Base64ImageEncodingOptions(options = GZIP_ON)
        private val GZIP_OFF_RESIZE_OFF = Base64ImageEncodingOptions(options = ALL_OFF)
        private val GZIP_ON_RESIZE_TO_700 =
            Base64ImageEncodingOptions(imageSizeLimit = 700, options = GZIP_ON or LIMIT_SIZE_ON)

        // Marker for "no custom image encoder registered".
        private val DISABLED = null

        /** Parameter source for the parameterized test; [DISABLED] covers the run without an encoder. */
        @JvmStatic
        fun imageEncodingOptionsToTest(): List<Base64ImageEncodingOptions?> =
            listOf(
                DEFAULT,
                GZIP_ON_RESIZE_OFF,
                GZIP_OFF_RESIZE_OFF,
                GZIP_ON_RESIZE_TO_700,
                DISABLED,
            )
    }
}
File diff suppressed because it is too large Load Diff
Binary file not shown.

After

Width:  |  Height:  |  Size: 67 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

File diff suppressed because one or more lines are too long