init research

This commit is contained in:
2026-02-08 11:20:43 -10:00
commit bdf064f54d
3041 changed files with 1592200 additions and 0 deletions
+8
View File
@@ -0,0 +1,8 @@
## :dataframe-arrow
This module, published as `dataframe-arrow`, contains all the logic and tests that enable DataFrame to work with
Apache Arrow.
See [Read Apache Arrow formats](https://kotlin.github.io/dataframe/read.html#read-apache-arrow-formats) and
[Writing to Apache Arrow formats](https://kotlin.github.io/dataframe/write.html#writing-to-apache-arrow-formats)
for more information about how to use it.
+325
View File
@@ -0,0 +1,325 @@
public final class org/jetbrains/kotlinx/dataframe/io/ArrowFeather : org/jetbrains/kotlinx/dataframe/io/SupportedDataFrameFormat {
public fun <init> ()V
public fun acceptsExtension (Ljava/lang/String;)Z
public fun acceptsSample (Lorg/jetbrains/kotlinx/dataframe/io/SupportedFormatSample;)Z
public fun createDefaultReadMethod (Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/codeGen/DefaultReadDfMethod;
public fun getTestOrder ()I
public fun readDataFrame (Ljava/io/File;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public fun readDataFrame (Ljava/io/InputStream;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public fun readDataFrame (Ljava/nio/file/Path;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
}
public final class org/jetbrains/kotlinx/dataframe/io/ArrowReadingKt {
public static final fun readArrow (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Lorg/apache/arrow/vector/ipc/ArrowReader;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readArrow$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Lorg/apache/arrow/vector/ipc/ArrowReader;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readArrowFeather (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/File;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readArrowFeather (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/InputStream;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readArrowFeather (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readArrowFeather (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/net/URL;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readArrowFeather (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/nio/channels/SeekableByteChannel;Lorg/apache/arrow/memory/RootAllocator;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readArrowFeather (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/nio/file/Path;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readArrowFeather (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[BLorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readArrowFeather$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/File;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readArrowFeather$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/InputStream;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readArrowFeather$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readArrowFeather$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/net/URL;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readArrowFeather$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/nio/channels/SeekableByteChannel;Lorg/apache/arrow/memory/RootAllocator;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readArrowFeather$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/nio/file/Path;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readArrowFeather$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[BLorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readArrowIPC (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/File;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readArrowIPC (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/InputStream;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readArrowIPC (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readArrowIPC (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/net/URL;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readArrowIPC (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/nio/channels/ReadableByteChannel;Lorg/apache/arrow/memory/RootAllocator;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readArrowIPC (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/nio/file/Path;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readArrowIPC (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[BLorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/File;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/InputStream;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/net/URL;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/nio/channels/ReadableByteChannel;Lorg/apache/arrow/memory/RootAllocator;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/nio/file/Path;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[BLorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readParquet (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[Ljava/io/File;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;J)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readParquet (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;J)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readParquet (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[Ljava/net/URL;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;J)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun readParquet (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[Ljava/nio/file/Path;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;J)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readParquet$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[Ljava/io/File;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;JILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readParquet$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;JILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readParquet$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[Ljava/net/URL;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;JILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun readParquet$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[Ljava/nio/file/Path;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;JILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun toDataFrame (Lorg/apache/arrow/vector/ipc/ArrowReader;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun toDataFrame$default (Lorg/apache/arrow/vector/ipc/ArrowReader;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
}
public final class org/jetbrains/kotlinx/dataframe/io/ArrowTypesMatchingKt {
public static final fun toArrowField (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lkotlin/jvm/functions/Function1;)Lorg/apache/arrow/vector/types/pojo/Field;
public static synthetic fun toArrowField$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lkotlin/jvm/functions/Function1;ILjava/lang/Object;)Lorg/apache/arrow/vector/types/pojo/Field;
public static final fun toArrowSchema (Ljava/util/List;Lkotlin/jvm/functions/Function1;)Lorg/apache/arrow/vector/types/pojo/Schema;
public static synthetic fun toArrowSchema$default (Ljava/util/List;Lkotlin/jvm/functions/Function1;ILjava/lang/Object;)Lorg/apache/arrow/vector/types/pojo/Schema;
}
public abstract interface class org/jetbrains/kotlinx/dataframe/io/ArrowWriter : java/lang/AutoCloseable {
public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Companion;
public abstract fun allocateVectorSchemaRoot ()Lorg/apache/arrow/vector/VectorSchemaRoot;
public abstract fun getDataFrame ()Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public abstract fun getMismatchSubscriber ()Lkotlin/jvm/functions/Function1;
public abstract fun getMode ()Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode;
public abstract fun getTargetSchema ()Lorg/apache/arrow/vector/types/pojo/Schema;
public fun saveArrowFeatherToByteArray ()[B
public fun saveArrowIPCToByteArray ()[B
public fun writeArrowFeather (Ljava/io/File;)V
public fun writeArrowFeather (Ljava/io/OutputStream;)V
public fun writeArrowFeather (Ljava/nio/channels/WritableByteChannel;)V
public fun writeArrowFeather (Ljava/nio/file/Path;)V
public fun writeArrowIPC (Ljava/io/File;Z)V
public fun writeArrowIPC (Ljava/io/OutputStream;)V
public fun writeArrowIPC (Ljava/nio/channels/WritableByteChannel;)V
public fun writeArrowIPC (Ljava/nio/file/Path;Z)V
public static synthetic fun writeArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/io/File;ZILjava/lang/Object;)V
public static synthetic fun writeArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/nio/file/Path;ZILjava/lang/Object;)V
}
public final class org/jetbrains/kotlinx/dataframe/io/ArrowWriter$Companion {
public final fun create (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Lorg/apache/arrow/vector/types/pojo/Schema;Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode;Lkotlin/jvm/functions/Function1;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;
public static synthetic fun create$default (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Companion;Lorg/jetbrains/kotlinx/dataframe/DataFrame;Lorg/apache/arrow/vector/types/pojo/Schema;Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode;Lkotlin/jvm/functions/Function1;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;
}
public final class org/jetbrains/kotlinx/dataframe/io/ArrowWriter$DefaultImpls {
public static fun saveArrowFeatherToByteArray (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;)[B
public static fun saveArrowIPCToByteArray (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;)[B
public static fun writeArrowFeather (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/io/File;)V
public static fun writeArrowFeather (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/io/OutputStream;)V
public static fun writeArrowFeather (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/nio/channels/WritableByteChannel;)V
public static fun writeArrowFeather (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/nio/file/Path;)V
public static fun writeArrowIPC (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/io/File;Z)V
public static fun writeArrowIPC (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/io/OutputStream;)V
public static fun writeArrowIPC (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/nio/channels/WritableByteChannel;)V
public static fun writeArrowIPC (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/nio/file/Path;Z)V
public static synthetic fun writeArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/io/File;ZILjava/lang/Object;)V
public static synthetic fun writeArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/nio/file/Path;ZILjava/lang/Object;)V
}
public final class org/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode {
public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode$Companion;
public fun <init> (ZZZZ)V
public final fun component1 ()Z
public final fun component2 ()Z
public final fun component3 ()Z
public final fun component4 ()Z
public final fun copy (ZZZZ)Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode;
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode;ZZZZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode;
public fun equals (Ljava/lang/Object;)Z
public final fun getRestrictNarrowing ()Z
public final fun getRestrictWidening ()Z
public final fun getStrictNullable ()Z
public final fun getStrictType ()Z
public fun hashCode ()I
public fun toString ()Ljava/lang/String;
}
public final class org/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode$Companion {
public final fun getLOYAL ()Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode;
public final fun getSTRICT ()Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode;
}
public final class org/jetbrains/kotlinx/dataframe/io/ArrowWriterKt {
public static final fun getIgnoreMismatchMessage ()Lkotlin/jvm/functions/Function1;
public static final fun getLogMismatchMessage ()Lkotlin/jvm/functions/Function1;
public static final fun getWriteMismatchMessage ()Lkotlin/jvm/functions/Function1;
}
public final class org/jetbrains/kotlinx/dataframe/io/ArrowWritingKt {
public static final fun arrowWriter (Lorg/jetbrains/kotlinx/dataframe/DataFrame;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;
public static final fun arrowWriter (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Lorg/apache/arrow/vector/types/pojo/Schema;Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode;Lkotlin/jvm/functions/Function1;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;
public static synthetic fun arrowWriter$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Lorg/apache/arrow/vector/types/pojo/Schema;Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode;Lkotlin/jvm/functions/Function1;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;
public static final fun saveArrowFeatherToByteArray (Lorg/jetbrains/kotlinx/dataframe/DataFrame;)[B
public static final fun saveArrowIPCToByteArray (Lorg/jetbrains/kotlinx/dataframe/DataFrame;)[B
public static final fun writeArrowFeather (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/io/File;)V
public static final fun writeArrowFeather (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/io/OutputStream;)V
public static final fun writeArrowFeather (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/nio/channels/WritableByteChannel;)V
public static final fun writeArrowFeather (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/nio/file/Path;)V
public static final fun writeArrowIPC (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/io/File;Z)V
public static final fun writeArrowIPC (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/io/OutputStream;)V
public static final fun writeArrowIPC (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/nio/channels/WritableByteChannel;)V
public static final fun writeArrowIPC (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/nio/file/Path;Z)V
public static synthetic fun writeArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/io/File;ZILjava/lang/Object;)V
public static synthetic fun writeArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/nio/file/Path;ZILjava/lang/Object;)V
}
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingException : java/lang/IllegalArgumentException {
public fun <init> (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch;)V
public final fun getMismatchCase ()Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch;
}
public abstract class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch {
public synthetic fun <init> (Ljava/lang/String;Ljava/lang/Integer;Ljava/lang/Exception;Lkotlin/jvm/internal/DefaultConstructorMarker;)V
public fun getCause ()Ljava/lang/Exception;
public fun getColumn ()Ljava/lang/String;
public fun getRow ()Ljava/lang/Integer;
}
public abstract class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NarrowingMismatch : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch {
public synthetic fun <init> (Ljava/lang/String;Lkotlin/jvm/internal/DefaultConstructorMarker;)V
}
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NarrowingMismatch$NotPresentedColumnError : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NarrowingMismatch {
public fun <init> (Ljava/lang/String;)V
public final fun component1 ()Ljava/lang/String;
public final fun copy (Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NarrowingMismatch$NotPresentedColumnError;
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NarrowingMismatch$NotPresentedColumnError;Ljava/lang/String;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NarrowingMismatch$NotPresentedColumnError;
public fun equals (Ljava/lang/Object;)Z
public fun getColumn ()Ljava/lang/String;
public fun hashCode ()I
public fun toString ()Ljava/lang/String;
}
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NarrowingMismatch$NotPresentedColumnIgnored : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NarrowingMismatch {
public fun <init> (Ljava/lang/String;)V
public final fun component1 ()Ljava/lang/String;
public final fun copy (Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NarrowingMismatch$NotPresentedColumnIgnored;
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NarrowingMismatch$NotPresentedColumnIgnored;Ljava/lang/String;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NarrowingMismatch$NotPresentedColumnIgnored;
public fun equals (Ljava/lang/Object;)Z
public fun getColumn ()Ljava/lang/String;
public fun hashCode ()I
public fun toString ()Ljava/lang/String;
}
public abstract class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NullableMismatch : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch {
public synthetic fun <init> (Ljava/lang/String;Ljava/lang/Integer;Lkotlin/jvm/internal/DefaultConstructorMarker;)V
}
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NullableMismatch$NullValueError : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NullableMismatch {
public fun <init> (Ljava/lang/String;Ljava/lang/Integer;)V
public final fun component1 ()Ljava/lang/String;
public final fun component2 ()Ljava/lang/Integer;
public final fun copy (Ljava/lang/String;Ljava/lang/Integer;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NullableMismatch$NullValueError;
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NullableMismatch$NullValueError;Ljava/lang/String;Ljava/lang/Integer;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NullableMismatch$NullValueError;
public fun equals (Ljava/lang/Object;)Z
public fun getColumn ()Ljava/lang/String;
public fun getRow ()Ljava/lang/Integer;
public fun hashCode ()I
public fun toString ()Ljava/lang/String;
}
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NullableMismatch$NullValueIgnored : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NullableMismatch {
public fun <init> (Ljava/lang/String;Ljava/lang/Integer;)V
public final fun component1 ()Ljava/lang/String;
public final fun component2 ()Ljava/lang/Integer;
public final fun copy (Ljava/lang/String;Ljava/lang/Integer;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NullableMismatch$NullValueIgnored;
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NullableMismatch$NullValueIgnored;Ljava/lang/String;Ljava/lang/Integer;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NullableMismatch$NullValueIgnored;
public fun equals (Ljava/lang/Object;)Z
public fun getColumn ()Ljava/lang/String;
public fun getRow ()Ljava/lang/Integer;
public fun hashCode ()I
public fun toString ()Ljava/lang/String;
}
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$SavedAsString : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch {
public fun <init> (Ljava/lang/String;Ljava/lang/Class;)V
public final fun component1 ()Ljava/lang/String;
public final fun component2 ()Ljava/lang/Class;
public final fun copy (Ljava/lang/String;Ljava/lang/Class;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$SavedAsString;
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$SavedAsString;Ljava/lang/String;Ljava/lang/Class;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$SavedAsString;
public fun equals (Ljava/lang/Object;)Z
public fun getColumn ()Ljava/lang/String;
public final fun getType ()Ljava/lang/Class;
public fun hashCode ()I
public fun toString ()Ljava/lang/String;
}
public abstract class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionFail : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch {
public synthetic fun <init> (Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;Lkotlin/jvm/internal/DefaultConstructorMarker;)V
public synthetic fun getCause ()Ljava/lang/Exception;
public fun getCause ()Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;
}
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionFail$ConversionFailError : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionFail {
public fun <init> (Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;)V
public final fun component1 ()Ljava/lang/String;
public final fun component2 ()Ljava/lang/Integer;
public final fun component3 ()Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;
public final fun copy (Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionFail$ConversionFailError;
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionFail$ConversionFailError;Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionFail$ConversionFailError;
public fun equals (Ljava/lang/Object;)Z
public synthetic fun getCause ()Ljava/lang/Exception;
public fun getCause ()Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;
public fun getColumn ()Ljava/lang/String;
public fun getRow ()Ljava/lang/Integer;
public fun hashCode ()I
public fun toString ()Ljava/lang/String;
}
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionFail$ConversionFailIgnored : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionFail {
public fun <init> (Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;)V
public final fun component1 ()Ljava/lang/String;
public final fun component2 ()Ljava/lang/Integer;
public final fun component3 ()Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;
public final fun copy (Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionFail$ConversionFailIgnored;
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionFail$ConversionFailIgnored;Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionFail$ConversionFailIgnored;
public fun equals (Ljava/lang/Object;)Z
public synthetic fun getCause ()Ljava/lang/Exception;
public fun getCause ()Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;
public fun getColumn ()Ljava/lang/String;
public fun getRow ()Ljava/lang/Integer;
public fun hashCode ()I
public fun toString ()Ljava/lang/String;
}
public abstract class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionNotFound : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch {
public synthetic fun <init> (Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/exceptions/TypeConverterNotFoundException;Lkotlin/jvm/internal/DefaultConstructorMarker;)V
}
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionNotFound$ConversionNotFoundError : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionNotFound {
public fun <init> (Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/exceptions/TypeConverterNotFoundException;)V
public final fun component1 ()Ljava/lang/String;
public final fun component2 ()Lorg/jetbrains/kotlinx/dataframe/exceptions/TypeConverterNotFoundException;
public final fun copy (Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/exceptions/TypeConverterNotFoundException;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionNotFound$ConversionNotFoundError;
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionNotFound$ConversionNotFoundError;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/exceptions/TypeConverterNotFoundException;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionNotFound$ConversionNotFoundError;
public fun equals (Ljava/lang/Object;)Z
public fun getColumn ()Ljava/lang/String;
public final fun getE ()Lorg/jetbrains/kotlinx/dataframe/exceptions/TypeConverterNotFoundException;
public fun hashCode ()I
public fun toString ()Ljava/lang/String;
}
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionNotFound$ConversionNotFoundIgnored : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionNotFound {
public fun <init> (Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/exceptions/TypeConverterNotFoundException;)V
public final fun component1 ()Ljava/lang/String;
public final fun component2 ()Lorg/jetbrains/kotlinx/dataframe/exceptions/TypeConverterNotFoundException;
public final fun copy (Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/exceptions/TypeConverterNotFoundException;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionNotFound$ConversionNotFoundIgnored;
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionNotFound$ConversionNotFoundIgnored;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/exceptions/TypeConverterNotFoundException;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionNotFound$ConversionNotFoundIgnored;
public fun equals (Ljava/lang/Object;)Z
public synthetic fun getCause ()Ljava/lang/Exception;
public fun getCause ()Lorg/jetbrains/kotlinx/dataframe/exceptions/TypeConverterNotFoundException;
public fun getColumn ()Ljava/lang/String;
public fun hashCode ()I
public fun toString ()Ljava/lang/String;
}
public abstract class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$WideningMismatch : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch {
public synthetic fun <init> (Ljava/lang/String;Lkotlin/jvm/internal/DefaultConstructorMarker;)V
}
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$WideningMismatch$AddedColumn : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$WideningMismatch {
public fun <init> (Ljava/lang/String;)V
public final fun component1 ()Ljava/lang/String;
public final fun copy (Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$WideningMismatch$AddedColumn;
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$WideningMismatch$AddedColumn;Ljava/lang/String;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$WideningMismatch$AddedColumn;
public fun equals (Ljava/lang/Object;)Z
public fun getColumn ()Ljava/lang/String;
public fun hashCode ()I
public fun toString ()Ljava/lang/String;
}
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$WideningMismatch$RejectedColumn : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$WideningMismatch {
public fun <init> (Ljava/lang/String;)V
public final fun component1 ()Ljava/lang/String;
public final fun copy (Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$WideningMismatch$RejectedColumn;
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$WideningMismatch$RejectedColumn;Ljava/lang/String;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$WideningMismatch$RejectedColumn;
public fun equals (Ljava/lang/Object;)Z
public fun getColumn ()Ljava/lang/String;
public fun hashCode ()I
public fun toString ()Ljava/lang/String;
}
+44
View File
@@ -0,0 +1,44 @@
plugins {
    // Convention plugins shared across DataFrame modules (Kotlin/JVM, JVM 8 target).
    with(convention.plugins) {
        alias(kotlinJvm8)
    }
    with(libs.plugins) {
        alias(publisher)
        // Guards this module's public API surface (compared against the checked-in .api dump).
        alias(binary.compatibility.validator)
    }
}

group = "org.jetbrains.kotlinx"

dependencies {
    api(projects.core)
    // Apache Arrow: in-memory vectors, IPC wire format, memory management, and Dataset (Parquet) scanning.
    implementation(libs.arrow.vector)
    implementation(libs.arrow.format)
    implementation(libs.arrow.memory)
    implementation(libs.arrow.dataset)
    // Provides SeekableInMemoryByteChannel used by the byte-array readers.
    implementation(libs.commonsCompress)
    implementation(libs.kotlin.reflect)
    implementation(libs.kotlin.datetimeJvm)

    testImplementation(libs.junit)
    testImplementation(projects.dataframeJson)
    testImplementation(libs.kotestAssertions) {
        exclude("org.jetbrains.kotlin", "kotlin-stdlib-jdk8")
    }
    testImplementation(libs.arrow.c.data)
    testImplementation(libs.duckdb.jdbc)
}

kotlinPublications {
    publication {
        publicationName = "dataframeArrow"
        artifactId = project.name
        description = "Apache Arrow support for Kotlin DataFrame"
        packageName = artifactId
    }
}

tasks.test {
    // Opens java.nio internals to reflective access — required by Arrow's memory module
    // on JDK 16+ (see the Arrow Java installation docs).
    jvmArgs = listOf("--add-opens", "java.base/java.nio=ALL-UNNAMED")
}
@@ -0,0 +1,174 @@
package org.jetbrains.kotlinx.dataframe.io
import org.apache.arrow.vector.VectorSchemaRoot
import org.apache.arrow.vector.ipc.ArrowFileWriter
import org.apache.arrow.vector.ipc.ArrowStreamWriter
import org.apache.arrow.vector.types.pojo.Schema
import org.jetbrains.kotlinx.dataframe.AnyFrame
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.slf4j.LoggerFactory
import java.io.ByteArrayOutputStream
import java.io.File
import java.io.OutputStream
import java.nio.channels.Channels
import java.nio.channels.WritableByteChannel
import java.nio.file.Path
import java.nio.file.StandardOpenOption
import kotlin.io.path.outputStream
/** Mismatch handler that silently discards every [ConvertingMismatch]. */
public val ignoreMismatchMessage: (ConvertingMismatch) -> Unit = { _ -> }

/** Mismatch handler that prints every [ConvertingMismatch] to standard error. */
public val writeMismatchMessage: (ConvertingMismatch) -> Unit = { mismatch ->
    System.err.println(mismatch)
}

private val logger = LoggerFactory.getLogger(ArrowWriter::class.java)

/** Mismatch handler that logs every [ConvertingMismatch] at DEBUG level. */
public val logMismatchMessage: (ConvertingMismatch) -> Unit = { mismatch ->
    logger.debug(mismatch.toString())
}
/**
 * Writes [dataFrame] in the Apache Arrow format (to a File, ByteArray, OutputStream or raw Channel)
 * using [targetSchema].
 * Whenever [dataFrame] does not match [targetSchema], behaviour is controlled by [mode], and every
 * mismatch is reported to [mismatchSubscriber].
 */
public interface ArrowWriter : AutoCloseable {
    public val dataFrame: DataFrame<*>
    public val targetSchema: Schema
    public val mode: Mode
    public val mismatchSubscriber: (ConvertingMismatch) -> Unit

    public companion object {
        /** Creates the default [ArrowWriter] implementation. */
        public fun create(
            dataFrame: AnyFrame,
            targetSchema: Schema,
            mode: Mode,
            mismatchSubscriber: (ConvertingMismatch) -> Unit = ignoreMismatchMessage,
        ): ArrowWriter = ArrowWriterImpl(dataFrame, targetSchema, mode, mismatchSubscriber)
    }

    /**
     * Settings applied when [dataFrame] content does not match [targetSchema].
     *
     * @param restrictWidening when true, [dataFrame] columns not described in [targetSchema] are not saved;
     *   otherwise they are saved as is.
     * @param restrictNarrowing when true, non-nullable [targetSchema] fields that are missing from
     *   [dataFrame] produce an exception; otherwise such fields are simply not saved.
     * @param strictType when true, [dataFrame] columns whose type is incompatible with the one declared in
     *   [targetSchema] produce an exception; otherwise they are saved with their actual type.
     * @param strictNullable when true, non-nullable [targetSchema] fields whose [dataFrame] column contains
     *   nulls produce an exception; otherwise the field is saved with nullable = true.
     */
    public data class Mode(
        public val restrictWidening: Boolean,
        public val restrictNarrowing: Boolean,
        public val strictType: Boolean,
        public val strictNullable: Boolean,
    ) {
        public companion object {
            /** Fail on every kind of mismatch. */
            public val STRICT: Mode = Mode(true, true, true, true)

            /** Save whatever is possible; never fail on a mismatch. */
            public val LOYAL: Mode = Mode(false, false, false, false)
        }
    }

    /**
     * Creates an Arrow [VectorSchemaRoot] filled with [dataFrame] content cast to [targetSchema]
     * according to [mode].
     */
    public fun allocateVectorSchemaRoot(): VectorSchemaRoot

    // ---- IPC (streaming format) saving ----

    /**
     * Writes the data in the
     * [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format)
     * to the already opened [channel].
     */
    public fun writeArrowIPC(channel: WritableByteChannel) {
        allocateVectorSchemaRoot().use { root ->
            ArrowStreamWriter(root, null, channel).use { streamWriter ->
                streamWriter.writeBatch()
            }
        }
    }

    /**
     * Writes the data in the
     * [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format)
     * to the already opened [stream].
     */
    public fun writeArrowIPC(stream: OutputStream) {
        writeArrowIPC(Channels.newChannel(stream))
    }

    /**
     * Writes the data in the
     * [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format)
     * to a new or existing [file].
     * When the file already exists it is expanded if [append] is true and recreated otherwise.
     */
    public fun writeArrowIPC(file: File, append: Boolean = true) {
        writeArrowIPC(file.toPath(), append)
    }

    /**
     * Writes the data in the
     * [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format)
     * to a new or existing file at [path].
     * When the file already exists it is expanded if [append] is true and recreated otherwise.
     */
    public fun writeArrowIPC(path: Path, append: Boolean = true) {
        val openOptions = if (append) {
            arrayOf(StandardOpenOption.CREATE, StandardOpenOption.APPEND)
        } else {
            arrayOf(StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)
        }
        path.outputStream(*openOptions).use { stream -> writeArrowIPC(stream) }
    }

    /**
     * Writes the data in the
     * [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format)
     * to a fresh [ByteArray].
     */
    public fun saveArrowIPCToByteArray(): ByteArray =
        ByteArrayOutputStream().also { writeArrowIPC(it) }.toByteArray()

    // ---- Feather (random access format) saving ----

    /**
     * Writes the data in the
     * [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files)
     * to the already opened [channel].
     */
    public fun writeArrowFeather(channel: WritableByteChannel) {
        allocateVectorSchemaRoot().use { root ->
            ArrowFileWriter(root, null, channel).use { fileWriter ->
                fileWriter.writeBatch()
            }
        }
    }

    /**
     * Writes the data in the
     * [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files)
     * to the already opened [stream].
     */
    public fun writeArrowFeather(stream: OutputStream) {
        writeArrowFeather(Channels.newChannel(stream))
    }

    /**
     * Writes the data in the
     * [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files)
     * to a new or existing [file]; an existing file is recreated.
     */
    public fun writeArrowFeather(file: File) {
        writeArrowFeather(file.toPath())
    }

    /**
     * Writes the data in the
     * [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files)
     * to a new or existing file at [path]; an existing file is recreated.
     */
    public fun writeArrowFeather(path: Path) {
        path.outputStream(StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)
            .use { stream -> writeArrowFeather(stream) }
    }

    /**
     * Writes the data in the
     * [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files)
     * to a fresh [ByteArray].
     */
    public fun saveArrowFeatherToByteArray(): ByteArray =
        ByteArrayOutputStream().also { writeArrowFeather(it) }.toByteArray()
}
@@ -0,0 +1,447 @@
package org.jetbrains.kotlinx.dataframe.io
import kotlinx.datetime.TimeZone
import kotlinx.datetime.toInstant
import org.apache.arrow.memory.RootAllocator
import org.apache.arrow.vector.BaseFixedWidthVector
import org.apache.arrow.vector.BaseVariableWidthVector
import org.apache.arrow.vector.BigIntVector
import org.apache.arrow.vector.BitVector
import org.apache.arrow.vector.DateDayVector
import org.apache.arrow.vector.DateMilliVector
import org.apache.arrow.vector.Decimal256Vector
import org.apache.arrow.vector.DecimalVector
import org.apache.arrow.vector.FieldVector
import org.apache.arrow.vector.FixedWidthVector
import org.apache.arrow.vector.Float4Vector
import org.apache.arrow.vector.Float8Vector
import org.apache.arrow.vector.IntVector
import org.apache.arrow.vector.LargeVarCharVector
import org.apache.arrow.vector.SmallIntVector
import org.apache.arrow.vector.TimeMicroVector
import org.apache.arrow.vector.TimeMilliVector
import org.apache.arrow.vector.TimeNanoVector
import org.apache.arrow.vector.TimeSecVector
import org.apache.arrow.vector.TinyIntVector
import org.apache.arrow.vector.VarCharVector
import org.apache.arrow.vector.VariableWidthVector
import org.apache.arrow.vector.VectorSchemaRoot
import org.apache.arrow.vector.complex.StructVector
import org.apache.arrow.vector.types.DateUnit
import org.apache.arrow.vector.types.FloatingPointPrecision
import org.apache.arrow.vector.types.pojo.ArrowType
import org.apache.arrow.vector.types.pojo.Field
import org.apache.arrow.vector.types.pojo.FieldType
import org.apache.arrow.vector.types.pojo.Schema
import org.apache.arrow.vector.util.Text
import org.jetbrains.kotlinx.dataframe.AnyCol
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.convertToBigDecimal
import org.jetbrains.kotlinx.dataframe.api.convertToBoolean
import org.jetbrains.kotlinx.dataframe.api.convertToByte
import org.jetbrains.kotlinx.dataframe.api.convertToDouble
import org.jetbrains.kotlinx.dataframe.api.convertToFloat
import org.jetbrains.kotlinx.dataframe.api.convertToInt
import org.jetbrains.kotlinx.dataframe.api.convertToLocalDate
import org.jetbrains.kotlinx.dataframe.api.convertToLocalDateTime
import org.jetbrains.kotlinx.dataframe.api.convertToLocalTime
import org.jetbrains.kotlinx.dataframe.api.convertToLong
import org.jetbrains.kotlinx.dataframe.api.convertToShort
import org.jetbrains.kotlinx.dataframe.api.convertToString
import org.jetbrains.kotlinx.dataframe.api.forEachIndexed
import org.jetbrains.kotlinx.dataframe.api.map
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
import org.jetbrains.kotlinx.dataframe.exceptions.CellConversionException
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConverterNotFoundException
import org.jetbrains.kotlinx.dataframe.indices
import org.jetbrains.kotlinx.dataframe.name
import org.jetbrains.kotlinx.dataframe.values
import kotlin.reflect.full.isSubtypeOf
import kotlin.reflect.typeOf
/**
 * Save [dataFrame] content in Apache Arrow format (can be written to File, ByteArray, OutputStream or raw Channel) with [targetSchema].
 * If [dataFrame] content does not match with [targetSchema], behaviour is specified by [mode], mismatches would be sent to [mismatchSubscriber]
 */
internal class ArrowWriterImpl(
    override val dataFrame: DataFrame<*>,
    override val targetSchema: Schema,
    override val mode: ArrowWriter.Mode,
    override val mismatchSubscriber: (ConvertingMismatch) -> Unit = ignoreMismatchMessage,
) : ArrowWriter {

    // Owns all Arrow buffers created by this writer; released in [close].
    private val allocator = RootAllocator()

    /**
     * Allocates backing buffers for [vector] to hold [size] values.
     * [totalBytes] is only used by variable-width vectors (e.g. strings); fixed-width
     * vectors derive their buffer size from [size] alone.
     */
    private fun allocateVector(vector: FieldVector, size: Int, totalBytes: Long? = null) {
        when (vector) {
            is FixedWidthVector -> vector.allocateNew(size)
            is VariableWidthVector -> totalBytes?.let { vector.allocateNew(it, size) } ?: vector.allocateNew(size)
            is StructVector -> {
                // A struct delegates storage to its children; allocate each child recursively.
                vector.childrenFromFields.forEach { child ->
                    allocateVector(child, size)
                }
            }
            else -> throw IllegalArgumentException("Can not allocate ${vector.javaClass.canonicalName}")
        }
    }

    /**
     * Calculate buffer size for VariableWidthVector (return null for FixedWidthVector)
     */
    private fun countTotalBytes(column: AnyCol): Long? {
        val columnType = column.type()
        return when {
            columnType.isSubtypeOf(typeOf<String?>()) ->
                column.values.fold(0L) { totalBytes, value ->
                    // 4 bytes per char is the UTF-8 worst case, so this deliberately over-allocates.
                    // NOTE(review): null values are counted as the 4-char string "null" — a harmless over-estimate.
                    totalBytes + value.toString().length * 4
                }
            else -> null
        }
    }

    // Marks every slot of [vector] as null and sets its final value count to [size].
    private fun infillWithNulls(vector: FieldVector, size: Int) {
        when (vector) {
            is BaseFixedWidthVector -> for (i in 0 until size) {
                vector.setNull(i)
            }
            is BaseVariableWidthVector -> for (i in 0 until size) {
                vector.setNull(i)
            }
            else -> throw IllegalArgumentException("Can not infill ${vector.javaClass.canonicalName}")
        }
        vector.valueCount = size
    }

    /**
     * Converts [column] to the Kotlin type corresponding to [targetFieldType]
     * (no-op when the column already has that type); returns null for a null column.
     * Throws [NotImplementedError] for Arrow types that are not supported yet.
     */
    private fun convertColumnToTarget(column: AnyCol?, targetFieldType: ArrowType): AnyCol? {
        if (column == null) return null
        return when (targetFieldType) {
            ArrowType.Utf8() -> column.map { it?.toString() }
            ArrowType.LargeUtf8() -> column.map { it?.toString() }
            ArrowType.Bool() -> column.convertToBoolean()
            ArrowType.Int(8, true) -> column.convertToByte()
            ArrowType.Int(16, true) -> column.convertToShort()
            ArrowType.Int(32, true) -> column.convertToInt()
            ArrowType.Int(64, true) -> column.convertToLong()
            is ArrowType.Decimal -> column.convertToBigDecimal()
            // Use [convertToDouble] as locale logic step
            ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE) ->
                column.convertToDouble().convertToFloat()
            ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE) -> column.convertToDouble()
            ArrowType.Date(DateUnit.DAY) -> column.convertToLocalDate()
            ArrowType.Date(DateUnit.MILLISECOND) -> column.convertToLocalDateTime()
            is ArrowType.Time -> column.convertToLocalTime()
            is ArrowType.Struct -> column
            else ->
                throw NotImplementedError(
                    "Saving ${targetFieldType.javaClass.canonicalName} is currently not implemented",
                )
        }
    }

    /**
     * Fallback used when conversion to the target field failed: derives an Arrow field
     * from the column itself and returns the column (converted to that field's type when
     * possible) together with that field.
     */
    private fun convertColumnToCompatible(column: AnyCol): Pair<AnyCol, Field> {
        val actualField = column.toArrowField(mismatchSubscriber)
        val result = try {
            convertColumnToTarget(column, actualField.type)!!
        } catch (e: Exception) {
            // NOTE(review): any conversion failure is swallowed here and the column is saved as is —
            // both call sites have already reported the mismatch to [mismatchSubscriber].
            column
        }
        return result to actualField
    }

    /**
     * Copies [column] values into [vector], converting the column to the element type the
     * vector expects; null values become null slots. Sets the vector's final value count.
     */
    private fun infillVector(vector: FieldVector, column: AnyCol) {
        when (vector) {
            is VarCharVector ->
                column.convertToString()
                    .forEachIndexed { i, value ->
                        value?.also { vector.set(i, Text(value)) }
                            ?: vector.setNull(i)
                    }
            is LargeVarCharVector ->
                column.convertToString()
                    .forEachIndexed { i, value ->
                        value?.also { vector.set(i, Text(value)) }
                            ?: vector.setNull(i)
                    }
            is BitVector ->
                column.convertToBoolean()
                    .forEachIndexed { i, value ->
                        // compareTo(false) maps true -> 1, false -> 0 as the bit value
                        value?.also { vector.set(i, value.compareTo(false)) }
                            ?: vector.setNull(i)
                    }
            is TinyIntVector ->
                column.convertToInt()
                    .forEachIndexed { i, value ->
                        value?.also { vector.set(i, value) }
                            ?: vector.setNull(i)
                    }
            is SmallIntVector ->
                column.convertToInt()
                    .forEachIndexed { i, value ->
                        value?.also { vector.set(i, value) }
                            ?: vector.setNull(i)
                    }
            is IntVector ->
                column.convertToInt()
                    .forEachIndexed { i, value ->
                        value?.also { vector.set(i, value) }
                            ?: vector.setNull(i)
                    }
            is BigIntVector ->
                column.convertToLong()
                    .forEachIndexed { i, value ->
                        value?.also { vector.set(i, value) }
                            ?: vector.setNull(i)
                    }
            is DecimalVector ->
                column.convertToBigDecimal()
                    .forEachIndexed { i, value ->
                        value?.also { vector.set(i, value) }
                            ?: vector.setNull(i)
                    }
            is Decimal256Vector ->
                column.convertToBigDecimal()
                    .forEachIndexed { i, value ->
                        value?.also { vector.set(i, value) }
                            ?: vector.setNull(i)
                    }
            is Float8Vector ->
                column.convertToDouble()
                    .forEachIndexed { i, value ->
                        value?.also { vector.set(i, value) }
                            ?: vector.setNull(i)
                    }
            is Float4Vector ->
                column.convertToFloat()
                    .forEachIndexed { i, value ->
                        value?.also { vector.set(i, value) }
                            ?: vector.setNull(i)
                    }
            is DateDayVector ->
                column.convertToLocalDate()
                    .forEachIndexed { i, value ->
                        value?.also { vector.set(i, value.toEpochDays().toInt()) }
                            ?: vector.setNull(i)
                    }
            is DateMilliVector ->
                column.convertToLocalDateTime()
                    .forEachIndexed { i, value ->
                        // Local date-times are interpreted as UTC when converted to epoch millis.
                        value?.also { vector.set(i, value.toInstant(TimeZone.UTC).toEpochMilliseconds()) }
                            ?: vector.setNull(i)
                    }
            is TimeNanoVector ->
                column.convertToLocalTime()
                    .forEachIndexed { i, value ->
                        value?.also { vector.set(i, value.toNanosecondOfDay()) }
                            ?: vector.setNull(i)
                    }
            is TimeMicroVector ->
                column.convertToLocalTime()
                    .forEachIndexed { i, value ->
                        value?.also { vector.set(i, value.toNanosecondOfDay() / 1000) }
                            ?: vector.setNull(i)
                    }
            is TimeMilliVector ->
                column.convertToLocalTime()
                    .forEachIndexed { i, value ->
                        value?.also { vector.set(i, (value.toNanosecondOfDay() / 1000 / 1000).toInt()) }
                            ?: vector.setNull(i)
                    }
            is TimeSecVector ->
                column.convertToLocalTime()
                    .forEachIndexed { i, value ->
                        value?.also {
                            vector.set(i, (value.toNanosecondOfDay() / 1000 / 1000 / 1000).toInt())
                        } ?: vector.setNull(i)
                    }
            is StructVector -> {
                require(column is ColumnGroup<*>) {
                    "StructVector expects ColumnGroup, but got ${column::class.simpleName}"
                }
                // Fill each child vector from the matching nested column...
                column.columns().forEach { childColumn ->
                    infillVector(vector.getChild(childColumn.name()), childColumn)
                }
                // ...then mark every row of the struct itself as defined (non-null).
                column.indices.forEach { i -> vector.setIndexDefined(i) }
            }
            else -> {
                // TODO implement other vector types from [readField] (VarBinaryVector, UIntVector, DurationVector, StructVector) and may be others (ListVector, FixedSizeListVector etc)
                throw NotImplementedError("Saving to ${vector.javaClass.canonicalName} is currently not implemented")
            }
        }
        vector.valueCount = dataFrame.rowsCount()
    }

    /**
     * Create Arrow FieldVector with [column] content cast to [field] type according to [strictType] and [strictNullable] settings.
     */
    private fun allocateVectorAndInfill(
        field: Field,
        column: AnyCol?,
        strictType: Boolean,
        strictNullable: Boolean,
    ): FieldVector {
        val containNulls = (column == null || column.hasNulls())
        // Convert the column to type specified in field. (If we already have target type, convertTo will do nothing)
        val (convertedColumn, actualField) = try {
            convertColumnToTarget(column, field.type) to field
        } catch (e: CellConversionException) {
            if (strictType) {
                // If conversion failed but strictType is enabled, throw the exception
                val mismatch =
                    ConvertingMismatch.TypeConversionFail.ConversionFailError(e.column?.name() ?: "", e.row, e)
                mismatchSubscriber(mismatch)
                throw ConvertingException(mismatch)
            } else {
                // If strictType is not enabled, use original data with its type. Target nullable is saved at this step.
                mismatchSubscriber(
                    ConvertingMismatch.TypeConversionFail.ConversionFailIgnored(
                        column = e.column?.name() ?: "",
                        row = e.row,
                        cause = e,
                    ),
                )
                convertColumnToCompatible(column!!)
            }
        } catch (e: TypeConverterNotFoundException) {
            if (strictType) {
                // If conversion failed but strictType is enabled, throw the exception
                val mismatch = ConvertingMismatch.TypeConversionNotFound.ConversionNotFoundError(field.name, e)
                mismatchSubscriber(mismatch)
                throw ConvertingException(mismatch)
            } else {
                // If strictType is not enabled, use original data with its type. Target nullable is saved at this step.
                mismatchSubscriber(ConvertingMismatch.TypeConversionNotFound.ConversionNotFoundIgnored(field.name, e))
                convertColumnToCompatible(column!!)
            }
        }
        val vector = if (!actualField.isNullable && containNulls) {
            // Locate the first offending row for the mismatch report.
            var firstNullValue: Int? = null
            // `?: -1` makes the range empty when column == null (the null itself is the mismatch).
            for (i in 0 until (column?.size() ?: -1)) {
                if (column!![i] == null) {
                    firstNullValue = i
                    break
                }
            }
            if (strictNullable) {
                val mismatch = ConvertingMismatch.NullableMismatch.NullValueError(actualField.name, firstNullValue)
                mismatchSubscriber(mismatch)
                throw ConvertingException(mismatch)
            } else {
                mismatchSubscriber(
                    ConvertingMismatch.NullableMismatch.NullValueIgnored(
                        actualField.name,
                        firstNullValue,
                    ),
                )
                // Rebuild the field with nullable = true so the nulls can actually be stored.
                Field(
                    actualField.name,
                    FieldType(true, actualField.fieldType.type, actualField.fieldType.dictionary),
                    actualField.children,
                ).createVector(allocator)!!
            }
        } else {
            actualField.createVector(allocator)!!
        }
        if (convertedColumn == null) {
            // No data for this field: it must be a nullable field, fill it entirely with nulls.
            check(actualField.isNullable)
            allocateVector(vector, dataFrame.rowsCount())
            infillWithNulls(vector, dataFrame.rowsCount())
        } else {
            allocateVector(vector, dataFrame.rowsCount(), countTotalBytes(convertedColumn))
            infillVector(vector, convertedColumn)
        }
        return vector
    }

    // Builds vectors for the widened (extra) columns; each uses a field inferred from the
    // column itself via toArrowField (presumably always compatible with the column — the
    // strict flags here guard that assumption).
    private fun List<AnyCol>.toVectors(): List<FieldVector> =
        this.map {
            val field = it.toArrowField(mismatchSubscriber)
            allocateVectorAndInfill(field = field, column = it, strictType = true, strictNullable = true)
        }

    // Builds the complete VectorSchemaRoot: first every target-schema field, then — unless
    // widening is restricted — the DataFrame columns the schema does not mention.
    override fun allocateVectorSchemaRoot(): VectorSchemaRoot {
        val mainVectors = LinkedHashMap<String, FieldVector>()
        try {
            for (field in targetSchema.fields) {
                val column = dataFrame.getColumnOrNull(field.name)
                if (column == null && !field.isNullable) {
                    // Non-nullable field with no matching column: fail or skip per the mode.
                    if (mode.restrictNarrowing) {
                        val mismatch = ConvertingMismatch.NarrowingMismatch.NotPresentedColumnError(field.name)
                        mismatchSubscriber(mismatch)
                        throw ConvertingException(mismatch)
                    } else {
                        mismatchSubscriber(ConvertingMismatch.NarrowingMismatch.NotPresentedColumnIgnored(field.name))
                        continue
                    }
                }
                val vector = allocateVectorAndInfill(field, column, mode.strictType, mode.strictNullable)
                mainVectors[field.name] = vector
            }
        } catch (e: Exception) {
            mainVectors.values.forEach { it.close() } // Clear buffers before throwing exception
            throw e
        }
        val vectors = ArrayList<FieldVector>()
        vectors.addAll(mainVectors.values)
        val otherColumns = dataFrame.columns().filter { column -> !mainVectors.containsKey(column.name()) }
        if (!mode.restrictWidening) {
            vectors.addAll(otherColumns.toVectors())
            otherColumns.forEach {
                mismatchSubscriber(ConvertingMismatch.WideningMismatch.AddedColumn(it.name))
            }
        } else {
            otherColumns.forEach {
                mismatchSubscriber(ConvertingMismatch.WideningMismatch.RejectedColumn(it.name))
            }
        }
        return VectorSchemaRoot(vectors)
    }

    // Releases every Arrow buffer owned by this writer.
    override fun close() {
        allocator.close()
    }
}
@@ -0,0 +1,101 @@
package org.jetbrains.kotlinx.dataframe.io
import org.jetbrains.kotlinx.dataframe.exceptions.CellConversionException
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConverterNotFoundException
/**
 * Detailed message about any mismatch when saving to Arrow format with a user-defined schema
 * that does not match the actual data.
 * Instances can be sent to a callback, written to a log, or wrapped into a [ConvertingException].
 */
public sealed class ConvertingMismatch(
    /** Name of the column with mismatch */
    public open val column: String,
    /** Number of first row with mismatch (0-based) if defined */
    public open val row: Int?,
    /** Original exception if exist */
    public open val cause: Exception?,
) {

    /** The actual data is wider than the target schema: it contains extra columns. */
    public sealed class WideningMismatch(column: String) : ConvertingMismatch(column, null, null) {

        /** An extra column was saved even though the target schema does not describe it. */
        public data class AddedColumn(override val column: String) : WideningMismatch(column) {
            override fun toString(): String = "Added column \"$column\" not described in target schema"
        }

        /** An extra column was dropped because the target schema does not describe it. */
        public data class RejectedColumn(override val column: String) : WideningMismatch(column) {
            override fun toString(): String = "Column \"$column\" is not described in target schema and was ignored"
        }
    }

    /** The actual data is narrower than the target schema: a non-nullable field has no matching column. */
    public sealed class NarrowingMismatch(column: String) : ConvertingMismatch(column, null, null) {

        /** The missing field was skipped and the rest of the data saved. */
        public data class NotPresentedColumnIgnored(override val column: String) : NarrowingMismatch(column) {
            override fun toString(): String =
                "Not nullable column \"$column\" is not presented in actual data, saving as is"
        }

        /** The missing field aborted saving. */
        public data class NotPresentedColumnError(override val column: String) : NarrowingMismatch(column) {
            override fun toString(): String =
                "Not nullable column \"$column\" is not presented in actual data, can not save"
        }
    }

    /** No converter exists from the column's type to the type declared in the target schema. */
    public sealed class TypeConversionNotFound(column: String, cause: TypeConverterNotFoundException) :
        ConvertingMismatch(column, null, cause) {

        /** The column was saved with its actual type instead. */
        public data class ConversionNotFoundIgnored(
            override val column: String,
            override val cause: TypeConverterNotFoundException,
        ) : TypeConversionNotFound(column, cause) {
            override fun toString(): String = "${cause.message} for column \"$column\", saving as is"
        }

        /** The missing converter aborted saving. */
        // NOTE(review): the property is named `e` (not `cause`) — kept for binary/API compatibility.
        public data class ConversionNotFoundError(override val column: String, val e: TypeConverterNotFoundException) :
            TypeConversionNotFound(column, e) {
            override fun toString(): String = "${e.message} for column \"$column\", can not save"
        }
    }

    /** A converter exists but failed on a particular cell. */
    public sealed class TypeConversionFail(
        column: String,
        row: Int?,
        public override val cause: CellConversionException,
    ) : ConvertingMismatch(column, row, cause) {

        /** The column was saved with its actual type instead. */
        public data class ConversionFailIgnored(
            override val column: String,
            override val row: Int?,
            override val cause: CellConversionException,
        ) : TypeConversionFail(column, row, cause) {
            override fun toString(): String = "${cause.message}, saving as is"
        }

        /** The failed conversion aborted saving. */
        public data class ConversionFailError(
            override val column: String,
            override val row: Int?,
            override val cause: CellConversionException,
        ) : TypeConversionFail(column, row, cause) {
            override fun toString(): String = "${cause.message}, can not save"
        }
    }

    /** A column of unsupported [type] was saved with String representation. */
    public data class SavedAsString(override val column: String, val type: Class<*>) :
        ConvertingMismatch(column, null, null) {
        // Fixed: the message previously ended with a stray escaped quote (`…saved as String""`).
        override fun toString(): String = "Column \"$column\" has type ${type.canonicalName}, will be saved as String"
    }

    /** A non-nullable target field received null values. */
    public sealed class NullableMismatch(column: String, row: Int?) : ConvertingMismatch(column, row, null) {

        /** The field was saved as nullable instead. */
        public data class NullValueIgnored(override val column: String, override val row: Int?) :
            NullableMismatch(column, row) {
            override fun toString(): String =
                "Column \"$column\" contains nulls in row $row but expected not nullable, saving as is"
        }

        /** The null values aborted saving. */
        public data class NullValueError(override val column: String, override val row: Int?) :
            NullableMismatch(column, row) {
            override fun toString(): String =
                "Column \"$column\" contains nulls in row $row but expected not nullable, can not save"
        }
    }
}
/**
 * Thrown when a schema mismatch makes saving impossible under the current strict settings;
 * wraps the triggering [mismatchCase] and propagates its original cause.
 */
public class ConvertingException(
    public val mismatchCase: ConvertingMismatch,
) : IllegalArgumentException(mismatchCase.toString(), mismatchCase.cause)
@@ -0,0 +1,269 @@
package org.jetbrains.kotlinx.dataframe.io
import org.apache.arrow.dataset.file.FileFormat
import org.apache.arrow.memory.RootAllocator
import org.apache.arrow.vector.ipc.ArrowReader
import org.apache.commons.compress.utils.SeekableInMemoryByteChannel
import org.jetbrains.kotlinx.dataframe.AnyFrame
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.NullabilityOptions
import org.jetbrains.kotlinx.dataframe.codeGen.AbstractDefaultReadMethod
import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod
import java.io.File
import java.io.InputStream
import java.net.URI
import java.net.URL
import java.nio.channels.Channels
import java.nio.channels.ReadableByteChannel
import java.nio.channels.SeekableByteChannel
import java.nio.file.Files
import java.nio.file.Path
/** [SupportedDataFrameFormat] implementation for Arrow Feather (random access) files. */
public class ArrowFeather : SupportedDataFrameFormat {

    /** Reads a Feather file from [stream]; [header] is ignored — Arrow files carry their own schema. */
    override fun readDataFrame(stream: InputStream, header: List<String>): AnyFrame =
        DataFrame.readArrowFeather(stream, NullabilityOptions.Widening)

    /** Reads a Feather file at [path]; [header] is ignored — Arrow files carry their own schema. */
    override fun readDataFrame(path: Path, header: List<String>): AnyFrame =
        DataFrame.readArrowFeather(path, NullabilityOptions.Widening)

    /** Only the `feather` extension is recognised. */
    override fun acceptsExtension(ext: String): Boolean = ext == "feather"

    /** The extension alone is decisive; the sample content is never inspected. */
    override fun acceptsSample(sample: SupportedFormatSample): Boolean = true

    override val testOrder: Int = 50_000

    override fun createDefaultReadMethod(pathRepresentation: String?): DefaultReadDfMethod =
        DefaultReadArrowMethod(pathRepresentation)
}
// Name of the generated default read method for Feather files.
private const val READ_ARROW_FEATHER = "readArrowFeather"

// Default number of rows fetched per batch when scanning via Arrow Dataset (see readParquet).
internal const val ARROW_PARQUET_DEFAULT_BATCH_SIZE = 32768L
// Code-generation descriptor: produces a no-argument `readArrowFeather(path)` call for generated code.
private class DefaultReadArrowMethod(path: String?) :
    AbstractDefaultReadMethod(path, MethodArguments.EMPTY, READ_ARROW_FEATHER)
/** Process-wide Arrow allocator shared by the reading functions in this file. */
internal object Allocator {
    /** Lazily created root allocator with an unbounded memory limit. */
    val ROOT: RootAllocator by lazy { RootAllocator(Long.MAX_VALUE) }
}
/**
 * Read [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format) data from existing [channel]
 *
 * @param channel open channel to read from (overloads that open their own channel close it with `use`).
 * @param allocator Arrow memory allocator; the shared [Allocator.ROOT] by default.
 * @param nullability strategy for inferring column nullability from the data.
 */
public fun DataFrame.Companion.readArrowIPC(
    channel: ReadableByteChannel,
    allocator: RootAllocator = Allocator.ROOT,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = readArrowIPCImpl(channel, allocator, nullability)
/**
 * Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files) data from existing [channel]
 *
 * @param channel open channel to read from; must be seekable, as the random access format requires.
 * @param allocator Arrow memory allocator; the shared [Allocator.ROOT] by default.
 * @param nullability strategy for inferring column nullability from the data.
 */
public fun DataFrame.Companion.readArrowFeather(
    channel: SeekableByteChannel,
    allocator: RootAllocator = Allocator.ROOT,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = readArrowFeatherImpl(channel, allocator, nullability)
// IPC reading block

/**
 * Read [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format) data from existing [file]
 */
public fun DataFrame.Companion.readArrowIPC(
    file: File,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = readArrowIPC(file.toPath(), nullability)
/**
 * Read [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format)
 * data from existing file on the given [path].
 * The channel opened for [path] is closed when reading completes.
 */
public fun DataFrame.Companion.readArrowIPC(
    path: Path,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = Files.newByteChannel(path).use { readArrowIPC(it, nullability = nullability) }
/**
 * Read [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format) data from existing [byteArray]
 */
public fun DataFrame.Companion.readArrowIPC(
    byteArray: ByteArray,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = SeekableInMemoryByteChannel(byteArray).use { readArrowIPC(it, nullability = nullability) }
/**
 * Read [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format) data from existing [stream]
 * The channel wrapped around [stream] is closed when reading completes.
 */
public fun DataFrame.Companion.readArrowIPC(
    stream: InputStream,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = Channels.newChannel(stream).use { readArrowIPC(it, nullability = nullability) }
/**
 * Read [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format) data from existing [url]
 *
 * @throws IllegalArgumentException when the protocol of [url] is not supported.
 */
public fun DataFrame.Companion.readArrowIPC(
    url: URL,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame =
    if (isFile(url)) {
        readArrowIPC(urlAsFile(url), nullability)
    } else if (isProtocolSupported(url)) {
        url.openStream().use { stream -> readArrowIPC(stream, nullability) }
    } else {
        throw IllegalArgumentException("Invalid protocol for url $url")
    }
/**
 * Read [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format) data from existing [path],
 * interpreted as a URL when it looks like one and as a local file path otherwise.
 */
public fun DataFrame.Companion.readArrowIPC(
    path: String,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame =
    if (isUrl(path)) {
        readArrowIPC(URI(path).toURL(), nullability)
    } else {
        readArrowIPC(File(path), nullability)
    }
// Feather reading block

/**
 * Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files) data from existing [file]
 */
public fun DataFrame.Companion.readArrowFeather(
    file: File,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = readArrowFeather(file.toPath(), nullability)
/**
 * Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files)
 * data from an existing file on the given [path].
 * The channel opened for [path] is closed when reading completes.
 */
public fun DataFrame.Companion.readArrowFeather(
    path: Path,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = Files.newByteChannel(path).use { readArrowFeather(it, nullability = nullability) }
/**
 * Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files) data from existing [byteArray]
 */
public fun DataFrame.Companion.readArrowFeather(
    byteArray: ByteArray,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = SeekableInMemoryByteChannel(byteArray).use { readArrowFeather(it, nullability = nullability) }
/**
 * Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files) data from existing [stream]
 *
 * Note: the whole stream is buffered in memory first, because the random access format
 * requires a seekable channel.
 */
public fun DataFrame.Companion.readArrowFeather(
    stream: InputStream,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = readArrowFeather(stream.readBytes(), nullability)
/**
 * Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files) data from existing [url]
 *
 * @throws IllegalArgumentException when the protocol of [url] is not supported.
 */
public fun DataFrame.Companion.readArrowFeather(
    url: URL,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame =
    if (isFile(url)) {
        readArrowFeather(urlAsFile(url), nullability)
    } else if (isProtocolSupported(url)) {
        readArrowFeather(url.readBytes(), nullability)
    } else {
        throw IllegalArgumentException("Invalid protocol for url $url")
    }
/**
 * Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files) data from existing [path].
 *
 * [path] is treated as a URL when [isUrl] recognizes it, otherwise as a local file path.
 */
public fun DataFrame.Companion.readArrowFeather(
    path: String,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame =
    when {
        isUrl(path) -> readArrowFeather(URI(path).toURL(), nullability)
        else -> readArrowFeather(File(path), nullability)
    }
/**
 * Read [Arrow any format](https://arrow.apache.org/docs/java/ipc.html#reading-writing-ipc-formats) data from existing [reader]
 */
public fun DataFrame.Companion.readArrow(
    reader: ArrowReader,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = readArrowImpl(reader, nullability)
/**
 * Read [Arrow any format](https://arrow.apache.org/docs/java/ipc.html#reading-writing-ipc-formats) data from existing [ArrowReader]
 *
 * Convenience mirror of [DataFrame.Companion.readArrow] as an extension on the reader itself.
 */
public fun ArrowReader.toDataFrame(nullability: NullabilityOptions = NullabilityOptions.Infer): AnyFrame =
    DataFrame.Companion.readArrowImpl(this, nullability)
/**
 * Read [Parquet](https://parquet.apache.org/) data from existing [urls] by using [Arrow Dataset](https://arrow.apache.org/docs/java/dataset.html)
 */
public fun DataFrame.Companion.readParquet(
    vararg urls: URL,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
    batchSize: Long = ARROW_PARQUET_DEFAULT_BATCH_SIZE,
): AnyFrame {
    // The dataset implementation works on string URIs.
    val uris = Array(urls.size) { index -> urls[index].toString() }
    return readArrowDatasetImpl(uris, FileFormat.PARQUET, nullability, batchSize)
}
/**
 * Read [Parquet](https://parquet.apache.org/) data from existing [strUrls] by using [Arrow Dataset](https://arrow.apache.org/docs/java/dataset.html)
 */
public fun DataFrame.Companion.readParquet(
    vararg strUrls: String,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
    batchSize: Long = ARROW_PARQUET_DEFAULT_BATCH_SIZE,
): AnyFrame {
    // Materialize the vararg (Array<out String>) into a plain Array<String> copy.
    val uris = Array(strUrls.size) { index -> strUrls[index] }
    return readArrowDatasetImpl(uris, FileFormat.PARQUET, nullability, batchSize)
}
/**
 * Read [Parquet](https://parquet.apache.org/) data from existing [paths] by using [Arrow Dataset](https://arrow.apache.org/docs/java/dataset.html)
 */
public fun DataFrame.Companion.readParquet(
    vararg paths: Path,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
    batchSize: Long = ARROW_PARQUET_DEFAULT_BATCH_SIZE,
): AnyFrame {
    // Each local path is converted into a `file:` URI string for the dataset layer.
    val uris = Array(paths.size) { index -> paths[index].toUri().toString() }
    return readArrowDatasetImpl(uris, FileFormat.PARQUET, nullability, batchSize)
}
/**
 * Read [Parquet](https://parquet.apache.org/) data from existing [files] by using [Arrow Dataset](https://arrow.apache.org/docs/java/dataset.html)
 */
public fun DataFrame.Companion.readParquet(
    vararg files: File,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
    batchSize: Long = ARROW_PARQUET_DEFAULT_BATCH_SIZE,
): AnyFrame =
    readArrowDatasetImpl(
        // Each file is converted into a `file:` URI string for the dataset layer.
        files.map {
            it.toURI().toString()
        }.toTypedArray(),
        FileFormat.PARQUET,
        nullability,
        batchSize,
    )
@@ -0,0 +1,481 @@
package org.jetbrains.kotlinx.dataframe.io
import kotlinx.datetime.LocalDate
import kotlinx.datetime.LocalDateTime
import kotlinx.datetime.LocalTime
import kotlinx.datetime.toKotlinLocalDate
import kotlinx.datetime.toKotlinLocalDateTime
import kotlinx.datetime.toKotlinLocalTime
import org.apache.arrow.dataset.file.FileFormat
import org.apache.arrow.dataset.file.FileSystemDatasetFactory
import org.apache.arrow.dataset.jni.DirectReservationListener
import org.apache.arrow.dataset.jni.NativeMemoryPool
import org.apache.arrow.dataset.scanner.ScanOptions
import org.apache.arrow.memory.RootAllocator
import org.apache.arrow.vector.BigIntVector
import org.apache.arrow.vector.BitVector
import org.apache.arrow.vector.DateDayVector
import org.apache.arrow.vector.DateMilliVector
import org.apache.arrow.vector.Decimal256Vector
import org.apache.arrow.vector.DecimalVector
import org.apache.arrow.vector.DurationVector
import org.apache.arrow.vector.FieldVector
import org.apache.arrow.vector.Float4Vector
import org.apache.arrow.vector.Float8Vector
import org.apache.arrow.vector.IntVector
import org.apache.arrow.vector.LargeVarBinaryVector
import org.apache.arrow.vector.LargeVarCharVector
import org.apache.arrow.vector.NullVector
import org.apache.arrow.vector.SmallIntVector
import org.apache.arrow.vector.TimeMicroVector
import org.apache.arrow.vector.TimeMilliVector
import org.apache.arrow.vector.TimeNanoVector
import org.apache.arrow.vector.TimeSecVector
import org.apache.arrow.vector.TimeStampMicroVector
import org.apache.arrow.vector.TimeStampMilliVector
import org.apache.arrow.vector.TimeStampNanoVector
import org.apache.arrow.vector.TimeStampSecVector
import org.apache.arrow.vector.TinyIntVector
import org.apache.arrow.vector.UInt1Vector
import org.apache.arrow.vector.UInt2Vector
import org.apache.arrow.vector.UInt4Vector
import org.apache.arrow.vector.UInt8Vector
import org.apache.arrow.vector.VarBinaryVector
import org.apache.arrow.vector.VarCharVector
import org.apache.arrow.vector.VectorSchemaRoot
import org.apache.arrow.vector.ViewVarBinaryVector
import org.apache.arrow.vector.ViewVarCharVector
import org.apache.arrow.vector.complex.StructVector
import org.apache.arrow.vector.ipc.ArrowFileReader
import org.apache.arrow.vector.ipc.ArrowReader
import org.apache.arrow.vector.ipc.ArrowStreamReader
import org.apache.arrow.vector.types.pojo.Field
import org.apache.arrow.vector.util.DateUtility
import org.jetbrains.kotlinx.dataframe.AnyBaseCol
import org.jetbrains.kotlinx.dataframe.AnyFrame
import org.jetbrains.kotlinx.dataframe.DataColumn
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.Infer
import org.jetbrains.kotlinx.dataframe.api.NullabilityException
import org.jetbrains.kotlinx.dataframe.api.NullabilityOptions
import org.jetbrains.kotlinx.dataframe.api.applyNullability
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
import org.jetbrains.kotlinx.dataframe.api.emptyDataFrame
import org.jetbrains.kotlinx.dataframe.api.getColumn
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
import org.jetbrains.kotlinx.dataframe.impl.asList
import java.io.File
import java.math.BigDecimal
import java.math.BigInteger
import java.net.URI
import java.nio.channels.ReadableByteChannel
import java.nio.channels.SeekableByteChannel
import java.nio.file.Files
import kotlin.reflect.KType
import kotlin.reflect.full.withNullability
import kotlin.reflect.typeOf
import kotlin.time.Duration
import kotlin.time.toKotlinDuration
import java.time.LocalTime as JavaLocalTime
/**
 * Concatenates the given frames without re-inferring column types
 * (same as `Iterable<DataFrame<T>>.concat()` but all batches must share one schema):
 * the first frame's column names and declared column types are reused as-is.
 */
internal fun <T> Iterable<DataFrame<T>>.concatKeepingSchema(): DataFrame<T> {
    val frames = asList()
    if (frames.isEmpty()) return emptyDataFrame()
    if (frames.size == 1) return frames.first()

    val template = frames.first()
    val merged = template.columnNames().map { name ->
        val allValues = frames.flatMap { frame -> frame.getColumn(name).values() }
        DataColumn.createValueColumn(name, allValues, template.getColumn(name).type())
    }
    return dataFrameOf(merged).cast()
}
// Per-vector extraction helpers: materialize the values of an Arrow vector over [range]
// as a Kotlin list, preserving nulls. Arrow's getObject* accessors return null for null slots.
private fun BitVector.values(range: IntRange): List<Boolean?> = range.map { getObject(it) }

// Unsigned ints are widened into the next larger signed type to avoid overflow
// (u8 -> Short, u32 -> Long, u64 -> BigInteger).
private fun UInt1Vector.values(range: IntRange): List<Short?> = range.map { getObjectNoOverflow(it) }

// UInt2Vector.getObject yields a Char; `.code` converts it to its integer value.
private fun UInt2Vector.values(range: IntRange): List<Int?> = range.map { getObject(it)?.code }

private fun UInt4Vector.values(range: IntRange): List<Long?> = range.map { getObjectNoOverflow(it) }

private fun UInt8Vector.values(range: IntRange): List<BigInteger?> = range.map { getObjectNoOverflow(it) }

private fun TinyIntVector.values(range: IntRange): List<Byte?> = range.map { getObject(it) }

private fun SmallIntVector.values(range: IntRange): List<Short?> = range.map { getObject(it) }

private fun IntVector.values(range: IntRange): List<Int?> = range.map { getObject(it) }

private fun BigIntVector.values(range: IntRange): List<Long?> = range.map { getObject(it) }

private fun DecimalVector.values(range: IntRange): List<BigDecimal?> = range.map { getObject(it) }

private fun Decimal256Vector.values(range: IntRange): List<BigDecimal?> = range.map { getObject(it) }

private fun Float4Vector.values(range: IntRange): List<Float?> = range.map { getObject(it) }

private fun Float8Vector.values(range: IntRange): List<Double?> = range.map { getObject(it) }

private fun DurationVector.values(range: IntRange): List<Duration?> = range.map { getObject(it).toKotlinDuration() }
/**
 * Extracts [LocalDate] values from a day-precision date vector over [range].
 *
 * A DateDay slot stores the number of days since the Unix epoch; it is converted to
 * epoch milliseconds and then to a date. The original implementation called
 * [DateDayVector.getObject] twice per element (once for the null check, once for the
 * value); this version reads the slot exactly once via `let`.
 */
private fun DateDayVector.values(range: IntRange): List<LocalDate?> =
    range.map { index ->
        getObject(index)?.let { days ->
            DateUtility.getLocalDateTimeFromEpochMilli(days.toLong() * DateUtility.daysToStandardMillis)
                .toLocalDate()
                .toKotlinLocalDate()
        }
    }
// Millisecond-precision dates: Arrow's getObject already yields java.time.LocalDateTime.
private fun DateMilliVector.values(range: IntRange): List<LocalDateTime?> =
    range.map { getObject(it)?.toKotlinLocalDateTime() }

/**
 * Time-of-day vectors store a count of nano/micro/milli/second units since midnight.
 *
 * Fix: the original variants used `mapIndexed { i, it -> if (isNull(i)) ... get(it) }`,
 * checking nullity at the *position within the range* instead of the vector index. The two
 * only coincide for ranges starting at 0; the null check now uses the vector index itself.
 */
private fun TimeNanoVector.values(range: IntRange): List<LocalTime?> =
    range.map { index ->
        if (isNull(index)) {
            null
        } else {
            JavaLocalTime.ofNanoOfDay(get(index)).toKotlinLocalTime()
        }
    }

// Micros since midnight -> nanos (x1000). Same index-based null-check fix as above.
private fun TimeMicroVector.values(range: IntRange): List<LocalTime?> =
    range.map { index ->
        if (isNull(index)) {
            null
        } else {
            JavaLocalTime.ofNanoOfDay(getObject(index) * 1000).toKotlinLocalTime()
        }
    }

// Millis since midnight -> nanos (x1_000_000). Same index-based null-check fix as above.
private fun TimeMilliVector.values(range: IntRange): List<LocalTime?> =
    range.map { index ->
        if (isNull(index)) {
            null
        } else {
            JavaLocalTime.ofNanoOfDay(get(index).toLong() * 1000_000).toKotlinLocalTime()
        }
    }

// Seconds since midnight; getObject returns null for null slots, so no explicit isNull needed.
private fun TimeSecVector.values(range: IntRange): List<LocalTime?> =
    range.map { getObject(it)?.let { JavaLocalTime.ofSecondOfDay(it.toLong()).toKotlinLocalTime() } }
/**
 * Timestamp vectors: Arrow's getObject yields java.time.LocalDateTime for every precision,
 * so only the kotlinx-datetime adapter is needed.
 *
 * Fix: the originals used `mapIndexed { i, it -> if (isNull(i)) ... getObject(it) }`, checking
 * nullity at the *position within the range* instead of the vector index. The two only coincide
 * for ranges starting at 0; the null check now uses the vector index itself.
 */
private fun TimeStampNanoVector.values(range: IntRange): List<LocalDateTime?> =
    range.map { index ->
        if (isNull(index)) {
            null
        } else {
            getObject(index).toKotlinLocalDateTime()
        }
    }

private fun TimeStampMicroVector.values(range: IntRange): List<LocalDateTime?> =
    range.map { index ->
        if (isNull(index)) {
            null
        } else {
            getObject(index).toKotlinLocalDateTime()
        }
    }

private fun TimeStampMilliVector.values(range: IntRange): List<LocalDateTime?> =
    range.map { index ->
        if (isNull(index)) {
            null
        } else {
            getObject(index).toKotlinLocalDateTime()
        }
    }

private fun TimeStampSecVector.values(range: IntRange): List<LocalDateTime?> =
    range.map { index ->
        if (isNull(index)) {
            null
        } else {
            getObject(index).toKotlinLocalDateTime()
        }
    }
// Struct slots are surfaced as maps of child-field name to value (Arrow's getObject contract
// for StructVector).
private fun StructVector.values(range: IntRange): List<Map<String, Any?>?> =
    range.map {
        getObject(it)
    }

// A NullVector holds only nulls; getObject always returns null, cast keeps the List<Nothing?> type.
private fun NullVector.values(range: IntRange): List<Nothing?> =
    range.map {
        getObject(it) as Nothing?
    }

// Variable-width UTF-8 text: get(it) returns the raw bytes, decoded here with Kotlin's
// String(ByteArray) constructor. isNull guards against get() on a null slot.
private fun VarCharVector.values(range: IntRange): List<String?> =
    range.map {
        if (isNull(it)) {
            null
        } else {
            String(get(it))
        }
    }

private fun LargeVarCharVector.values(range: IntRange): List<String?> =
    range.map {
        if (isNull(it)) {
            null
        } else {
            String(get(it))
        }
    }

private fun ViewVarCharVector.values(range: IntRange): List<String?> =
    range.map {
        if (isNull(it)) {
            null
        } else {
            String(get(it))
        }
    }
// Variable-width binary columns: each non-null slot is surfaced as the raw ByteArray
// returned by get(it); isNull guards against calling get() on a null slot.
private fun VarBinaryVector.values(range: IntRange): List<ByteArray?> =
    range.map {
        if (isNull(it)) {
            null
        } else {
            get(it)
        }
    }

private fun LargeVarBinaryVector.values(range: IntRange): List<ByteArray?> =
    range.map {
        if (isNull(it)) {
            null
        } else {
            get(it)
        }
    }

private fun ViewVarBinaryVector.values(range: IntRange): List<ByteArray?> =
    range.map {
        if (isNull(it)) {
            null
        } else {
            get(it)
        }
    }
/**
 * Returns the [KType] of `Nothing` or `Nothing?` depending on [nullable].
 * `typeOf<Nothing>()` cannot be written directly, so the type is extracted from the
 * element argument of a `List<Nothing(?)>` reification.
 * Note: `.arguments` applies to the whole if-expression, not just the else branch.
 */
internal fun nothingType(nullable: Boolean): KType =
    if (nullable) {
        typeOf<List<Nothing?>>()
    } else {
        typeOf<List<Nothing>>()
    }.arguments.first().type!!

/**
 * Pairs [this] list with the reified element [KType], whose nullability is decided by
 * [nullabilityOptions] given whether the Arrow field declared itself nullable ([expectedNulls]).
 * May throw [NullabilityException] from [applyNullability] when actual nulls contradict
 * the chosen option.
 */
private inline fun <reified T> List<T?>.withTypeNullable(
    expectedNulls: Boolean,
    nullabilityOptions: NullabilityOptions,
): Pair<List<T?>, KType> {
    val nullable = nullabilityOptions.applyNullability(this, expectedNulls)
    return this to typeOf<T>().withNullability(nullable)
}

// Overload for Nothing-typed lists (NullVector columns), where typeOf<T>() cannot be used.
@JvmName("withTypeNullableNothingList")
private fun List<Nothing?>.withTypeNullable(
    expectedNulls: Boolean,
    nullabilityOptions: NullabilityOptions,
): Pair<List<Nothing?>, KType> {
    val nullable = nullabilityOptions.applyNullability(this, expectedNulls)
    return this to nothingType(nullable)
}
/**
 * Converts a single Arrow [vector] described by [field] into a DataFrame column.
 *
 * Struct fields become column groups (recursing into the children); every other supported
 * vector type is materialized eagerly via its `values` helper and typed according to the
 * field's declared nullability combined with [nullability].
 *
 * @throws IllegalArgumentException if the field is declared non-nullable but contains nulls
 *   (under a checking [NullabilityOptions]); the original [NullabilityException] is kept as cause.
 * @throws NotImplementedError for vector types this reader does not support.
 */
private fun readField(vector: FieldVector, field: Field, nullability: NullabilityOptions): AnyBaseCol {
    try {
        val range = 0 until vector.valueCount
        if (vector is StructVector) {
            // Struct -> column group: read each child field recursively.
            val columns = field.children.map { childField ->
                readField(vector.getChild(childField.name), childField, nullability)
            }
            return DataColumn.createColumnGroup(field.name, columns.toDataFrame())
        }
        val (list, type) = when (vector) {
            is VarCharVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is LargeVarCharVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is ViewVarCharVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is VarBinaryVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is LargeVarBinaryVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is ViewVarBinaryVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is BitVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is SmallIntVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is TinyIntVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is UInt1Vector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is UInt2Vector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is UInt4Vector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is UInt8Vector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is IntVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is BigIntVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is DecimalVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is Decimal256Vector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is Float8Vector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is Float4Vector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is DurationVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is DateDayVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is DateMilliVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is TimeNanoVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is TimeMicroVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is TimeMilliVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is TimeSecVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is TimeStampNanoVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is TimeStampMicroVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is TimeStampMilliVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is TimeStampSecVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            is NullVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
            else -> {
                throw NotImplementedError("reading from ${vector.javaClass.canonicalName} is not implemented")
            }
        }
        // Infer.None: trust the type computed above instead of re-scanning the values.
        return DataColumn.createValueColumn(field.name, list, type, Infer.None)
    } catch (unexpectedNull: NullabilityException) {
        // Fix: keep the original exception as cause instead of discarding the stack trace.
        throw IllegalArgumentException("Column `${field.name}` should be not nullable but has nulls", unexpectedNull)
    }
}
// Convenience overload: look the vector up in the schema root by its field.
private fun readField(root: VectorSchemaRoot, field: Field, nullability: NullabilityOptions): AnyBaseCol =
    readField(root.getVector(field), field, nullability)

/**
 * Read [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format) data from existing [channel]
 */
internal fun DataFrame.Companion.readArrowIPCImpl(
    channel: ReadableByteChannel,
    allocator: RootAllocator = Allocator.ROOT,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = readArrowImpl(ArrowStreamReader(channel, allocator), nullability)

/**
 * Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files) data from existing [channel]
 */
internal fun DataFrame.Companion.readArrowFeatherImpl(
    channel: SeekableByteChannel,
    allocator: RootAllocator = Allocator.ROOT,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = readArrowImpl(ArrowFileReader(channel, allocator), nullability)
/**
 * Read [Arrow any format](https://arrow.apache.org/docs/java/ipc.html#reading-writing-ipc-formats) data from existing [reader]
 *
 * Each record batch is converted to a DataFrame, and the batches are then concatenated
 * with [concatKeepingSchema]. The [reader] is closed when reading completes (or fails).
 */
internal fun DataFrame.Companion.readArrowImpl(
    reader: ArrowReader,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame {
    reader.use {
        val flattened = buildList {
            when (reader) {
                is ArrowFileReader -> {
                    // Random access format: iterate the file's record blocks explicitly.
                    reader.recordBlocks.forEach { block ->
                        reader.loadRecordBatch(block)
                        val root = reader.vectorSchemaRoot
                        val schema = root.schema
                        val df = schema.fields.map { f -> readField(root, f, nullability) }.toDataFrame()
                        add(df)
                    }
                }
                else -> {
                    // Streaming (or other) format: pull batches sequentially until exhausted.
                    val root = reader.vectorSchemaRoot
                    val schema = root.schema
                    while (reader.loadNextBatch()) {
                        val df = schema.fields.map { f -> readField(root, f, nullability) }.toDataFrame()
                        add(df)
                    }
                }
            }
        }
        return flattened.concatKeepingSchema()
    }
}
/**
 * Normalizes [fileUris] so the Arrow Dataset native layer can open each of them:
 * - remote HTTP(S) URIs are downloaded into a temporary local `.parquet` file;
 * - bare local paths that exist on disk are converted to proper `file:` URIs;
 * - everything else is passed through untouched.
 *
 * Fixes vs. the original:
 * - `https:` URIs are now matched as well (only `http:` was handled before);
 * - the download uses stream copy instead of `Files.copy(InputStream, Path)`, which would
 *   throw [java.nio.file.FileAlreadyExistsException] because `File.createTempFile` already
 *   creates the (empty) target file.
 */
private fun resolveArrowDatasetUris(fileUris: Array<String>): Array<String> =
    fileUris.map { uri ->
        when {
            uri.startsWith("http:", ignoreCase = true) || uri.startsWith("https:", ignoreCase = true) -> {
                val url = URI.create(uri).toURL()
                val tempFile = File.createTempFile("kdf", ".parquet")
                tempFile.deleteOnExit()
                url.openStream().use { input ->
                    tempFile.outputStream().use { output -> input.copyTo(output) }
                }
                tempFile.toURI().toString()
            }

            !uri.startsWith("file:", ignoreCase = true) && File(uri).exists() -> File(uri).toURI().toString()

            else -> uri
        }
    }.toTypedArray()
/**
 * Read [Arrow Dataset](https://arrow.apache.org/docs/java/dataset.html) from [fileUris]
 *
 * Every native resource (allocator, dataset factory, dataset, scanner, batch reader)
 * is opened in a nested [use] chain so all are released even when reading fails.
 */
internal fun DataFrame.Companion.readArrowDatasetImpl(
    fileUris: Array<String>,
    fileFormat: FileFormat,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
    batchSize: Long = ARROW_PARQUET_DEFAULT_BATCH_SIZE,
): AnyFrame {
    val scanOptions = ScanOptions(batchSize)
    RootAllocator().use { allocator ->
        FileSystemDatasetFactory(
            allocator,
            NativeMemoryPool.createListenable(DirectReservationListener.instance()),
            fileFormat,
            // Remote/relative inputs are normalized to URIs the native layer understands.
            resolveArrowDatasetUris(fileUris),
        ).use { datasetFactory ->
            datasetFactory.finish().use { dataset ->
                dataset.newScan(scanOptions).use { scanner ->
                    scanner.scanBatches().use { reader ->
                        return readArrowImpl(reader, nullability)
                    }
                }
            }
        }
    }
}
@@ -0,0 +1,136 @@
package org.jetbrains.kotlinx.dataframe.io
import kotlinx.datetime.LocalDate
import kotlinx.datetime.LocalDateTime
import kotlinx.datetime.LocalTime
import org.apache.arrow.vector.types.DateUnit
import org.apache.arrow.vector.types.FloatingPointPrecision
import org.apache.arrow.vector.types.TimeUnit
import org.apache.arrow.vector.types.pojo.ArrowType
import org.apache.arrow.vector.types.pojo.Field
import org.apache.arrow.vector.types.pojo.FieldType
import org.apache.arrow.vector.types.pojo.Schema
import org.jetbrains.kotlinx.dataframe.AnyCol
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
import org.jetbrains.kotlinx.dataframe.typeClass
import kotlin.reflect.full.isSubtypeOf
import kotlin.reflect.typeOf
import java.time.LocalDate as JavaLocalDate
import java.time.LocalDateTime as JavaLocalDateTime
import java.time.LocalTime as JavaLocalTime
/**
 * Create Arrow [Field] (note: this is part of [Schema], does not contain data itself) that has the same
 * name, type and nullable as [this].
 *
 * Column groups become Struct fields with one child per nested column. Columns whose type has
 * no Arrow counterpart are declared as nullable UTF-8 (String) and reported to [mismatchSubscriber].
 */
public fun AnyCol.toArrowField(mismatchSubscriber: (ConvertingMismatch) -> Unit = ignoreMismatchMessage): Field {
    val column = this
    val columnType = column.type()
    val nullable = columnType.isMarkedNullable
    return when {
        column is ColumnGroup<*> -> {
            val childFields = column.columns().map { it.toArrowField(mismatchSubscriber) }
            Field(
                column.name(),
                FieldType(nullable, ArrowType.Struct(), null),
                childFields,
            )
        }

        columnType.isSubtypeOf(typeOf<String?>()) ->
            Field(
                column.name(),
                FieldType(nullable, ArrowType.Utf8(), null),
                emptyList(),
            )

        columnType.isSubtypeOf(typeOf<Boolean?>()) ->
            Field(
                column.name(),
                FieldType(nullable, ArrowType.Bool(), null),
                emptyList(),
            )

        columnType.isSubtypeOf(typeOf<Byte?>()) ->
            Field(
                column.name(),
                FieldType(nullable, ArrowType.Int(8, true), null),
                emptyList(),
            )

        columnType.isSubtypeOf(typeOf<Short?>()) ->
            Field(
                column.name(),
                FieldType(nullable, ArrowType.Int(16, true), null),
                emptyList(),
            )

        columnType.isSubtypeOf(typeOf<Int?>()) ->
            Field(
                column.name(),
                FieldType(nullable, ArrowType.Int(32, true), null),
                emptyList(),
            )

        columnType.isSubtypeOf(typeOf<Long?>()) ->
            Field(
                column.name(),
                FieldType(nullable, ArrowType.Int(64, true), null),
                emptyList(),
            )

        columnType.isSubtypeOf(typeOf<Float?>()) ->
            Field(
                column.name(),
                FieldType(nullable, ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE), null),
                emptyList(),
            )

        columnType.isSubtypeOf(typeOf<Double?>()) ->
            Field(
                column.name(),
                FieldType(nullable, ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE), null),
                emptyList(),
            )

        columnType.isSubtypeOf(typeOf<JavaLocalDate?>()) ||
            columnType.isSubtypeOf(typeOf<LocalDate?>()) ->
            Field(
                column.name(),
                FieldType(nullable, ArrowType.Date(DateUnit.DAY), null),
                emptyList(),
            )

        columnType.isSubtypeOf(typeOf<JavaLocalDateTime?>()) ||
            columnType.isSubtypeOf(typeOf<LocalDateTime?>()) ->
            Field(
                column.name(),
                FieldType(nullable, ArrowType.Date(DateUnit.MILLISECOND), null),
                emptyList(),
            )

        // Fix: use the nullable form `LocalTime?` (all sibling branches do); the original
        // non-nullable `LocalTime` check let nullable kotlinx LocalTime columns fall through
        // to the String fallback below.
        columnType.isSubtypeOf(typeOf<JavaLocalTime?>()) ||
            columnType.isSubtypeOf(typeOf<LocalTime?>()) ->
            Field(
                column.name(),
                FieldType(nullable, ArrowType.Time(TimeUnit.NANOSECOND, 64), null),
                emptyList(),
            )

        else -> {
            // Unsupported type: saved as String; the field is always declared nullable here.
            mismatchSubscriber(ConvertingMismatch.SavedAsString(column.name(), column.typeClass.java))
            Field(column.name(), FieldType(true, ArrowType.Utf8(), null), emptyList())
        }
    }
}
/**
 * Create Arrow [Schema] matching [this] actual data.
 * Columns with not supported types will be interpreted as String;
 * each such downgrade is reported to [mismatchSubscriber].
 */
public fun List<AnyCol>.toArrowSchema(
    mismatchSubscriber: (ConvertingMismatch) -> Unit = ignoreMismatchMessage,
): Schema = Schema(map { column -> column.toArrowField(mismatchSubscriber) })
@@ -0,0 +1,117 @@
package org.jetbrains.kotlinx.dataframe.io
import org.apache.arrow.vector.types.pojo.Schema
import org.jetbrains.kotlinx.dataframe.AnyFrame
import java.io.File
import java.io.OutputStream
import java.nio.channels.WritableByteChannel
import java.nio.file.Path
/**
 * Create [ArrowWriter] for [this] DataFrame with target schema matching actual data
 */
public fun AnyFrame.arrowWriter(): ArrowWriter = this.arrowWriter(this.columns().toArrowSchema())

/**
 * Create [ArrowWriter] for [this] DataFrame with explicit [targetSchema].
 * If DataFrame does not match with [targetSchema], behaviour is specified by [mode], mismatches would be sent to [mismatchSubscriber]
 */
public fun AnyFrame.arrowWriter(
    targetSchema: Schema,
    mode: ArrowWriter.Mode = ArrowWriter.Mode.STRICT,
    mismatchSubscriber: (ConvertingMismatch) -> Unit = ignoreMismatchMessage,
): ArrowWriter = ArrowWriter.create(this, targetSchema, mode, mismatchSubscriber)
// IPC saving block with default parameters
/**
 * Save data to [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format), write to opened [channel].
 * A temporary [ArrowWriter] is created and closed by [use].
 */
public fun AnyFrame.writeArrowIPC(channel: WritableByteChannel) {
    arrowWriter().use { it.writeArrowIPC(channel) }
}

/**
 * Save data to [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format), write to opened [stream].
 */
public fun AnyFrame.writeArrowIPC(stream: OutputStream) {
    arrowWriter().use { it.writeArrowIPC(stream) }
}

/**
 * Save data to [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format), write to new or existing [file].
 * If [file] exists, it can be recreated or expanded depending on [append].
 */
public fun AnyFrame.writeArrowIPC(file: File, append: Boolean = true) {
    writeArrowIPC(file.toPath(), append)
}

/**
 * Save data to [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format),
 * write to new or existing file on the given [path].
 * If file exists, it can be recreated or expanded depending on [append].
 */
public fun AnyFrame.writeArrowIPC(path: Path, append: Boolean = true) {
    arrowWriter().use { it.writeArrowIPC(path, append) }
}

/**
 * Save data to [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format), write to new [ByteArray]
 */
public fun AnyFrame.saveArrowIPCToByteArray(): ByteArray = arrowWriter().use { it.saveArrowIPCToByteArray() }
// Feather saving block with default parameters
/**
 * Save data to [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files), write to opened [channel].
 * A temporary [ArrowWriter] is created and closed by [use].
 */
public fun AnyFrame.writeArrowFeather(channel: WritableByteChannel) {
    arrowWriter().use { it.writeArrowFeather(channel) }
}

/**
 * Save data to [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files), write to opened [stream].
 */
public fun AnyFrame.writeArrowFeather(stream: OutputStream) {
    arrowWriter().use { it.writeArrowFeather(stream) }
}

/**
 * Save data to [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files), write to new or existing [file].
 * If file exists, it would be recreated.
 */
public fun AnyFrame.writeArrowFeather(file: File) {
    writeArrowFeather(file.toPath())
}

/**
 * Save data to [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files),
 * write to new or existing file on the given [path].
 * If file exists, it would be recreated.
 */
public fun AnyFrame.writeArrowFeather(path: Path) {
    arrowWriter().use { it.writeArrowFeather(path) }
}

/**
 * Save data to [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files), write to new [ByteArray]
 */
public fun AnyFrame.saveArrowFeatherToByteArray(): ByteArray = arrowWriter().use { it.saveArrowFeatherToByteArray() }
@@ -0,0 +1 @@
org.jetbrains.kotlinx.dataframe.io.ArrowFeather
@@ -0,0 +1,813 @@
package org.jetbrains.kotlinx.dataframe.io
import io.kotest.assertions.throwables.shouldThrow
import io.kotest.matchers.collections.shouldContain
import io.kotest.matchers.shouldBe
import kotlinx.datetime.LocalDate
import kotlinx.datetime.LocalDateTime
import kotlinx.datetime.UtcOffset
import kotlinx.datetime.toInstant
import org.apache.arrow.memory.RootAllocator
import org.apache.arrow.vector.TimeStampMicroVector
import org.apache.arrow.vector.TimeStampMilliVector
import org.apache.arrow.vector.TimeStampNanoVector
import org.apache.arrow.vector.TimeStampSecVector
import org.apache.arrow.vector.VectorSchemaRoot
import org.apache.arrow.vector.ipc.ArrowFileReader
import org.apache.arrow.vector.ipc.ArrowFileWriter
import org.apache.arrow.vector.ipc.ArrowReader
import org.apache.arrow.vector.ipc.ArrowStreamReader
import org.apache.arrow.vector.ipc.ArrowStreamWriter
import org.apache.arrow.vector.types.FloatingPointPrecision
import org.apache.arrow.vector.types.TimeUnit
import org.apache.arrow.vector.types.pojo.ArrowType
import org.apache.arrow.vector.types.pojo.Field
import org.apache.arrow.vector.types.pojo.FieldType
import org.apache.arrow.vector.types.pojo.Schema
import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel
import org.duckdb.DuckDBConnection
import org.duckdb.DuckDBResultSet
import org.jetbrains.kotlinx.dataframe.AnyFrame
import org.jetbrains.kotlinx.dataframe.DataColumn
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.NullabilityOptions
import org.jetbrains.kotlinx.dataframe.api.add
import org.jetbrains.kotlinx.dataframe.api.columnOf
import org.jetbrains.kotlinx.dataframe.api.convertToBoolean
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
import org.jetbrains.kotlinx.dataframe.api.map
import org.jetbrains.kotlinx.dataframe.api.pathOf
import org.jetbrains.kotlinx.dataframe.api.remove
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConverterNotFoundException
import org.junit.Assert
import org.junit.Test
import java.io.ByteArrayInputStream
import java.io.ByteArrayOutputStream
import java.io.File
import java.net.URL
import java.nio.channels.Channels
import java.sql.DriverManager
import java.util.Locale
import kotlin.io.path.toPath
import kotlin.reflect.typeOf
import kotlin.time.toJavaInstant
internal class ArrowKtTest {
// Resolves a classpath test resource; non-null asserted because missing resources are a test-setup error.
fun testResource(resourcePath: String): URL = ArrowKtTest::class.java.classLoader.getResource(resourcePath)!!

// Shorthands for the two fixture flavors used below.
fun testArrowFeather(name: String) = testResource("$name.feather")

fun testArrowIPC(name: String) = testResource("$name.ipc")
@Test
fun testReadingFromFile() {
    // Reads a known feather fixture and compares against a hand-built expected frame,
    // including a nested column group `c` with three inner columns.
    val feather = testArrowFeather("data-arrow_2.0.0_uncompressed")
    val df = DataFrame.readArrowFeather(feather)
    val a by columnOf("one")
    val b by columnOf(2.0)
    val c by columnOf(
        "c1" to columnOf("inner"),
        "c2" to columnOf(4.0),
        "c3" to columnOf(50.0),
    )
    val d by columnOf("four")
    val expected = dataFrameOf(a, b, c, d)
    df shouldBe expected
}
@Test
fun testReadingAllTypesAsEstimated() {
    // The fixture declares nullable fields but contains no nulls: Infer drops the
    // nullability, while Checking and Widening keep the declared (nullable) schema.
    val expectedNullableByOption = linkedMapOf(
        NullabilityOptions.Infer to false,
        NullabilityOptions.Checking to true,
        NullabilityOptions.Widening to true,
    )
    for ((option, expectedNullable) in expectedNullableByOption) {
        assertEstimations(
            exampleFrame = DataFrame.readArrowFeather(testArrowFeather("test.arrow"), option),
            expectedNullable = expectedNullable,
            hasNulls = false,
        )
        assertEstimations(
            exampleFrame = DataFrame.readArrowIPC(testArrowIPC("test.arrow"), option),
            expectedNullable = expectedNullable,
            hasNulls = false,
        )
    }
}
@Test
fun testReadingAllTypesAsEstimatedWithNulls() {
    // The fixture actually contains nulls, so every nullability strategy must yield
    // nullable columns for both the feather and the IPC variant.
    val options = listOf(
        NullabilityOptions.Infer,
        NullabilityOptions.Checking,
        NullabilityOptions.Widening,
    )
    for (option in options) {
        assertEstimations(
            exampleFrame = DataFrame.readArrowFeather(testArrowFeather("test-with-nulls.arrow"), option),
            expectedNullable = true,
            hasNulls = true,
        )
        assertEstimations(
            exampleFrame = DataFrame.readArrowIPC(testArrowIPC("test-with-nulls.arrow"), option),
            expectedNullable = true,
            hasNulls = true,
        )
    }
}
@Test
fun testReadingAllTypesAsEstimatedNotNullable() {
    // The example file declares non-nullable fields and contains no nulls, so all
    // three nullability strategies must agree on non-nullable, null-free columns.
    for (nullability in listOf(
        NullabilityOptions.Infer,
        NullabilityOptions.Checking,
        NullabilityOptions.Widening,
    )) {
        assertEstimations(
            exampleFrame = DataFrame.readArrowFeather(testArrowFeather("test-not-nullable.arrow"), nullability),
            expectedNullable = false,
            hasNulls = false,
        )
        assertEstimations(
            exampleFrame = DataFrame.readArrowIPC(testArrowIPC("test-not-nullable.arrow"), nullability),
            expectedNullable = false,
            hasNulls = false,
        )
    }
}
@Test
fun testReadingAllTypesAsEstimatedNotNullableWithNulls() {
    // "test-illegal.arrow" declares its fields non-nullable but actually contains nulls.
    // Infer looks at the data rather than the declared flag.
    assertEstimations(
        exampleFrame = DataFrame.readArrowFeather(
            testArrowFeather("test-illegal.arrow"),
            NullabilityOptions.Infer,
        ),
        expectedNullable = true,
        hasNulls = true,
    )
    assertEstimations(
        exampleFrame = DataFrame.readArrowIPC(
            testArrowIPC("test-illegal.arrow"),
            NullabilityOptions.Infer,
        ),
        expectedNullable = true,
        hasNulls = true,
    )
    // Checking trusts the declared flag and must reject the contradictory data.
    shouldThrow<IllegalArgumentException> {
        assertEstimations(
            exampleFrame = DataFrame.readArrowFeather(
                testArrowFeather("test-illegal.arrow"),
                NullabilityOptions.Checking,
            ),
            expectedNullable = false,
            hasNulls = true,
        )
    }
    shouldThrow<IllegalArgumentException> {
        assertEstimations(
            exampleFrame = DataFrame.readArrowIPC(
                testArrowIPC("test-illegal.arrow"),
                NullabilityOptions.Checking,
            ),
            expectedNullable = false,
            hasNulls = true,
        )
    }
    // Widening relaxes the declared flag to nullable instead of failing.
    assertEstimations(
        exampleFrame = DataFrame.readArrowFeather(
            testArrowFeather("test-illegal.arrow"),
            NullabilityOptions.Widening,
        ),
        expectedNullable = true,
        hasNulls = true,
    )
    assertEstimations(
        exampleFrame = DataFrame.readArrowIPC(
            testArrowIPC("test-illegal.arrow"),
            NullabilityOptions.Widening,
        ),
        expectedNullable = true,
        hasNulls = true,
    )
}
@Test
fun testWritingGeneral() {
    // Checks a deserialized copy against citiesExampleFrame, allowing for the
    // two documented conversions noted inline below.
    fun assertEstimation(citiesDeserialized: DataFrame<*>) {
        citiesDeserialized["name"] shouldBe citiesExampleFrame["name"]
        citiesDeserialized["affiliation"] shouldBe citiesExampleFrame["affiliation"]
        citiesDeserialized["is_capital"] shouldBe citiesExampleFrame["is_capital"]
        citiesDeserialized["population"] shouldBe citiesExampleFrame["population"]
        citiesDeserialized["area"] shouldBe citiesExampleFrame["area"]
        // cities["settled"].type() refers to FlexibleTypeImpl(LocalDate..LocalDate?)
        // and does not match typeOf<LocalDate>()
        citiesDeserialized["settled"].type() shouldBe typeOf<LocalDate>()
        citiesDeserialized["settled"].values() shouldBe citiesExampleFrame["settled"].values()
        // cities["page_in_wiki"].type() is URI, not supported by Arrow directly
        citiesDeserialized["page_in_wiki"].type() shouldBe typeOf<String>()
        citiesDeserialized["page_in_wiki"].values() shouldBe
            citiesExampleFrame["page_in_wiki"].values().map { it.toString() }
    }
    // deleteOnExit() so repeated runs do not accumulate temp files (the original leaked it).
    val testFile = File.createTempFile("cities", "arrow").apply { deleteOnExit() }
    citiesExampleFrame.writeArrowFeather(testFile)
    assertEstimation(DataFrame.readArrowFeather(testFile))
    val testByteArray = citiesExampleFrame.saveArrowIPCToByteArray()
    assertEstimation(DataFrame.readArrowIPC(testByteArray))
}
@Test
fun testWritingBySchema() {
    // Writing against citiesExampleSchema converts columns to the schema's types
    // and replaces "page_in_wiki" with the schema-only "film_in_youtube" column.
    // deleteOnExit() so repeated runs do not accumulate temp files (the original leaked it).
    val testFile = File.createTempFile("cities", "arrow").apply { deleteOnExit() }
    citiesExampleFrame.arrowWriter(Schema.fromJSON(citiesExampleSchema)).use { it.writeArrowFeather(testFile) }
    val citiesDeserialized = DataFrame.readArrowFeather(testFile, NullabilityOptions.Checking)
    citiesDeserialized["population"].type() shouldBe typeOf<Long?>()
    citiesDeserialized["area"].type() shouldBe typeOf<Float>()
    citiesDeserialized["settled"].type() shouldBe typeOf<LocalDateTime>()
    shouldThrow<IllegalArgumentException> { citiesDeserialized["page_in_wiki"] }
    // The schema-only column has no source data, so it is filled with nulls.
    citiesDeserialized["film_in_youtube"] shouldBe
        DataColumn.createValueColumn(
            name = "film_in_youtube",
            values = arrayOfNulls<String>(citiesExampleFrame.rowsCount()).asList(),
        )
}
@Test
fun testWidening() {
    // "Widening": the frame has a column ("page_in_wiki") absent from the target schema.
    val warnings = ArrayList<ConvertingMismatch>()
    // STRICT mode drops the extra column, reporting the rejection via the warning callback.
    val testRestrictWidening = citiesExampleFrame.arrowWriter(
        targetSchema = Schema.fromJSON(citiesExampleSchema),
        mode = ArrowWriter.Mode.STRICT,
    ) { warning ->
        warnings.add(warning)
    }.use { it.saveArrowFeatherToByteArray() }
    warnings.shouldContain(ConvertingMismatch.WideningMismatch.RejectedColumn("page_in_wiki"))
    shouldThrow<IllegalArgumentException> { DataFrame.readArrowFeather(testRestrictWidening)["page_in_wiki"] }
    // With restrictWidening = false the extra column is kept (URL values saved as strings).
    val testAllowWidening = citiesExampleFrame.arrowWriter(
        targetSchema = Schema.fromJSON(citiesExampleSchema),
        mode = ArrowWriter.Mode(
            restrictWidening = false,
            restrictNarrowing = true,
            strictType = true,
            strictNullable = true,
        ),
    ).use { it.saveArrowFeatherToByteArray() }
    DataFrame.readArrowFeather(testAllowWidening)["page_in_wiki"].values() shouldBe
        citiesExampleFrame["page_in_wiki"]
            .values()
            .map { it.toString() }
}
@Test
fun testNarrowing() {
    // "Narrowing": the frame is missing a column ("settled") the target schema requires.
    val frameWithoutRequiredField = citiesExampleFrame.remove("settled")
    // STRICT mode refuses to save a frame lacking a schema-required column.
    frameWithoutRequiredField.arrowWriter(
        targetSchema = Schema.fromJSON(citiesExampleSchema),
        mode = ArrowWriter.Mode.STRICT,
    ).use {
        shouldThrow<ConvertingException> { it.saveArrowFeatherToByteArray() }
    }
    val warnings = ArrayList<ConvertingMismatch>()
    // With restrictNarrowing = false the missing column is skipped with a warning.
    val testAllowNarrowing = frameWithoutRequiredField.arrowWriter(
        targetSchema = Schema.fromJSON(citiesExampleSchema),
        mode = ArrowWriter.Mode(
            restrictWidening = true,
            restrictNarrowing = false,
            strictType = true,
            strictNullable = true,
        ),
    ) { warning ->
        warnings.add(warning)
    }.use { it.saveArrowFeatherToByteArray() }
    warnings.shouldContain(ConvertingMismatch.NarrowingMismatch.NotPresentedColumnIgnored("settled"))
    shouldThrow<IllegalArgumentException> { DataFrame.readArrowFeather(testAllowNarrowing)["settled"] }
}
@Test
fun testStrictType() {
    // Build a frame whose "settled" column is Boolean while the schema expects a datetime.
    val frameRenaming = citiesExampleFrame.remove("settled")
    val frameWithIncompatibleField =
        frameRenaming.add(
            frameRenaming["is_capital"]
                .map { value -> value ?: false }
                .rename("settled")
                .convertToBoolean(),
        )
    // STRICT mode fails: there is no Boolean -> LocalDateTime converter.
    frameWithIncompatibleField.arrowWriter(
        Schema.fromJSON(citiesExampleSchema),
        ArrowWriter.Mode.STRICT,
    ).use {
        shouldThrow<ConvertingException> { it.saveArrowFeatherToByteArray() }
    }
    val warnings = ArrayList<ConvertingMismatch>()
    // With strictType = false the conversion failure becomes a warning and the
    // column is written with its actual (Boolean) type instead.
    val testLoyalType = frameWithIncompatibleField.arrowWriter(
        targetSchema = Schema.fromJSON(citiesExampleSchema),
        mode = ArrowWriter.Mode(
            restrictWidening = true,
            restrictNarrowing = true,
            strictType = false,
            strictNullable = true,
        ),
    ) { warning ->
        warnings.add(warning)
    }.use { it.saveArrowFeatherToByteArray() }
    // Compared via toString because the nested exception has no structural equality.
    warnings.map { it.toString() }.shouldContain(
        ConvertingMismatch.TypeConversionNotFound.ConversionNotFoundIgnored(
            "settled",
            TypeConverterNotFoundException(
                typeOf<Boolean>(),
                typeOf<kotlinx.datetime.LocalDateTime?>(),
                pathOf("settled"),
            ),
        ).toString(),
    )
    DataFrame.readArrowFeather(testLoyalType)["settled"].type() shouldBe typeOf<Boolean>()
}
@Test
fun testStrictNullable() {
    // Schema declares "settled" non-nullable; the frame provides only nulls there.
    val frameRenaming = citiesExampleFrame.remove("settled")
    val frameWithNulls = frameRenaming.add(
        DataColumn.createValueColumn(
            "settled",
            arrayOfNulls<LocalDate>(frameRenaming.rowsCount()).asList(),
        ),
    )
    // STRICT mode rejects nulls in a non-nullable target field.
    frameWithNulls.arrowWriter(
        targetSchema = Schema.fromJSON(citiesExampleSchema),
        mode = ArrowWriter.Mode.STRICT,
    ).use {
        shouldThrow<ConvertingException> { it.saveArrowFeatherToByteArray() }
    }
    val warnings = ArrayList<ConvertingMismatch>()
    // With strictNullable = false the nulls are written anyway, with a warning.
    val testLoyalNullable = frameWithNulls.arrowWriter(
        targetSchema = Schema.fromJSON(citiesExampleSchema),
        mode = ArrowWriter.Mode(
            restrictWidening = true,
            restrictNarrowing = true,
            strictType = true,
            strictNullable = false,
        ),
    ) { warning ->
        warnings.add(warning)
    }.use { it.saveArrowFeatherToByteArray() }
    warnings.shouldContain(ConvertingMismatch.NullableMismatch.NullValueIgnored("settled", 0))
    // Deserialize once; the original parsed the same byte array twice.
    val citiesDeserialized = DataFrame.readArrowFeather(testLoyalNullable)
    citiesDeserialized["settled"].type() shouldBe typeOf<LocalDateTime?>()
    citiesDeserialized["settled"].values() shouldBe
        arrayOfNulls<LocalDate>(frameRenaming.rowsCount()).asList()
}
@Test
fun testParsing() {
    // Two string columns: one uses '.' as the decimal separator, the other ','.
    val columnStringDot = columnOf("12.345", "67.890")
    val columnStringComma = columnOf("12,345", "67,890")
    val frameString = dataFrameOf("columnDot", "columnComma")(columnStringDot, columnStringComma)
    val columnDoubleFraction = columnOf(12.345, 67.890)
    val columnDoubleRound = columnOf(12345.0, 67890.0)
    val targetType = FieldType.notNullable(ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE))
    val targetSchema = Schema(
        listOf(
            Field("columnDot", targetType, emptyList()),
            Field("columnComma", targetType, emptyList()),
        ),
    )
    // Number parsing is locale-sensitive; always restore the default locale afterwards.
    val currentLocale = Locale.getDefault()
    try {
        // en-US: '.' is the decimal separator, ',' a grouping separator.
        Locale.setDefault(Locale.forLanguageTag("en-US"))
        // Close each writer via use() — the original leaked both of them, unlike
        // every other writer in this file.
        val serializedAsUs = frameString.arrowWriter(targetSchema).use { it.saveArrowFeatherToByteArray() }
        DataFrame.readArrowFeather(serializedAsUs) shouldBe dataFrameOf("columnDot", "columnComma")(
            columnDoubleFraction,
            columnDoubleRound,
        )
        // ru-RU: ',' is the decimal separator, so both columns parse as fractions.
        Locale.setDefault(Locale.forLanguageTag("ru-RU"))
        val serializedAsRu = frameString.arrowWriter(targetSchema).use { it.saveArrowFeatherToByteArray() }
        DataFrame.readArrowFeather(serializedAsRu) shouldBe
            dataFrameOf("columnDot", "columnComma")(
                columnDoubleFraction,
                columnDoubleFraction,
            )
    } finally {
        Locale.setDefault(currentLocale)
    }
}
@Test
fun testBigStringColumn() {
    // A >1 MiB variable-length string vector must survive a Feather round-trip.
    val original = dataFrameOf(bigStringColumn)
    val roundTripped = DataFrame.readArrowFeather(original.saveArrowFeatherToByteArray())
    roundTripped shouldBe original
}
@Test
fun testBigMixedColumn() {
    // A large Int column polluted with one String forces the conversion-failure path
    // and the "save everything as string" fallback in LOYAL mode.
    val dataFrame = dataFrameOf(bigMixedColumn)
    val warnings = ArrayList<ConvertingMismatch>()
    // use() closes the writer (the original never closed it) and saveArrowFeatherToByteArray
    // replaces the manual ByteArrayOutputStream plumbing, matching the other tests.
    val data = dataFrame.arrowWriter(
        targetSchema = Schema(
            listOf(
                Field("bigMixedColumn", FieldType.nullable(ArrowType.Int(64, true)), emptyList()),
            ),
        ),
        mode = ArrowWriter.Mode.LOYAL,
    ) {
        warnings.add(it)
    }.use { it.saveArrowFeatherToByteArray() }
    // kotest shouldBe instead of kotlin.assert, which silently passes without -ea.
    warnings.filterIsInstance<ConvertingMismatch.TypeConversionFail.ConversionFailIgnored>().size shouldBe 1
    warnings.filterIsInstance<ConvertingMismatch.SavedAsString>().size shouldBe 1
    DataFrame.readArrowFeather(data)["bigMixedColumn"] shouldBe dataFrame[bigMixedColumn].map { it.toString() }
}
@Test
fun testTimeStamp() {
    // Timestamps written at all four Arrow precisions (sec/milli/micro/nano)
    // must all read back as the same LocalDateTime values.
    val dates = listOf(
        LocalDateTime(2023, 11, 23, 9, 30, 25),
        LocalDateTime(2015, 5, 25, 14, 20, 13),
        LocalDateTime(2013, 6, 19, 11, 20, 13),
    )
    val dataFrame = dataFrameOf(
        "ts_nano" to dates,
        "ts_micro" to dates,
        "ts_milli" to dates,
        "ts_sec" to dates,
    )
    // Feather from the file writer, IPC from the stream writer (streaming = true).
    DataFrame.readArrowFeather(writeArrowTimestamp(dates)) shouldBe dataFrame
    DataFrame.readArrowIPC(writeArrowTimestamp(dates, true)) shouldBe dataFrame
}
/**
 * Writes [dates] (interpreted at UTC) into four timestamp vectors — nano, micro,
 * milli, and second precision — and returns the serialized Arrow bytes:
 * file format by default, stream (IPC) format when [streaming] is true.
 *
 * Fix: each resolution is now derived from `epochSecond` + `nano` of the instant.
 * The original computed the nano value as `toEpochMilli() * 1_000_000 + nano`,
 * which double-counts the sub-second part (nano already contains the millis),
 * and the micro value from `toEpochMilli()` alone, which drops microseconds.
 * Identical for the whole-second inputs used in tests, correct for fractional ones.
 */
private fun writeArrowTimestamp(dates: List<LocalDateTime>, streaming: Boolean = false): ByteArray {
    RootAllocator().use { allocator ->
        val timeStampMilli = Field(
            "ts_milli",
            FieldType.nullable(ArrowType.Timestamp(TimeUnit.MILLISECOND, null)),
            null,
        )
        val timeStampMicro = Field(
            "ts_micro",
            FieldType.nullable(ArrowType.Timestamp(TimeUnit.MICROSECOND, null)),
            null,
        )
        val timeStampNano = Field(
            "ts_nano",
            FieldType.nullable(ArrowType.Timestamp(TimeUnit.NANOSECOND, null)),
            null,
        )
        val timeStampSec = Field(
            "ts_sec",
            FieldType.nullable(ArrowType.Timestamp(TimeUnit.SECOND, null)),
            null,
        )
        val schemaTimeStamp = Schema(
            listOf(timeStampNano, timeStampMicro, timeStampMilli, timeStampSec),
        )
        VectorSchemaRoot.create(schemaTimeStamp, allocator).use { vectorSchemaRoot ->
            val timeStampMilliVector = vectorSchemaRoot.getVector("ts_milli") as TimeStampMilliVector
            val timeStampNanoVector = vectorSchemaRoot.getVector("ts_nano") as TimeStampNanoVector
            val timeStampMicroVector = vectorSchemaRoot.getVector("ts_micro") as TimeStampMicroVector
            val timeStampSecVector = vectorSchemaRoot.getVector("ts_sec") as TimeStampSecVector
            timeStampMilliVector.allocateNew(dates.size)
            timeStampNanoVector.allocateNew(dates.size)
            timeStampMicroVector.allocateNew(dates.size)
            timeStampSecVector.allocateNew(dates.size)
            dates.forEachIndexed { index, localDateTime ->
                val instant = localDateTime.toInstant(UtcOffset.ZERO).toJavaInstant()
                // Build every precision from whole seconds + nanos so the sub-second
                // part is counted exactly once at each resolution.
                timeStampNanoVector[index] = instant.epochSecond * 1_000_000_000L + instant.nano
                timeStampMicroVector[index] = instant.epochSecond * 1_000_000L + instant.nano / 1_000L
                timeStampMilliVector[index] = instant.toEpochMilli()
                timeStampSecVector[index] = instant.epochSecond
            }
            vectorSchemaRoot.setRowCount(dates.size)
            val bos = ByteArrayOutputStream()
            bos.use { out ->
                val arrowWriter = if (streaming) {
                    ArrowStreamWriter(vectorSchemaRoot, null, Channels.newChannel(out))
                } else {
                    ArrowFileWriter(vectorSchemaRoot, null, Channels.newChannel(out))
                }
                arrowWriter.use { writer ->
                    writer.start()
                    writer.writeBatch()
                }
            }
            // toByteArray() is safe on a closed ByteArrayOutputStream.
            return bos.toByteArray()
        }
    }
}
/** Four-row frame with one column per simple type, matching the DuckDB query below. */
private fun expectedSimpleDataFrame(): AnyFrame {
    val timestamps = listOf(
        LocalDateTime(2020, 11, 23, 9, 30, 25),
        LocalDateTime(2015, 5, 25, 14, 20, 13),
        LocalDateTime(2013, 6, 19, 11, 20, 13),
        LocalDateTime(2000, 1, 1, 0, 0, 0),
    )
    val ids = (1..4).toList()
    return dataFrameOf(
        "string" to listOf("a", "b", "c", "d"),
        "int" to ids,
        "float" to ids.map { it.toFloat() },
        "double" to ids.map { it.toDouble() },
        "datetime" to timestamps,
    )
}
@Test
fun testArrowReaderExtension() {
    // ArrowReader.toDataFrame() must work for both file and stream readers.
    val expected = expectedSimpleDataFrame()
    // Close the allocator and readers (the original leaked all three) so Arrow's
    // direct memory is released between tests.
    RootAllocator().use { allocator ->
        val featherChannel = ByteArrayReadableSeekableByteChannel(expected.saveArrowFeatherToByteArray())
        ArrowFileReader(featherChannel, allocator).use { fileReader ->
            fileReader.toDataFrame() shouldBe expected
        }
        val ipcInputStream = ByteArrayInputStream(expected.saveArrowIPCToByteArray())
        ArrowStreamReader(ipcInputStream, allocator).use { streamReader ->
            streamReader.toDataFrame() shouldBe expected
        }
    }
}
@Test
fun testDuckDBArrowIntegration() {
    // DuckDB exports query results through the Arrow C stream interface;
    // DataFrame.readArrow must consume that reader directly.
    val expected = expectedSimpleDataFrame()
    val query =
        """
        select 'a' as string, 1 as int, CAST(1.0 as FLOAT) as float, CAST(1.0 as DOUBLE) as double, TIMESTAMP '2020-11-23 09:30:25' as datetime
        UNION ALL SELECT 'b', 2, 2.0, 2.0, TIMESTAMP '2015-05-25 14:20:13'
        UNION ALL SELECT 'c', 3, 3.0, 3.0, TIMESTAMP '2013-06-19 11:20:13'
        UNION ALL SELECT 'd', 4, 4.0, 4.0, TIMESTAMP '2000-01-01 00:00:00'
        """.trimIndent()
    Class.forName("org.duckdb.DuckDBDriver")
    val conn = DriverManager.getConnection("jdbc:duckdb:") as DuckDBConnection
    conn.use {
        val resultSet = it.createStatement().executeQuery(query) as DuckDBResultSet
        val dbArrowReader = resultSet.arrowExportStream(RootAllocator(), 256) as ArrowReader
        // kotest shouldBe instead of Assert.assertTrue(name.equals(...)):
        // consistent with the rest of the file and gives a readable diff on failure.
        dbArrowReader.javaClass.name shouldBe "org.apache.arrow.c.ArrowArrayStreamReader"
        DataFrame.readArrow(dbArrowReader) shouldBe expected
    }
}
@Test
fun testReadParquetPath() {
    // Read a single Parquet file through the java.nio.Path overload.
    val parquetPath = testResource("test.arrow.parquet").toURI().toPath()
    val frame = DataFrame.readParquet(parquetPath)
    frame.rowsCount() shouldBe 300
    assertEstimations(
        exampleFrame = frame,
        expectedNullable = false,
        hasNulls = false,
        fromParquet = true,
    )
}
@Test
fun testReadParquetFile() {
    // Read the same Parquet resource through the java.io.File overload.
    val parquetFile = testResource("test.arrow.parquet").toURI().toPath().toFile()
    val frame = DataFrame.readParquet(parquetFile)
    frame.rowsCount() shouldBe 300
    assertEstimations(
        exampleFrame = frame,
        expectedNullable = false,
        hasNulls = false,
        fromParquet = true,
    )
}
@Test
fun testReadParquetStringPath() {
    // Read the Parquet resource through the String-path overload.
    val pathString = testResource("test.arrow.parquet").toURI().toPath().toString()
    val frame = DataFrame.readParquet(pathString)
    frame.rowsCount() shouldBe 300
    assertEstimations(
        exampleFrame = frame,
        expectedNullable = false,
        hasNulls = false,
        fromParquet = true,
    )
}
@Test
fun testReadParquetUrl() {
    // Read the Parquet resource through the URL overload (a file:// URL here).
    val fileUrl = testResource("test.arrow.parquet").toURI().toPath().toUri().toURL()
    val frame = DataFrame.readParquet(fileUrl)
    frame.rowsCount() shouldBe 300
    assertEstimations(
        exampleFrame = frame,
        expectedNullable = false,
        hasNulls = false,
        fromParquet = true,
    )
}
@Test
fun testReadMultipleParquetFiles() {
    // Passing the same 300-row file three times concatenates the rows: 3 * 300 = 900.
    val parquetPath = testResource("test.arrow.parquet").toURI().toPath()
    val combined = DataFrame.readParquet(parquetPath, parquetPath, parquetPath)
    combined.rowsCount() shouldBe 900
}
@Test
fun testColumnGroupRoundtrip() {
    // A frame mixing a flat column with a column group must survive both formats.
    val source = dataFrameOf(
        "outer" to columnOf("x", "y", "z"),
        "inner" to columnOf(
            "nested1" to columnOf("a", "b", "c"),
            "nested2" to columnOf(1, 2, 3),
        ),
    )
    DataFrame.readArrowFeather(source.saveArrowFeatherToByteArray()) shouldBe source
    DataFrame.readArrowIPC(source.saveArrowIPCToByteArray()) shouldBe source
}
@Test
fun testNestedColumnGroupRoundtrip() {
    // Three levels of column-group nesting round-trip through Feather.
    val deeplyNested by columnOf(
        "level2" to columnOf(
            "level3" to columnOf(1, 2, 3),
        ),
    )
    val source = dataFrameOf(deeplyNested)
    DataFrame.readArrowFeather(source.saveArrowFeatherToByteArray()) shouldBe source
}
@Test
fun testColumnGroupWithNulls() {
    // Nulls inside a column group's children must be preserved through Feather.
    val group by columnOf(
        "a" to columnOf("x", null, "z"),
        "b" to columnOf(1, 2, null),
    )
    val source = dataFrameOf(group)
    DataFrame.readArrowFeather(source.saveArrowFeatherToByteArray()) shouldBe source
}
@Test
fun testReadParquetWithNestedStruct() {
    // A Parquet struct column ("author") must come back as a ColumnGroup.
    val df = DataFrame.readParquet(testResource("books.parquet").toURI().toPath())
    df.columnNames() shouldBe listOf("id", "title", "author", "genre", "publisher")
    val authorGroup = df["author"] as ColumnGroup<*>
    authorGroup.columnNames() shouldBe listOf("id", "firstName", "lastName")
    // Top-level flat columns keep their primitive types.
    df["id"].type() shouldBe typeOf<Int>()
    for (stringColumn in listOf("title", "genre", "publisher")) {
        df[stringColumn].type() shouldBe typeOf<String>()
    }
    // The struct's children are typed independently of the top-level columns.
    authorGroup["id"].type() shouldBe typeOf<Int>()
    for (stringColumn in listOf("firstName", "lastName")) {
        authorGroup[stringColumn].type() shouldBe typeOf<String>()
    }
}
@Test
fun testParquetNestedStructRoundtrip() {
    // A frame read from Parquet (including its struct column) must survive
    // both Feather and IPC round-trips unchanged.
    val source = DataFrame.readParquet(testResource("books.parquet").toURI().toPath())
    DataFrame.readArrowFeather(source.saveArrowFeatherToByteArray()) shouldBe source
    DataFrame.readArrowIPC(source.saveArrowIPCToByteArray()) shouldBe source
}
}
@@ -0,0 +1,208 @@
package org.jetbrains.kotlinx.dataframe.io
import io.kotest.matchers.shouldBe
import kotlinx.datetime.LocalDate
import kotlinx.datetime.LocalDateTime
import kotlinx.datetime.LocalTime
import kotlinx.datetime.toKotlinLocalDate
import kotlinx.datetime.toKotlinLocalDateTime
import kotlinx.datetime.toKotlinLocalTime
import org.jetbrains.kotlinx.dataframe.AnyFrame
import org.jetbrains.kotlinx.dataframe.DataColumn
import org.jetbrains.kotlinx.dataframe.api.forEachIndexed
import java.math.BigInteger
import java.time.ZoneOffset
import kotlin.math.absoluteValue
import kotlin.math.pow
import kotlin.reflect.full.withNullability
import kotlin.reflect.typeOf
import java.time.LocalDate as JavaLocalDate
import java.time.LocalDateTime as JavaLocalDateTime
import java.time.LocalTime as JavaLocalTime
/**
* Assert that we have got the same data that was originally saved on example creation.
* Example generation project is currently located at https://github.com/Kopilov/arrow_example
*/
internal fun assertEstimations(
    exampleFrame: AnyFrame,
    expectedNullable: Boolean,
    hasNulls: Boolean,
    fromParquet: Boolean = false,
) {
    /**
     * In [exampleFrame] we get two concatenated batches. To assert the estimations, we should transform frame row number to batch row number
     */
    fun iBatch(iFrame: Int): Int {
        val firstBatchSize = 100
        return if (iFrame < firstBatchSize) iFrame else iFrame - firstBatchSize
    }
    // By the example-file convention, every fifth row (1-based) holds a null.
    fun expectedNull(rowNumber: Int): Boolean = (rowNumber + 1) % 5 == 0
    // When the file has nulls, null-slot rows must be null; all others must match [expected].
    fun assertValueOrNull(rowNumber: Int, actual: Any?, expected: Any) {
        if (hasNulls && expectedNull(rowNumber)) {
            actual shouldBe null
        } else {
            actual shouldBe expected
        }
    }
    // --- String columns (ascii / utf8 / large utf8) ---
    val asciiStringCol = exampleFrame["asciiString"] as DataColumn<String?>
    asciiStringCol.type() shouldBe typeOf<String>().withNullability(expectedNullable)
    asciiStringCol.forEachIndexed { i, element ->
        assertValueOrNull(iBatch(i), element, "Test Example ${iBatch(i)}")
    }
    val utf8StringCol = exampleFrame["utf8String"] as DataColumn<String?>
    utf8StringCol.type() shouldBe typeOf<String>().withNullability(expectedNullable)
    utf8StringCol.forEachIndexed { i, element ->
        assertValueOrNull(iBatch(i), element, "Тестовый пример ${iBatch(i)}")
    }
    val largeStringCol = exampleFrame["largeString"] as DataColumn<String?>
    largeStringCol.type() shouldBe typeOf<String>().withNullability(expectedNullable)
    largeStringCol.forEachIndexed { i, element ->
        assertValueOrNull(iBatch(i), element, "Test Example Should Be Large ${iBatch(i)}")
    }
    // --- Boolean ---
    val booleanCol = exampleFrame["boolean"] as DataColumn<Boolean?>
    booleanCol.type() shouldBe typeOf<Boolean>().withNullability(expectedNullable)
    booleanCol.forEachIndexed { i, element ->
        assertValueOrNull(iBatch(i), element, iBatch(i) % 2 == 0)
    }
    // --- Signed integers: each width scales the row number by a different factor ---
    val byteCol = exampleFrame["byte"] as DataColumn<Byte?>
    byteCol.type() shouldBe typeOf<Byte>().withNullability(expectedNullable)
    byteCol.forEachIndexed { i, element ->
        assertValueOrNull(iBatch(i), element, (iBatch(i) * 10).toByte())
    }
    val shortCol = exampleFrame["short"] as DataColumn<Short?>
    shortCol.type() shouldBe typeOf<Short>().withNullability(expectedNullable)
    shortCol.forEachIndexed { i, element ->
        assertValueOrNull(iBatch(i), element, (iBatch(i) * 1000).toShort())
    }
    val intCol = exampleFrame["int"] as DataColumn<Int?>
    intCol.type() shouldBe typeOf<Int>().withNullability(expectedNullable)
    intCol.forEachIndexed { i, element ->
        assertValueOrNull(iBatch(i), element, iBatch(i) * 100000000)
    }
    val longCol = exampleFrame["longInt"] as DataColumn<Long?>
    longCol.type() shouldBe typeOf<Long>().withNullability(expectedNullable)
    longCol.forEachIndexed { i, element ->
        assertValueOrNull(iBatch(i), element, iBatch(i) * 100000000000000000L)
    }
    // --- Unsigned integers map to the next-wider signed Kotlin type;
    //     expected values wrap modulo the unsigned range ---
    val unsignedByteCol = exampleFrame["unsigned_byte"] as DataColumn<Short?>
    unsignedByteCol.type() shouldBe typeOf<Short>().withNullability(expectedNullable)
    unsignedByteCol.forEachIndexed { i, element ->
        assertValueOrNull(iBatch(i), element, (iBatch(i) * 10 % (Byte.MIN_VALUE.toShort() * 2).absoluteValue).toShort())
    }
    val unsignedShortCol = exampleFrame["unsigned_short"] as DataColumn<Int?>
    unsignedShortCol.type() shouldBe typeOf<Int>().withNullability(expectedNullable)
    unsignedShortCol.forEachIndexed { i, element ->
        assertValueOrNull(iBatch(i), element, iBatch(i) * 1000 % (Short.MIN_VALUE.toInt() * 2).absoluteValue)
    }
    val unsignedIntCol = exampleFrame["unsigned_int"] as DataColumn<Long?>
    unsignedIntCol.type() shouldBe typeOf<Long>().withNullability(expectedNullable)
    unsignedIntCol.forEachIndexed { i, element ->
        assertValueOrNull(
            rowNumber = iBatch(i),
            actual = element,
            expected = iBatch(i).toLong() * 100000000 % (Int.MIN_VALUE.toLong() * 2).absoluteValue,
        )
    }
    // Unsigned 64-bit has no signed Kotlin counterpart wide enough -> BigInteger.
    val unsignedLongIntCol = exampleFrame["unsigned_longInt"] as DataColumn<BigInteger?>
    unsignedLongIntCol.type() shouldBe typeOf<BigInteger>().withNullability(expectedNullable)
    unsignedLongIntCol.forEachIndexed { i, element ->
        assertValueOrNull(
            rowNumber = iBatch(i),
            actual = element,
            expected = iBatch(i).toBigInteger() * 100000000000000000L.toBigInteger() %
                (Long.MIN_VALUE.toBigInteger() * 2.toBigInteger()).abs(),
        )
    }
    // --- Floating point: values are exact powers of two, so shouldBe is safe ---
    val floatCol = exampleFrame["float"] as DataColumn<Float?>
    floatCol.type() shouldBe typeOf<Float>().withNullability(expectedNullable)
    floatCol.forEachIndexed { i, element ->
        assertValueOrNull(iBatch(i), element, 2.0f.pow(iBatch(i).toFloat()))
    }
    val doubleCol = exampleFrame["double"] as DataColumn<Double?>
    doubleCol.type() shouldBe typeOf<Double>().withNullability(expectedNullable)
    doubleCol.forEachIndexed { i, element ->
        assertValueOrNull(iBatch(i), element, 2.0.pow(iBatch(i)))
    }
    // --- Dates and datetimes ---
    val dateCol = exampleFrame["date32"] as DataColumn<LocalDate?>
    dateCol.type() shouldBe typeOf<LocalDate>().withNullability(expectedNullable)
    dateCol.forEachIndexed { i, element ->
        assertValueOrNull(iBatch(i), element, JavaLocalDate.ofEpochDay(iBatch(i).toLong() * 30).toKotlinLocalDate())
    }
    if (fromParquet) {
        // parquet format have only one type of date: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#date without time
        val datetimeCol = exampleFrame["date64"] as DataColumn<LocalDate?>
        datetimeCol.type() shouldBe typeOf<LocalDate>().withNullability(expectedNullable)
        datetimeCol.forEachIndexed { i, element ->
            assertValueOrNull(iBatch(i), element, JavaLocalDate.ofEpochDay(iBatch(i).toLong() * 30).toKotlinLocalDate())
        }
    } else {
        val datetimeCol = exampleFrame["date64"] as DataColumn<LocalDateTime?>
        datetimeCol.type() shouldBe typeOf<LocalDateTime>().withNullability(expectedNullable)
        datetimeCol.forEachIndexed { i, element ->
            assertValueOrNull(
                rowNumber = iBatch(i),
                actual = element,
                expected = JavaLocalDateTime.ofEpochSecond(
                    iBatch(i).toLong() * 60 * 60 * 24 * 30,
                    0,
                    ZoneOffset.UTC,
                ).toKotlinLocalDateTime(),
            )
        }
    }
    // --- Time-of-day at all four precisions ---
    val timeSecCol = exampleFrame["time32_seconds"] as DataColumn<LocalTime?>
    timeSecCol.type() shouldBe typeOf<LocalTime>().withNullability(expectedNullable)
    timeSecCol.forEachIndexed { i, element ->
        assertValueOrNull(iBatch(i), element, JavaLocalTime.ofSecondOfDay(iBatch(i).toLong()).toKotlinLocalTime())
    }
    val timeMilliCol = exampleFrame["time32_milli"] as DataColumn<LocalTime?>
    timeMilliCol.type() shouldBe typeOf<LocalTime>().withNullability(expectedNullable)
    timeMilliCol.forEachIndexed { i, element ->
        assertValueOrNull(
            rowNumber = iBatch(i),
            actual = element,
            expected = JavaLocalTime.ofNanoOfDay(iBatch(i).toLong() * 1000_000).toKotlinLocalTime(),
        )
    }
    val timeMicroCol = exampleFrame["time64_micro"] as DataColumn<LocalTime?>
    timeMicroCol.type() shouldBe typeOf<LocalTime>().withNullability(expectedNullable)
    timeMicroCol.forEachIndexed { i, element ->
        assertValueOrNull(iBatch(i), element, JavaLocalTime.ofNanoOfDay(iBatch(i).toLong() * 1000).toKotlinLocalTime())
    }
    val timeNanoCol = exampleFrame["time64_nano"] as DataColumn<LocalTime?>
    timeNanoCol.type() shouldBe typeOf<LocalTime>().withNullability(expectedNullable)
    timeNanoCol.forEachIndexed { i, element ->
        assertValueOrNull(iBatch(i), element, JavaLocalTime.ofNanoOfDay(iBatch(i).toLong()).toKotlinLocalTime())
    }
    // --- Optional all-null column: present only in files that actually contain nulls ---
    exampleFrame.getColumnOrNull("nulls")?.let { nullCol ->
        nullCol.type() shouldBe nothingType(hasNulls)
        assert(hasNulls)
        nullCol.values().forEach {
            assert(it == null)
        }
    }
}
@@ -0,0 +1,206 @@
package org.jetbrains.kotlinx.dataframe.io
import kotlinx.datetime.LocalDate
import org.jetbrains.kotlinx.dataframe.DataColumn
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
import java.net.URI
/**
* DataFrame to be saved in Apache Arrow
*/
val citiesExampleFrame = dataFrameOf(
    // Non-nullable String column.
    DataColumn.createValueColumn(
        "name",
        listOf(
            "Berlin",
            "Hamburg",
            "New York",
            "Washington",
            "Saint Petersburg",
            "Vatican",
        ),
    ),
    // Nullable String column (last value is null).
    DataColumn.createValueColumn(
        "affiliation",
        listOf(
            "Germany",
            "Germany",
            "The USA",
            "The USA",
            "Russia",
            null,
        ),
    ),
    // Nullable Boolean column.
    DataColumn.createValueColumn(
        "is_capital",
        listOf(
            true,
            false,
            false,
            true,
            false,
            null,
        ),
    ),
    // Int column; the target schema widens it to nullable Long.
    DataColumn.createValueColumn(
        "population",
        listOf(
            3_769_495,
            1_845_229,
            8_467_513,
            689_545,
            5_377_503,
            825,
        ),
    ),
    // Double column; the target schema narrows it to single-precision Float.
    DataColumn.createValueColumn(
        "area",
        listOf(
            891.7,
            755.22,
            1223.59,
            177.0,
            1439.0,
            0.44,
        ),
    ),
    // kotlinx.datetime.LocalDate column; the target schema stores it as
    // a date with millisecond precision.
    DataColumn.createValueColumn(
        "settled",
        listOf(
            LocalDate(1237, 1, 1),
            LocalDate(1189, 5, 7),
            LocalDate(1624, 1, 1),
            LocalDate(1790, 7, 16),
            LocalDate(1703, 5, 27),
            LocalDate(1929, 2, 11),
        ),
    ),
    // java.net.URL column — not directly representable in Arrow, so writers
    // either reject it (strict widening) or save it as strings.
    DataColumn.createValueColumn(
        "page_in_wiki",
        listOf(
            URI("https://en.wikipedia.org/wiki/Berlin").toURL(),
            URI("https://en.wikipedia.org/wiki/Hamburg").toURL(),
            URI("https://en.wikipedia.org/wiki/New_York_City").toURL(),
            URI("https://en.wikipedia.org/wiki/Washington,_D.C.").toURL(),
            URI("https://en.wikipedia.org/wiki/Saint_Petersburg").toURL(),
            URI("https://en.wikipedia.org/wiki/Vatican_City").toURL(),
        ),
    ),
)
/**
* [citiesExampleFrame] Apache Arrow schema with some changes.
* Originally generated by `citiesExampleFrame.columns().toArrowSchema().toJson()`
* Changes made to test converting and schema matching:
* field "population" changed to nullable Long;
* field "area" changed to single Float;
* field "settled" changed to datetime (date with millisecond precision);
* field "page_in_wiki" removed, nullable field "film_in_youtube" added.
*/
val citiesExampleSchema =
    // Raw Arrow schema JSON, parsed at use sites via Schema.fromJSON(citiesExampleSchema).
    // See the KDoc above for the deliberate differences from the frame's own schema.
    """
    {
      "fields" : [ {
        "name" : "name",
        "nullable" : false,
        "type" : {
          "name" : "utf8"
        },
        "children" : [ ]
      }, {
        "name" : "affiliation",
        "nullable" : true,
        "type" : {
          "name" : "utf8"
        },
        "children" : [ ]
      }, {
        "name" : "is_capital",
        "nullable" : true,
        "type" : {
          "name" : "bool"
        },
        "children" : [ ]
      }, {
        "name" : "population",
        "nullable" : true,
        "type" : {
          "name" : "int",
          "bitWidth" : 64,
          "isSigned" : true
        },
        "children" : [ ]
      }, {
        "name" : "area",
        "nullable" : false,
        "type" : {
          "name" : "floatingpoint",
          "precision" : "SINGLE"
        },
        "children" : [ ]
      }, {
        "name" : "settled",
        "nullable" : false,
        "type" : {
          "name" : "date",
          "unit" : "MILLISECOND"
        },
        "children" : [ ]
      }, {
        "name" : "film_in_youtube",
        "nullable" : true,
        "type" : {
          "name" : "utf8"
        },
        "children" : [ ]
      } ]
    }
    """.trimIndent()
/**
 * String column (variable length vector) with size >1 MiB.
 *
 * Mixes ASCII, Cyrillic, Greek, and CJK fragments so multi-byte UTF-8
 * encodings contribute to the vector size.
 */
val bigStringColumn = run {
    // Each fragment produces 1024 rows of the fragment repeated 64 times —
    // stdlib repeat()/flatMap replace the four hand-rolled StringBuilder loops.
    val fragments = listOf("abcd", "гдёж", "αβγδ", "正体字")
    val values = fragments.flatMap { fragment ->
        List(1024) { fragment.repeat(64) }
    }
    DataColumn.createValueColumn("bigStringColumn", values)
}
// Large Int column with one String value in the middle ("Dirty data"),
// used to exercise per-value conversion fallbacks on big batches.
val bigMixedColumn = run {
    val values = buildList<Any> {
        for (i in 0..32768) add(i * i)
        add("Dirty data")
        for (i in 32768 downTo 0) add(i * i)
    }
    DataColumn.createValueColumn("bigMixedColumn", values)
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.