init research
This commit is contained in:
Vendored
+8
@@ -0,0 +1,8 @@
|
||||
## :dataframe-arrow
|
||||
|
||||
This module, published as `dataframe-arrow`, contains all logic and tests for DataFrame to be able to work with
|
||||
Apache Arrow.
|
||||
|
||||
See [Read Apache Arrow formats](https://kotlin.github.io/dataframe/read.html#read-apache-arrow-formats) and
|
||||
[Writing to Apache Arrow formats](https://kotlin.github.io/dataframe/write.html#writing-to-apache-arrow-formats)
|
||||
for more information about how to use it.
|
||||
+325
@@ -0,0 +1,325 @@
|
||||
public final class org/jetbrains/kotlinx/dataframe/io/ArrowFeather : org/jetbrains/kotlinx/dataframe/io/SupportedDataFrameFormat {
|
||||
public fun <init> ()V
|
||||
public fun acceptsExtension (Ljava/lang/String;)Z
|
||||
public fun acceptsSample (Lorg/jetbrains/kotlinx/dataframe/io/SupportedFormatSample;)Z
|
||||
public fun createDefaultReadMethod (Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/codeGen/DefaultReadDfMethod;
|
||||
public fun getTestOrder ()I
|
||||
public fun readDataFrame (Ljava/io/File;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public fun readDataFrame (Ljava/io/InputStream;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public fun readDataFrame (Ljava/nio/file/Path;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
}
|
||||
|
||||
public final class org/jetbrains/kotlinx/dataframe/io/ArrowReadingKt {
|
||||
public static final fun readArrow (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Lorg/apache/arrow/vector/ipc/ArrowReader;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static synthetic fun readArrow$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Lorg/apache/arrow/vector/ipc/ArrowReader;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static final fun readArrowFeather (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/File;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static final fun readArrowFeather (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/InputStream;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static final fun readArrowFeather (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static final fun readArrowFeather (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/net/URL;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static final fun readArrowFeather (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/nio/channels/SeekableByteChannel;Lorg/apache/arrow/memory/RootAllocator;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static final fun readArrowFeather (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/nio/file/Path;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static final fun readArrowFeather (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[BLorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static synthetic fun readArrowFeather$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/File;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static synthetic fun readArrowFeather$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/InputStream;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static synthetic fun readArrowFeather$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static synthetic fun readArrowFeather$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/net/URL;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static synthetic fun readArrowFeather$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/nio/channels/SeekableByteChannel;Lorg/apache/arrow/memory/RootAllocator;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static synthetic fun readArrowFeather$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/nio/file/Path;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static synthetic fun readArrowFeather$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[BLorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static final fun readArrowIPC (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/File;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static final fun readArrowIPC (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/InputStream;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static final fun readArrowIPC (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static final fun readArrowIPC (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/net/URL;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static final fun readArrowIPC (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/nio/channels/ReadableByteChannel;Lorg/apache/arrow/memory/RootAllocator;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static final fun readArrowIPC (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/nio/file/Path;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static final fun readArrowIPC (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[BLorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static synthetic fun readArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/File;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static synthetic fun readArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/InputStream;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static synthetic fun readArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static synthetic fun readArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/net/URL;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static synthetic fun readArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/nio/channels/ReadableByteChannel;Lorg/apache/arrow/memory/RootAllocator;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static synthetic fun readArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/nio/file/Path;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static synthetic fun readArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[BLorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static final fun readParquet (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[Ljava/io/File;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;J)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static final fun readParquet (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;J)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static final fun readParquet (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[Ljava/net/URL;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;J)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static final fun readParquet (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[Ljava/nio/file/Path;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;J)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static synthetic fun readParquet$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[Ljava/io/File;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;JILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static synthetic fun readParquet$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;JILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static synthetic fun readParquet$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[Ljava/net/URL;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;JILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static synthetic fun readParquet$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;[Ljava/nio/file/Path;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;JILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static final fun toDataFrame (Lorg/apache/arrow/vector/ipc/ArrowReader;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public static synthetic fun toDataFrame$default (Lorg/apache/arrow/vector/ipc/ArrowReader;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
}
|
||||
|
||||
public final class org/jetbrains/kotlinx/dataframe/io/ArrowTypesMatchingKt {
|
||||
public static final fun toArrowField (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lkotlin/jvm/functions/Function1;)Lorg/apache/arrow/vector/types/pojo/Field;
|
||||
public static synthetic fun toArrowField$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lkotlin/jvm/functions/Function1;ILjava/lang/Object;)Lorg/apache/arrow/vector/types/pojo/Field;
|
||||
public static final fun toArrowSchema (Ljava/util/List;Lkotlin/jvm/functions/Function1;)Lorg/apache/arrow/vector/types/pojo/Schema;
|
||||
public static synthetic fun toArrowSchema$default (Ljava/util/List;Lkotlin/jvm/functions/Function1;ILjava/lang/Object;)Lorg/apache/arrow/vector/types/pojo/Schema;
|
||||
}
|
||||
|
||||
public abstract interface class org/jetbrains/kotlinx/dataframe/io/ArrowWriter : java/lang/AutoCloseable {
|
||||
public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Companion;
|
||||
public abstract fun allocateVectorSchemaRoot ()Lorg/apache/arrow/vector/VectorSchemaRoot;
|
||||
public abstract fun getDataFrame ()Lorg/jetbrains/kotlinx/dataframe/DataFrame;
|
||||
public abstract fun getMismatchSubscriber ()Lkotlin/jvm/functions/Function1;
|
||||
public abstract fun getMode ()Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode;
|
||||
public abstract fun getTargetSchema ()Lorg/apache/arrow/vector/types/pojo/Schema;
|
||||
public fun saveArrowFeatherToByteArray ()[B
|
||||
public fun saveArrowIPCToByteArray ()[B
|
||||
public fun writeArrowFeather (Ljava/io/File;)V
|
||||
public fun writeArrowFeather (Ljava/io/OutputStream;)V
|
||||
public fun writeArrowFeather (Ljava/nio/channels/WritableByteChannel;)V
|
||||
public fun writeArrowFeather (Ljava/nio/file/Path;)V
|
||||
public fun writeArrowIPC (Ljava/io/File;Z)V
|
||||
public fun writeArrowIPC (Ljava/io/OutputStream;)V
|
||||
public fun writeArrowIPC (Ljava/nio/channels/WritableByteChannel;)V
|
||||
public fun writeArrowIPC (Ljava/nio/file/Path;Z)V
|
||||
public static synthetic fun writeArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/io/File;ZILjava/lang/Object;)V
|
||||
public static synthetic fun writeArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/nio/file/Path;ZILjava/lang/Object;)V
|
||||
}
|
||||
|
||||
public final class org/jetbrains/kotlinx/dataframe/io/ArrowWriter$Companion {
|
||||
public final fun create (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Lorg/apache/arrow/vector/types/pojo/Schema;Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode;Lkotlin/jvm/functions/Function1;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;
|
||||
public static synthetic fun create$default (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Companion;Lorg/jetbrains/kotlinx/dataframe/DataFrame;Lorg/apache/arrow/vector/types/pojo/Schema;Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode;Lkotlin/jvm/functions/Function1;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;
|
||||
}
|
||||
|
||||
public final class org/jetbrains/kotlinx/dataframe/io/ArrowWriter$DefaultImpls {
|
||||
public static fun saveArrowFeatherToByteArray (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;)[B
|
||||
public static fun saveArrowIPCToByteArray (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;)[B
|
||||
public static fun writeArrowFeather (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/io/File;)V
|
||||
public static fun writeArrowFeather (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/io/OutputStream;)V
|
||||
public static fun writeArrowFeather (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/nio/channels/WritableByteChannel;)V
|
||||
public static fun writeArrowFeather (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/nio/file/Path;)V
|
||||
public static fun writeArrowIPC (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/io/File;Z)V
|
||||
public static fun writeArrowIPC (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/io/OutputStream;)V
|
||||
public static fun writeArrowIPC (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/nio/channels/WritableByteChannel;)V
|
||||
public static fun writeArrowIPC (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/nio/file/Path;Z)V
|
||||
public static synthetic fun writeArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/io/File;ZILjava/lang/Object;)V
|
||||
public static synthetic fun writeArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;Ljava/nio/file/Path;ZILjava/lang/Object;)V
|
||||
}
|
||||
|
||||
public final class org/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode {
|
||||
public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode$Companion;
|
||||
public fun <init> (ZZZZ)V
|
||||
public final fun component1 ()Z
|
||||
public final fun component2 ()Z
|
||||
public final fun component3 ()Z
|
||||
public final fun component4 ()Z
|
||||
public final fun copy (ZZZZ)Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode;
|
||||
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode;ZZZZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode;
|
||||
public fun equals (Ljava/lang/Object;)Z
|
||||
public final fun getRestrictNarrowing ()Z
|
||||
public final fun getRestrictWidening ()Z
|
||||
public final fun getStrictNullable ()Z
|
||||
public final fun getStrictType ()Z
|
||||
public fun hashCode ()I
|
||||
public fun toString ()Ljava/lang/String;
|
||||
}
|
||||
|
||||
public final class org/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode$Companion {
|
||||
public final fun getLOYAL ()Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode;
|
||||
public final fun getSTRICT ()Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode;
|
||||
}
|
||||
|
||||
public final class org/jetbrains/kotlinx/dataframe/io/ArrowWriterKt {
|
||||
public static final fun getIgnoreMismatchMessage ()Lkotlin/jvm/functions/Function1;
|
||||
public static final fun getLogMismatchMessage ()Lkotlin/jvm/functions/Function1;
|
||||
public static final fun getWriteMismatchMessage ()Lkotlin/jvm/functions/Function1;
|
||||
}
|
||||
|
||||
public final class org/jetbrains/kotlinx/dataframe/io/ArrowWritingKt {
|
||||
public static final fun arrowWriter (Lorg/jetbrains/kotlinx/dataframe/DataFrame;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;
|
||||
public static final fun arrowWriter (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Lorg/apache/arrow/vector/types/pojo/Schema;Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode;Lkotlin/jvm/functions/Function1;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;
|
||||
public static synthetic fun arrowWriter$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Lorg/apache/arrow/vector/types/pojo/Schema;Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter$Mode;Lkotlin/jvm/functions/Function1;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowWriter;
|
||||
public static final fun saveArrowFeatherToByteArray (Lorg/jetbrains/kotlinx/dataframe/DataFrame;)[B
|
||||
public static final fun saveArrowIPCToByteArray (Lorg/jetbrains/kotlinx/dataframe/DataFrame;)[B
|
||||
public static final fun writeArrowFeather (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/io/File;)V
|
||||
public static final fun writeArrowFeather (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/io/OutputStream;)V
|
||||
public static final fun writeArrowFeather (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/nio/channels/WritableByteChannel;)V
|
||||
public static final fun writeArrowFeather (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/nio/file/Path;)V
|
||||
public static final fun writeArrowIPC (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/io/File;Z)V
|
||||
public static final fun writeArrowIPC (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/io/OutputStream;)V
|
||||
public static final fun writeArrowIPC (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/nio/channels/WritableByteChannel;)V
|
||||
public static final fun writeArrowIPC (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/nio/file/Path;Z)V
|
||||
public static synthetic fun writeArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/io/File;ZILjava/lang/Object;)V
|
||||
public static synthetic fun writeArrowIPC$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/nio/file/Path;ZILjava/lang/Object;)V
|
||||
}
|
||||
|
||||
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingException : java/lang/IllegalArgumentException {
|
||||
public fun <init> (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch;)V
|
||||
public final fun getMismatchCase ()Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch;
|
||||
}
|
||||
|
||||
public abstract class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch {
|
||||
public synthetic fun <init> (Ljava/lang/String;Ljava/lang/Integer;Ljava/lang/Exception;Lkotlin/jvm/internal/DefaultConstructorMarker;)V
|
||||
public fun getCause ()Ljava/lang/Exception;
|
||||
public fun getColumn ()Ljava/lang/String;
|
||||
public fun getRow ()Ljava/lang/Integer;
|
||||
}
|
||||
|
||||
public abstract class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NarrowingMismatch : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch {
|
||||
public synthetic fun <init> (Ljava/lang/String;Lkotlin/jvm/internal/DefaultConstructorMarker;)V
|
||||
}
|
||||
|
||||
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NarrowingMismatch$NotPresentedColumnError : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NarrowingMismatch {
|
||||
public fun <init> (Ljava/lang/String;)V
|
||||
public final fun component1 ()Ljava/lang/String;
|
||||
public final fun copy (Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NarrowingMismatch$NotPresentedColumnError;
|
||||
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NarrowingMismatch$NotPresentedColumnError;Ljava/lang/String;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NarrowingMismatch$NotPresentedColumnError;
|
||||
public fun equals (Ljava/lang/Object;)Z
|
||||
public fun getColumn ()Ljava/lang/String;
|
||||
public fun hashCode ()I
|
||||
public fun toString ()Ljava/lang/String;
|
||||
}
|
||||
|
||||
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NarrowingMismatch$NotPresentedColumnIgnored : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NarrowingMismatch {
|
||||
public fun <init> (Ljava/lang/String;)V
|
||||
public final fun component1 ()Ljava/lang/String;
|
||||
public final fun copy (Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NarrowingMismatch$NotPresentedColumnIgnored;
|
||||
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NarrowingMismatch$NotPresentedColumnIgnored;Ljava/lang/String;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NarrowingMismatch$NotPresentedColumnIgnored;
|
||||
public fun equals (Ljava/lang/Object;)Z
|
||||
public fun getColumn ()Ljava/lang/String;
|
||||
public fun hashCode ()I
|
||||
public fun toString ()Ljava/lang/String;
|
||||
}
|
||||
|
||||
public abstract class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NullableMismatch : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch {
|
||||
public synthetic fun <init> (Ljava/lang/String;Ljava/lang/Integer;Lkotlin/jvm/internal/DefaultConstructorMarker;)V
|
||||
}
|
||||
|
||||
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NullableMismatch$NullValueError : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NullableMismatch {
|
||||
public fun <init> (Ljava/lang/String;Ljava/lang/Integer;)V
|
||||
public final fun component1 ()Ljava/lang/String;
|
||||
public final fun component2 ()Ljava/lang/Integer;
|
||||
public final fun copy (Ljava/lang/String;Ljava/lang/Integer;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NullableMismatch$NullValueError;
|
||||
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NullableMismatch$NullValueError;Ljava/lang/String;Ljava/lang/Integer;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NullableMismatch$NullValueError;
|
||||
public fun equals (Ljava/lang/Object;)Z
|
||||
public fun getColumn ()Ljava/lang/String;
|
||||
public fun getRow ()Ljava/lang/Integer;
|
||||
public fun hashCode ()I
|
||||
public fun toString ()Ljava/lang/String;
|
||||
}
|
||||
|
||||
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NullableMismatch$NullValueIgnored : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NullableMismatch {
|
||||
public fun <init> (Ljava/lang/String;Ljava/lang/Integer;)V
|
||||
public final fun component1 ()Ljava/lang/String;
|
||||
public final fun component2 ()Ljava/lang/Integer;
|
||||
public final fun copy (Ljava/lang/String;Ljava/lang/Integer;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NullableMismatch$NullValueIgnored;
|
||||
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NullableMismatch$NullValueIgnored;Ljava/lang/String;Ljava/lang/Integer;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$NullableMismatch$NullValueIgnored;
|
||||
public fun equals (Ljava/lang/Object;)Z
|
||||
public fun getColumn ()Ljava/lang/String;
|
||||
public fun getRow ()Ljava/lang/Integer;
|
||||
public fun hashCode ()I
|
||||
public fun toString ()Ljava/lang/String;
|
||||
}
|
||||
|
||||
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$SavedAsString : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch {
|
||||
public fun <init> (Ljava/lang/String;Ljava/lang/Class;)V
|
||||
public final fun component1 ()Ljava/lang/String;
|
||||
public final fun component2 ()Ljava/lang/Class;
|
||||
public final fun copy (Ljava/lang/String;Ljava/lang/Class;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$SavedAsString;
|
||||
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$SavedAsString;Ljava/lang/String;Ljava/lang/Class;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$SavedAsString;
|
||||
public fun equals (Ljava/lang/Object;)Z
|
||||
public fun getColumn ()Ljava/lang/String;
|
||||
public final fun getType ()Ljava/lang/Class;
|
||||
public fun hashCode ()I
|
||||
public fun toString ()Ljava/lang/String;
|
||||
}
|
||||
|
||||
public abstract class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionFail : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch {
|
||||
public synthetic fun <init> (Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;Lkotlin/jvm/internal/DefaultConstructorMarker;)V
|
||||
public synthetic fun getCause ()Ljava/lang/Exception;
|
||||
public fun getCause ()Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;
|
||||
}
|
||||
|
||||
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionFail$ConversionFailError : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionFail {
|
||||
public fun <init> (Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;)V
|
||||
public final fun component1 ()Ljava/lang/String;
|
||||
public final fun component2 ()Ljava/lang/Integer;
|
||||
public final fun component3 ()Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;
|
||||
public final fun copy (Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionFail$ConversionFailError;
|
||||
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionFail$ConversionFailError;Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionFail$ConversionFailError;
|
||||
public fun equals (Ljava/lang/Object;)Z
|
||||
public synthetic fun getCause ()Ljava/lang/Exception;
|
||||
public fun getCause ()Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;
|
||||
public fun getColumn ()Ljava/lang/String;
|
||||
public fun getRow ()Ljava/lang/Integer;
|
||||
public fun hashCode ()I
|
||||
public fun toString ()Ljava/lang/String;
|
||||
}
|
||||
|
||||
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionFail$ConversionFailIgnored : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionFail {
|
||||
public fun <init> (Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;)V
|
||||
public final fun component1 ()Ljava/lang/String;
|
||||
public final fun component2 ()Ljava/lang/Integer;
|
||||
public final fun component3 ()Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;
|
||||
public final fun copy (Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionFail$ConversionFailIgnored;
|
||||
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionFail$ConversionFailIgnored;Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionFail$ConversionFailIgnored;
|
||||
public fun equals (Ljava/lang/Object;)Z
|
||||
public synthetic fun getCause ()Ljava/lang/Exception;
|
||||
public fun getCause ()Lorg/jetbrains/kotlinx/dataframe/exceptions/CellConversionException;
|
||||
public fun getColumn ()Ljava/lang/String;
|
||||
public fun getRow ()Ljava/lang/Integer;
|
||||
public fun hashCode ()I
|
||||
public fun toString ()Ljava/lang/String;
|
||||
}
|
||||
|
||||
public abstract class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionNotFound : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch {
|
||||
public synthetic fun <init> (Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/exceptions/TypeConverterNotFoundException;Lkotlin/jvm/internal/DefaultConstructorMarker;)V
|
||||
}
|
||||
|
||||
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionNotFound$ConversionNotFoundError : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionNotFound {
|
||||
public fun <init> (Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/exceptions/TypeConverterNotFoundException;)V
|
||||
public final fun component1 ()Ljava/lang/String;
|
||||
public final fun component2 ()Lorg/jetbrains/kotlinx/dataframe/exceptions/TypeConverterNotFoundException;
|
||||
public final fun copy (Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/exceptions/TypeConverterNotFoundException;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionNotFound$ConversionNotFoundError;
|
||||
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionNotFound$ConversionNotFoundError;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/exceptions/TypeConverterNotFoundException;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionNotFound$ConversionNotFoundError;
|
||||
public fun equals (Ljava/lang/Object;)Z
|
||||
public fun getColumn ()Ljava/lang/String;
|
||||
public final fun getE ()Lorg/jetbrains/kotlinx/dataframe/exceptions/TypeConverterNotFoundException;
|
||||
public fun hashCode ()I
|
||||
public fun toString ()Ljava/lang/String;
|
||||
}
|
||||
|
||||
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionNotFound$ConversionNotFoundIgnored : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionNotFound {
|
||||
public fun <init> (Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/exceptions/TypeConverterNotFoundException;)V
|
||||
public final fun component1 ()Ljava/lang/String;
|
||||
public final fun component2 ()Lorg/jetbrains/kotlinx/dataframe/exceptions/TypeConverterNotFoundException;
|
||||
public final fun copy (Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/exceptions/TypeConverterNotFoundException;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionNotFound$ConversionNotFoundIgnored;
|
||||
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionNotFound$ConversionNotFoundIgnored;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/exceptions/TypeConverterNotFoundException;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$TypeConversionNotFound$ConversionNotFoundIgnored;
|
||||
public fun equals (Ljava/lang/Object;)Z
|
||||
public synthetic fun getCause ()Ljava/lang/Exception;
|
||||
public fun getCause ()Lorg/jetbrains/kotlinx/dataframe/exceptions/TypeConverterNotFoundException;
|
||||
public fun getColumn ()Ljava/lang/String;
|
||||
public fun hashCode ()I
|
||||
public fun toString ()Ljava/lang/String;
|
||||
}
|
||||
|
||||
public abstract class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$WideningMismatch : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch {
|
||||
public synthetic fun <init> (Ljava/lang/String;Lkotlin/jvm/internal/DefaultConstructorMarker;)V
|
||||
}
|
||||
|
||||
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$WideningMismatch$AddedColumn : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$WideningMismatch {
|
||||
public fun <init> (Ljava/lang/String;)V
|
||||
public final fun component1 ()Ljava/lang/String;
|
||||
public final fun copy (Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$WideningMismatch$AddedColumn;
|
||||
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$WideningMismatch$AddedColumn;Ljava/lang/String;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$WideningMismatch$AddedColumn;
|
||||
public fun equals (Ljava/lang/Object;)Z
|
||||
public fun getColumn ()Ljava/lang/String;
|
||||
public fun hashCode ()I
|
||||
public fun toString ()Ljava/lang/String;
|
||||
}
|
||||
|
||||
public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$WideningMismatch$RejectedColumn : org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$WideningMismatch {
|
||||
public fun <init> (Ljava/lang/String;)V
|
||||
public final fun component1 ()Ljava/lang/String;
|
||||
public final fun copy (Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$WideningMismatch$RejectedColumn;
|
||||
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$WideningMismatch$RejectedColumn;Ljava/lang/String;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$WideningMismatch$RejectedColumn;
|
||||
public fun equals (Ljava/lang/Object;)Z
|
||||
public fun getColumn ()Ljava/lang/String;
|
||||
public fun hashCode ()I
|
||||
public fun toString ()Ljava/lang/String;
|
||||
}
|
||||
|
||||
+44
@@ -0,0 +1,44 @@
|
||||
plugins {
    // Convention plugin: Kotlin/JVM module targeting Java 8 bytecode.
    with(convention.plugins) {
        alias(kotlinJvm8)
    }
    // Published artifact: Maven publishing + public-API (.api dump) compatibility checking.
    with(libs.plugins) {
        alias(publisher)
        alias(binary.compatibility.validator)
    }
}
|
||||
|
||||
group = "org.jetbrains.kotlinx"
|
||||
|
||||
dependencies {
    // DataFrame core types are part of this module's public API surface.
    api(projects.core)

    // Apache Arrow runtime: vectors, IPC format metadata, memory management, dataset scanning.
    implementation(libs.arrow.vector)
    implementation(libs.arrow.format)
    implementation(libs.arrow.memory)
    implementation(libs.arrow.dataset)
    implementation(libs.commonsCompress)
    implementation(libs.kotlin.reflect)
    implementation(libs.kotlin.datetimeJvm)

    testImplementation(libs.junit)
    testImplementation(projects.dataframeJson)
    testImplementation(libs.kotestAssertions) {
        // Kotest drags in its own kotlin-stdlib-jdk8; rely on the project's Kotlin stdlib instead.
        exclude("org.jetbrains.kotlin", "kotlin-stdlib-jdk8")
    }
    // Arrow C Data Interface and DuckDB JDBC are exercised by interop tests only.
    testImplementation(libs.arrow.c.data)
    testImplementation(libs.duckdb.jdbc)
}
|
||||
|
||||
// Maven publication metadata for the `dataframe-arrow` artifact.
kotlinPublications {
    publication {
        publicationName = "dataframeArrow"
        artifactId = project.name
        description = "Apache Arrow support for Kotlin DataFrame"
        packageName = artifactId
    }
}
|
||||
|
||||
// Arrow's memory layer reaches into java.nio internals; open that package to the
// unnamed module so tests run on JDKs with strong module encapsulation (JDK 16+).
tasks.test {
    jvmArgs = listOf("--add-opens", "java.base/java.nio=ALL-UNNAMED")
}
|
||||
Vendored
+174
@@ -0,0 +1,174 @@
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import org.apache.arrow.vector.VectorSchemaRoot
|
||||
import org.apache.arrow.vector.ipc.ArrowFileWriter
|
||||
import org.apache.arrow.vector.ipc.ArrowStreamWriter
|
||||
import org.apache.arrow.vector.types.pojo.Schema
|
||||
import org.jetbrains.kotlinx.dataframe.AnyFrame
|
||||
import org.jetbrains.kotlinx.dataframe.DataFrame
|
||||
import org.slf4j.LoggerFactory
|
||||
import java.io.ByteArrayOutputStream
|
||||
import java.io.File
|
||||
import java.io.OutputStream
|
||||
import java.nio.channels.Channels
|
||||
import java.nio.channels.WritableByteChannel
|
||||
import java.nio.file.Path
|
||||
import java.nio.file.StandardOpenOption
|
||||
import kotlin.io.path.outputStream
|
||||
|
||||
/** A [ConvertingMismatch] subscriber that silently discards every mismatch (the default). */
public val ignoreMismatchMessage: (ConvertingMismatch) -> Unit = { _ -> }
|
||||
/** A [ConvertingMismatch] subscriber that prints each mismatch to the standard error stream. */
public val writeMismatchMessage: (ConvertingMismatch) -> Unit = { mismatch ->
    System.err.println(mismatch)
}
|
||||
|
||||
// File-private SLF4J logger (keyed to ArrowWriter) used by the logMismatchMessage subscriber below.
private val logger = LoggerFactory.getLogger(ArrowWriter::class.java)
|
||||
|
||||
/** A [ConvertingMismatch] subscriber that records each mismatch at DEBUG level via SLF4J. */
public val logMismatchMessage: (ConvertingMismatch) -> Unit = { mismatch ->
    logger.debug(mismatch.toString())
}
|
||||
|
||||
/**
 * Writes [dataFrame] content out in Apache Arrow format (to a File, Path, ByteArray, OutputStream
 * or raw Channel) following [targetSchema].
 * Content that does not fit [targetSchema] is handled according to [mode];
 * every mismatch found along the way is reported to [mismatchSubscriber].
 */
public interface ArrowWriter : AutoCloseable {
    /** The data being saved. */
    public val dataFrame: DataFrame<*>

    /** The Arrow schema the data is converted to. */
    public val targetSchema: Schema

    /** How mismatches between [dataFrame] and [targetSchema] are resolved. */
    public val mode: Mode

    /** Callback receiving every [ConvertingMismatch] encountered while saving. */
    public val mismatchSubscriber: (ConvertingMismatch) -> Unit

    /**
     * If [restrictWidening] is true, [dataFrame] columns not described in [targetSchema] would not be saved (otherwise, would be saved as is).
     * If [restrictNarrowing] is true, [targetSchema] fields that are not nullable and do not exist in [dataFrame] will produce exception (otherwise, would not be saved).
     * If [strictType] is true, [dataFrame] columns described in [targetSchema] with non-compatible type will produce exception (otherwise, would be saved as is).
     * If [strictNullable] is true, [targetSchema] fields that are not nullable and contain nulls in [dataFrame] will produce exception (otherwise, would be saved as is with nullable = true).
     */
    public data class Mode(
        public val restrictWidening: Boolean,
        public val restrictNarrowing: Boolean,
        public val strictType: Boolean,
        public val strictNullable: Boolean,
    ) {
        public companion object {
            /** Every restriction enabled: any mismatch aborts the save. */
            public val STRICT: Mode = Mode(true, true, true, true)

            /** Every restriction disabled: mismatches are worked around and only reported. */
            public val LOYAL: Mode = Mode(false, false, false, false)
        }
    }

    /**
     * Create an Arrow [VectorSchemaRoot] with [dataFrame] content cast to [targetSchema] according to the [mode].
     */
    public fun allocateVectorSchemaRoot(): VectorSchemaRoot

    // ---------- IPC (interprocess streaming) format ----------

    /**
     * Save data in [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format)
     * to the already opened [channel].
     */
    public fun writeArrowIPC(channel: WritableByteChannel) {
        allocateVectorSchemaRoot().use { root ->
            ArrowStreamWriter(root, null, channel).use { it.writeBatch() }
        }
    }

    /**
     * Save data in [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format)
     * to the already opened [stream].
     */
    public fun writeArrowIPC(stream: OutputStream) {
        writeArrowIPC(Channels.newChannel(stream))
    }

    /**
     * Save data in [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format)
     * to a new or existing [file].
     * An existing file is expanded when [append] is true, recreated otherwise.
     */
    public fun writeArrowIPC(file: File, append: Boolean = true) {
        writeArrowIPC(file.toPath(), append)
    }

    /**
     * Save data in [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format)
     * to a new or existing file at [path].
     * An existing file is expanded when [append] is true, recreated otherwise.
     */
    public fun writeArrowIPC(path: Path, append: Boolean = true) {
        val openOptions = if (append) {
            arrayOf(StandardOpenOption.CREATE, StandardOpenOption.APPEND)
        } else {
            arrayOf(StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)
        }
        path.outputStream(*openOptions).use { out -> writeArrowIPC(out) }
    }

    /**
     * Save data in [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format)
     * into a freshly allocated [ByteArray].
     */
    public fun saveArrowIPCToByteArray(): ByteArray =
        ByteArrayOutputStream().also { writeArrowIPC(it) }.toByteArray()

    // ---------- Feather (random access) format ----------

    /**
     * Save data in [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files)
     * to the already opened [channel].
     */
    public fun writeArrowFeather(channel: WritableByteChannel) {
        allocateVectorSchemaRoot().use { root ->
            ArrowFileWriter(root, null, channel).use { it.writeBatch() }
        }
    }

    /**
     * Save data in [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files)
     * to the already opened [stream].
     */
    public fun writeArrowFeather(stream: OutputStream) {
        writeArrowFeather(Channels.newChannel(stream))
    }

    /**
     * Save data in [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files)
     * to [file]; an existing file is recreated.
     */
    public fun writeArrowFeather(file: File) {
        writeArrowFeather(file.toPath())
    }

    /**
     * Save data in [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files)
     * to the file at [path]; an existing file is recreated.
     */
    public fun writeArrowFeather(path: Path) {
        path.outputStream(StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)
            .use { out -> writeArrowFeather(out) }
    }

    /**
     * Save data in [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files)
     * into a freshly allocated [ByteArray].
     */
    public fun saveArrowFeatherToByteArray(): ByteArray =
        ByteArrayOutputStream().also { writeArrowFeather(it) }.toByteArray()

    public companion object {
        /** Create an [ArrowWriter] for [dataFrame] with the given [targetSchema], [mode] and [mismatchSubscriber]. */
        public fun create(
            dataFrame: AnyFrame,
            targetSchema: Schema,
            mode: Mode,
            mismatchSubscriber: (ConvertingMismatch) -> Unit = ignoreMismatchMessage,
        ): ArrowWriter = ArrowWriterImpl(dataFrame, targetSchema, mode, mismatchSubscriber)
    }
}
|
||||
Vendored
+447
@@ -0,0 +1,447 @@
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import kotlinx.datetime.TimeZone
|
||||
import kotlinx.datetime.toInstant
|
||||
import org.apache.arrow.memory.RootAllocator
|
||||
import org.apache.arrow.vector.BaseFixedWidthVector
|
||||
import org.apache.arrow.vector.BaseVariableWidthVector
|
||||
import org.apache.arrow.vector.BigIntVector
|
||||
import org.apache.arrow.vector.BitVector
|
||||
import org.apache.arrow.vector.DateDayVector
|
||||
import org.apache.arrow.vector.DateMilliVector
|
||||
import org.apache.arrow.vector.Decimal256Vector
|
||||
import org.apache.arrow.vector.DecimalVector
|
||||
import org.apache.arrow.vector.FieldVector
|
||||
import org.apache.arrow.vector.FixedWidthVector
|
||||
import org.apache.arrow.vector.Float4Vector
|
||||
import org.apache.arrow.vector.Float8Vector
|
||||
import org.apache.arrow.vector.IntVector
|
||||
import org.apache.arrow.vector.LargeVarCharVector
|
||||
import org.apache.arrow.vector.SmallIntVector
|
||||
import org.apache.arrow.vector.TimeMicroVector
|
||||
import org.apache.arrow.vector.TimeMilliVector
|
||||
import org.apache.arrow.vector.TimeNanoVector
|
||||
import org.apache.arrow.vector.TimeSecVector
|
||||
import org.apache.arrow.vector.TinyIntVector
|
||||
import org.apache.arrow.vector.VarCharVector
|
||||
import org.apache.arrow.vector.VariableWidthVector
|
||||
import org.apache.arrow.vector.VectorSchemaRoot
|
||||
import org.apache.arrow.vector.complex.StructVector
|
||||
import org.apache.arrow.vector.types.DateUnit
|
||||
import org.apache.arrow.vector.types.FloatingPointPrecision
|
||||
import org.apache.arrow.vector.types.pojo.ArrowType
|
||||
import org.apache.arrow.vector.types.pojo.Field
|
||||
import org.apache.arrow.vector.types.pojo.FieldType
|
||||
import org.apache.arrow.vector.types.pojo.Schema
|
||||
import org.apache.arrow.vector.util.Text
|
||||
import org.jetbrains.kotlinx.dataframe.AnyCol
|
||||
import org.jetbrains.kotlinx.dataframe.DataFrame
|
||||
import org.jetbrains.kotlinx.dataframe.api.convertToBigDecimal
|
||||
import org.jetbrains.kotlinx.dataframe.api.convertToBoolean
|
||||
import org.jetbrains.kotlinx.dataframe.api.convertToByte
|
||||
import org.jetbrains.kotlinx.dataframe.api.convertToDouble
|
||||
import org.jetbrains.kotlinx.dataframe.api.convertToFloat
|
||||
import org.jetbrains.kotlinx.dataframe.api.convertToInt
|
||||
import org.jetbrains.kotlinx.dataframe.api.convertToLocalDate
|
||||
import org.jetbrains.kotlinx.dataframe.api.convertToLocalDateTime
|
||||
import org.jetbrains.kotlinx.dataframe.api.convertToLocalTime
|
||||
import org.jetbrains.kotlinx.dataframe.api.convertToLong
|
||||
import org.jetbrains.kotlinx.dataframe.api.convertToShort
|
||||
import org.jetbrains.kotlinx.dataframe.api.convertToString
|
||||
import org.jetbrains.kotlinx.dataframe.api.forEachIndexed
|
||||
import org.jetbrains.kotlinx.dataframe.api.map
|
||||
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
|
||||
import org.jetbrains.kotlinx.dataframe.exceptions.CellConversionException
|
||||
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConverterNotFoundException
|
||||
import org.jetbrains.kotlinx.dataframe.indices
|
||||
import org.jetbrains.kotlinx.dataframe.name
|
||||
import org.jetbrains.kotlinx.dataframe.values
|
||||
import kotlin.reflect.full.isSubtypeOf
|
||||
import kotlin.reflect.typeOf
|
||||
|
||||
/**
 * Default [ArrowWriter] implementation: converts [dataFrame] content to Arrow vectors following
 * [targetSchema], resolving mismatches according to [mode] and reporting each one to [mismatchSubscriber].
 */
internal class ArrowWriterImpl(
    override val dataFrame: DataFrame<*>,
    override val targetSchema: Schema,
    override val mode: ArrowWriter.Mode,
    override val mismatchSubscriber: (ConvertingMismatch) -> Unit = ignoreMismatchMessage,
) : ArrowWriter {

    // Owns every Arrow buffer this writer allocates; released in close().
    private val allocator = RootAllocator()
|
||||
|
||||
/**
 * Allocate buffers in [vector] for [size] values.
 * [totalBytes] sizes the data buffer of variable-width vectors and is ignored otherwise;
 * struct children are allocated recursively.
 *
 * @throws IllegalArgumentException for vector kinds that cannot be allocated here.
 */
private fun allocateVector(vector: FieldVector, size: Int, totalBytes: Long? = null) {
    when (vector) {
        is FixedWidthVector -> vector.allocateNew(size)

        is VariableWidthVector ->
            if (totalBytes != null) {
                vector.allocateNew(totalBytes, size)
            } else {
                vector.allocateNew(size)
            }

        is StructVector -> vector.childrenFromFields.forEach { allocateVector(it, size) }

        else -> throw IllegalArgumentException("Can not allocate ${vector.javaClass.canonicalName}")
    }
}
|
||||
|
||||
/**
 * Estimate the data-buffer size in bytes needed by a VariableWidthVector holding [column],
 * counting 4 bytes per character (UTF-8 worst case).
 * Returns null for content that goes into a FixedWidthVector.
 */
private fun countTotalBytes(column: AnyCol): Long? =
    if (column.type().isSubtypeOf(typeOf<String?>())) {
        column.values.sumOf { (it.toString().length * 4).toLong() }
    } else {
        null
    }
|
||||
|
||||
/**
 * Mark all [size] slots of [vector] as null and set its value count accordingly.
 *
 * @throws IllegalArgumentException for vector kinds that cannot be null-filled here.
 */
private fun infillWithNulls(vector: FieldVector, size: Int) {
    when (vector) {
        is BaseFixedWidthVector -> repeat(size) { slot -> vector.setNull(slot) }

        is BaseVariableWidthVector -> repeat(size) { slot -> vector.setNull(slot) }

        else -> throw IllegalArgumentException("Can not infill ${vector.javaClass.canonicalName}")
    }
    vector.valueCount = size
}
|
||||
|
||||
/**
 * Convert [column] to the Kotlin type that corresponds to [targetFieldType]
 * (e.g. Utf8 -> String, Int(32, true) -> Int). Struct columns pass through unchanged.
 * Returns null only when [column] is null.
 *
 * @throws NotImplementedError for Arrow types that have no saving support yet.
 */
private fun convertColumnToTarget(column: AnyCol?, targetFieldType: ArrowType): AnyCol? {
    if (column == null) return null
    // Branches match by ArrowType equality against freshly constructed instances.
    return when (targetFieldType) {
        ArrowType.Utf8() -> column.map { it?.toString() }

        ArrowType.LargeUtf8() -> column.map { it?.toString() }

        ArrowType.Bool() -> column.convertToBoolean()

        ArrowType.Int(8, true) -> column.convertToByte()

        ArrowType.Int(16, true) -> column.convertToShort()

        ArrowType.Int(32, true) -> column.convertToInt()

        ArrowType.Int(64, true) -> column.convertToLong()

        is ArrowType.Decimal -> column.convertToBigDecimal()

        // Use [convertToDouble] as locale logic step
        ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE) ->
            column.convertToDouble().convertToFloat()

        ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE) -> column.convertToDouble()

        ArrowType.Date(DateUnit.DAY) -> column.convertToLocalDate()

        ArrowType.Date(DateUnit.MILLISECOND) -> column.convertToLocalDateTime()

        is ArrowType.Time -> column.convertToLocalTime()

        // Struct content is handled child-by-child later in infillVector
        is ArrowType.Struct -> column

        else ->
            throw NotImplementedError(
                "Saving ${targetFieldType.javaClass.canonicalName} is currently not implemented",
            )
    }
}
|
||||
|
||||
/**
 * Best-effort conversion of [column] to the Arrow type inferred from the column itself.
 * Returns the (possibly converted) column together with the inferred Arrow [Field];
 * on any conversion failure the original column is returned unchanged.
 */
private fun convertColumnToCompatible(column: AnyCol): Pair<AnyCol, Field> {
    val actualField = column.toArrowField(mismatchSubscriber)
    val result = try {
        // column is non-null here, so convertColumnToTarget cannot return null
        convertColumnToTarget(column, actualField.type)!!
    } catch (e: Exception) {
        // Deliberate best-effort: any failure falls back to the unconverted column
        column
    }
    return result to actualField
}
|
||||
|
||||
/**
 * Copy [column] values into [vector], converting the column to the element type the vector
 * stores; null values become null slots. Finally sets the vector's value count to the
 * DataFrame row count.
 *
 * @throws NotImplementedError for vector types that have no saving support yet.
 */
private fun infillVector(vector: FieldVector, column: AnyCol) {
    when (vector) {
        // Text vectors store values as Arrow Text (UTF-8)
        is VarCharVector ->
            column.convertToString()
                .forEachIndexed { i, value ->
                    value?.also { vector.set(i, Text(value)) }
                        ?: vector.setNull(i)
                }

        is LargeVarCharVector ->
            column.convertToString()
                .forEachIndexed { i, value ->
                    value?.also { vector.set(i, Text(value)) }
                        ?: vector.setNull(i)
                }

        // BitVector.set takes an int flag; compareTo(false) maps true -> 1, false -> 0
        is BitVector ->
            column.convertToBoolean()
                .forEachIndexed { i, value ->
                    value?.also { vector.set(i, value.compareTo(false)) }
                        ?: vector.setNull(i)
                }

        // Small integer vectors accept Int values directly
        is TinyIntVector ->
            column.convertToInt()
                .forEachIndexed { i, value ->
                    value?.also { vector.set(i, value) }
                        ?: vector.setNull(i)
                }

        is SmallIntVector ->
            column.convertToInt()
                .forEachIndexed { i, value ->
                    value?.also { vector.set(i, value) }
                        ?: vector.setNull(i)
                }

        is IntVector ->
            column.convertToInt()
                .forEachIndexed { i, value ->
                    value?.also { vector.set(i, value) }
                        ?: vector.setNull(i)
                }

        is BigIntVector ->
            column.convertToLong()
                .forEachIndexed { i, value ->
                    value?.also { vector.set(i, value) }
                        ?: vector.setNull(i)
                }

        is DecimalVector ->
            column.convertToBigDecimal()
                .forEachIndexed { i, value ->
                    value?.also { vector.set(i, value) }
                        ?: vector.setNull(i)
                }

        is Decimal256Vector ->
            column.convertToBigDecimal()
                .forEachIndexed { i, value ->
                    value?.also { vector.set(i, value) }
                        ?: vector.setNull(i)
                }

        is Float8Vector ->
            column.convertToDouble()
                .forEachIndexed { i, value ->
                    value?.also { vector.set(i, value) }
                        ?: vector.setNull(i)
                }

        is Float4Vector ->
            column.convertToFloat()
                .forEachIndexed { i, value ->
                    value?.also { vector.set(i, value) }
                        ?: vector.setNull(i)
                }

        // Dates stored as whole days since the Unix epoch
        is DateDayVector ->
            column.convertToLocalDate()
                .forEachIndexed { i, value ->
                    value?.also { vector.set(i, value.toEpochDays().toInt()) }
                        ?: vector.setNull(i)
                }

        // LocalDateTime is interpreted as UTC and stored as epoch milliseconds
        is DateMilliVector ->
            column.convertToLocalDateTime()
                .forEachIndexed { i, value ->
                    value?.also { vector.set(i, value.toInstant(TimeZone.UTC).toEpochMilliseconds()) }
                        ?: vector.setNull(i)
                }

        // Time-of-day vectors: scale nanoseconds-of-day down to the vector's unit
        is TimeNanoVector ->
            column.convertToLocalTime()
                .forEachIndexed { i, value ->
                    value?.also { vector.set(i, value.toNanosecondOfDay()) }
                        ?: vector.setNull(i)
                }

        is TimeMicroVector ->
            column.convertToLocalTime()
                .forEachIndexed { i, value ->
                    value?.also { vector.set(i, value.toNanosecondOfDay() / 1000) }
                        ?: vector.setNull(i)
                }

        is TimeMilliVector ->
            column.convertToLocalTime()
                .forEachIndexed { i, value ->
                    value?.also { vector.set(i, (value.toNanosecondOfDay() / 1000 / 1000).toInt()) }
                        ?: vector.setNull(i)
                }

        is TimeSecVector ->
            column.convertToLocalTime()
                .forEachIndexed { i, value ->
                    value?.also {
                        vector.set(i, (value.toNanosecondOfDay() / 1000 / 1000 / 1000).toInt())
                    } ?: vector.setNull(i)
                }

        // Struct vectors map to DataFrame column groups; each child column is filled recursively
        is StructVector -> {
            require(column is ColumnGroup<*>) {
                "StructVector expects ColumnGroup, but got ${column::class.simpleName}"
            }

            column.columns().forEach { childColumn ->
                infillVector(vector.getChild(childColumn.name()), childColumn)
            }

            // Mark every row of the struct itself as defined (non-null)
            column.indices.forEach { i -> vector.setIndexDefined(i) }
        }

        else -> {
            // TODO implement other vector types from [readField] (VarBinaryVector, UIntVector, DurationVector, StructVector) and may be others (ListVector, FixedSizeListVector etc)
            throw NotImplementedError("Saving to ${vector.javaClass.canonicalName} is currently not implemented")
        }
    }

    vector.valueCount = dataFrame.rowsCount()
}
|
||||
|
||||
/**
 * Create an Arrow [FieldVector] holding [column] content cast to the type of [field].
 * [strictType] decides whether a type mismatch aborts the save (throwing [ConvertingException])
 * or is reported and worked around; [strictNullable] does the same for nulls in a
 * non-nullable field. Every mismatch is also sent to [mismatchSubscriber].
 */
private fun allocateVectorAndInfill(
    field: Field,
    column: AnyCol?,
    strictType: Boolean,
    strictNullable: Boolean,
): FieldVector {
    val containNulls = (column == null || column.hasNulls())
    // Convert the column to type specified in field. (If we already have target type, convertTo will do nothing)

    val (convertedColumn, actualField) = try {
        convertColumnToTarget(column, field.type) to field
    } catch (e: CellConversionException) {
        if (strictType) {
            // If conversion failed but strictType is enabled, throw the exception
            val mismatch =
                ConvertingMismatch.TypeConversionFail.ConversionFailError(e.column?.name() ?: "", e.row, e)
            mismatchSubscriber(mismatch)
            throw ConvertingException(mismatch)
        } else {
            // If strictType is not enabled, use original data with its type. Target nullable is saved at this step.
            mismatchSubscriber(
                ConvertingMismatch.TypeConversionFail.ConversionFailIgnored(
                    column = e.column?.name() ?: "",
                    row = e.row,
                    cause = e,
                ),
            )
            // A CellConversionException implies a real column, so column is non-null here
            convertColumnToCompatible(column!!)
        }
    } catch (e: TypeConverterNotFoundException) {
        if (strictType) {
            // If conversion failed but strictType is enabled, throw the exception
            val mismatch = ConvertingMismatch.TypeConversionNotFound.ConversionNotFoundError(field.name, e)
            mismatchSubscriber(mismatch)
            throw ConvertingException(mismatch)
        } else {
            // If strictType is not enabled, use original data with its type. Target nullable is saved at this step.
            mismatchSubscriber(ConvertingMismatch.TypeConversionNotFound.ConversionNotFoundIgnored(field.name, e))
            convertColumnToCompatible(column!!)
        }
    }

    // Non-nullable target field combined with actual nulls needs special handling
    val vector = if (!actualField.isNullable && containNulls) {
        // Locate the first null row for the mismatch report (loop is empty when column == null)
        var firstNullValue: Int? = null
        for (i in 0 until (column?.size() ?: -1)) {
            if (column!![i] == null) {
                firstNullValue = i
                break
            }
        }
        if (strictNullable) {
            val mismatch = ConvertingMismatch.NullableMismatch.NullValueError(actualField.name, firstNullValue)
            mismatchSubscriber(mismatch)
            throw ConvertingException(mismatch)
        } else {
            mismatchSubscriber(
                ConvertingMismatch.NullableMismatch.NullValueIgnored(
                    actualField.name,
                    firstNullValue,
                ),
            )
            // Rebuild the field as nullable so the nulls can actually be stored
            Field(
                actualField.name,
                FieldType(true, actualField.fieldType.type, actualField.fieldType.dictionary),
                actualField.children,
            ).createVector(allocator)!!
        }
    } else {
        actualField.createVector(allocator)!!
    }

    if (convertedColumn == null) {
        // Missing column: only legal for a nullable field; fill the whole vector with nulls
        check(actualField.isNullable)
        allocateVector(vector, dataFrame.rowsCount())
        infillWithNulls(vector, dataFrame.rowsCount())
    } else {
        allocateVector(vector, dataFrame.rowsCount(), countTotalBytes(convertedColumn))
        infillVector(vector, convertedColumn)
    }
    return vector
}
|
||||
|
||||
/** Build one Arrow vector per column, inferring each field from the column itself (fully strict). */
private fun List<AnyCol>.toVectors(): List<FieldVector> =
    map { column ->
        allocateVectorAndInfill(
            field = column.toArrowField(mismatchSubscriber),
            column = column,
            strictType = true,
            strictNullable = true,
        )
    }
|
||||
|
||||
/**
 * Build a [VectorSchemaRoot] with the DataFrame content cast to [targetSchema] according to [mode].
 * Schema fields missing from the data fail or are skipped per [mode.restrictNarrowing];
 * extra DataFrame columns are appended after the schema fields or rejected per [mode.restrictWidening].
 */
override fun allocateVectorSchemaRoot(): VectorSchemaRoot {
    // Insertion-ordered so the resulting root follows the target schema's field order
    val mainVectors = LinkedHashMap<String, FieldVector>()
    try {
        for (field in targetSchema.fields) {
            val column = dataFrame.getColumnOrNull(field.name)
            if (column == null && !field.isNullable) {
                if (mode.restrictNarrowing) {
                    val mismatch = ConvertingMismatch.NarrowingMismatch.NotPresentedColumnError(field.name)
                    mismatchSubscriber(mismatch)
                    throw ConvertingException(mismatch)
                } else {
                    mismatchSubscriber(ConvertingMismatch.NarrowingMismatch.NotPresentedColumnIgnored(field.name))
                    continue
                }
            }

            val vector = allocateVectorAndInfill(field, column, mode.strictType, mode.strictNullable)
            mainVectors[field.name] = vector
        }
    } catch (e: Exception) {
        mainVectors.values.forEach { it.close() } // Clear buffers before throwing exception
        throw e
    }
    val vectors = ArrayList<FieldVector>()
    vectors.addAll(mainVectors.values)
    // DataFrame columns that the target schema does not describe
    val otherColumns = dataFrame.columns().filter { column -> !mainVectors.containsKey(column.name()) }
    if (!mode.restrictWidening) {
        vectors.addAll(otherColumns.toVectors())
        otherColumns.forEach {
            mismatchSubscriber(ConvertingMismatch.WideningMismatch.AddedColumn(it.name))
        }
    } else {
        otherColumns.forEach {
            mismatchSubscriber(ConvertingMismatch.WideningMismatch.RejectedColumn(it.name))
        }
    }
    return VectorSchemaRoot(vectors)
}
|
||||
|
||||
/** Release every Arrow buffer owned by this writer's allocator. */
override fun close() = allocator.close()
|
||||
}
|
||||
Vendored
+101
@@ -0,0 +1,101 @@
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import org.jetbrains.kotlinx.dataframe.exceptions.CellConversionException
|
||||
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConverterNotFoundException
|
||||
|
||||
/**
|
||||
* Detailed message about any mismatch when saving to Arrow format with user-defined schema that does not match with actual data.
|
||||
* Can be sent to callback, written to log or encapsulated to exception
|
||||
*/
|
||||
public sealed class ConvertingMismatch(
|
||||
/** Name of the column with mismatch */
|
||||
public open val column: String,
|
||||
/** Number of first row with mismatch (0-based) if defined */
|
||||
public open val row: Int?,
|
||||
/** Original exception if exist */
|
||||
public open val cause: Exception?,
|
||||
) {
|
||||
|
||||
public sealed class WideningMismatch(column: String) : ConvertingMismatch(column, null, null) {
|
||||
public data class AddedColumn(override val column: String) : WideningMismatch(column) {
|
||||
override fun toString(): String = "Added column \"$column\" not described in target schema"
|
||||
}
|
||||
|
||||
public data class RejectedColumn(override val column: String) : WideningMismatch(column) {
|
||||
override fun toString(): String = "Column \"$column\" is not described in target schema and was ignored"
|
||||
}
|
||||
}
|
||||
|
||||
public sealed class NarrowingMismatch(column: String) : ConvertingMismatch(column, null, null) {
|
||||
public data class NotPresentedColumnIgnored(override val column: String) : NarrowingMismatch(column) {
|
||||
override fun toString(): String =
|
||||
"Not nullable column \"$column\" is not presented in actual data, saving as is"
|
||||
}
|
||||
|
||||
public data class NotPresentedColumnError(override val column: String) : NarrowingMismatch(column) {
|
||||
override fun toString(): String =
|
||||
"Not nullable column \"$column\" is not presented in actual data, can not save"
|
||||
}
|
||||
}
|
||||
|
||||
public sealed class TypeConversionNotFound(column: String, cause: TypeConverterNotFoundException) :
|
||||
ConvertingMismatch(column, null, cause) {
|
||||
|
||||
public data class ConversionNotFoundIgnored(
|
||||
override val column: String,
|
||||
override val cause: TypeConverterNotFoundException,
|
||||
) : TypeConversionNotFound(column, cause) {
|
||||
override fun toString(): String = "${cause.message} for column \"$column\", saving as is"
|
||||
}
|
||||
|
||||
public data class ConversionNotFoundError(override val column: String, val e: TypeConverterNotFoundException) :
|
||||
TypeConversionNotFound(column, e) {
|
||||
override fun toString(): String = "${e.message} for column \"$column\", can not save"
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Mismatches raised when a converter exists but converting a concrete cell value failed;
 * [row] identifies the failing cell when known, [cause] carries the conversion failure.
 */
public sealed class TypeConversionFail(
    column: String,
    row: Int?,
    public override val cause: CellConversionException,
) : ConvertingMismatch(column, row, cause) {

    /** Cell conversion failed; the value is written unconverted. */
    public data class ConversionFailIgnored(
        override val column: String,
        override val row: Int?,
        override val cause: CellConversionException,
    ) : TypeConversionFail(column, row, cause) {
        override fun toString(): String = "${cause.message}, saving as is"
    }

    /** Cell conversion failed; saving cannot proceed. */
    public data class ConversionFailError(
        override val column: String,
        override val row: Int?,
        override val cause: CellConversionException,
    ) : TypeConversionFail(column, row, cause) {
        override fun toString(): String = "${cause.message}, can not save"
    }
}
|
||||
|
||||
/**
 * Mismatch recorded when a column's type has no direct target equivalent, so its
 * values will be written as their String representation instead.
 */
public data class SavedAsString(override val column: String, val type: Class<*>) :
    ConvertingMismatch(column, null, null) {
    // Fix: the message previously ended with a stray escaped quote (`...as String\"`),
    // producing an unbalanced quote in the user-facing text.
    override fun toString(): String = "Column \"$column\" has type ${type.canonicalName}, will be saved as String"
}
|
||||
|
||||
/**
 * Mismatches raised when a column declared non-nullable in the target schema actually
 * contains nulls; [row] is the offending row when known.
 */
public sealed class NullableMismatch(column: String, row: Int?) : ConvertingMismatch(column, row, null) {

    /** Nulls found in a non-nullable column; writing proceeds with the data as is. */
    public data class NullValueIgnored(override val column: String, override val row: Int?) :
        NullableMismatch(column, row) {
        override fun toString(): String =
            "Column \"$column\" contains nulls in row $row but expected not nullable, saving as is"
    }

    /** Nulls found in a non-nullable column; saving cannot proceed. */
    public data class NullValueError(override val column: String, override val row: Int?) :
        NullableMismatch(column, row) {
        override fun toString(): String =
            "Column \"$column\" contains nulls in row $row but expected not nullable, can not save"
    }
}
|
||||
}
|
||||
|
||||
/**
 * Exception wrapping a [ConvertingMismatch]; its message is the mismatch's [toString]
 * and its cause is the mismatch's (possibly null) underlying exception.
 */
public class ConvertingException(public val mismatchCase: ConvertingMismatch) :
    IllegalArgumentException(mismatchCase.toString(), mismatchCase.cause)
|
||||
Vendored
+269
@@ -0,0 +1,269 @@
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import org.apache.arrow.dataset.file.FileFormat
|
||||
import org.apache.arrow.memory.RootAllocator
|
||||
import org.apache.arrow.vector.ipc.ArrowReader
|
||||
import org.apache.commons.compress.utils.SeekableInMemoryByteChannel
|
||||
import org.jetbrains.kotlinx.dataframe.AnyFrame
|
||||
import org.jetbrains.kotlinx.dataframe.DataFrame
|
||||
import org.jetbrains.kotlinx.dataframe.api.NullabilityOptions
|
||||
import org.jetbrains.kotlinx.dataframe.codeGen.AbstractDefaultReadMethod
|
||||
import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod
|
||||
import java.io.File
|
||||
import java.io.InputStream
|
||||
import java.net.URI
|
||||
import java.net.URL
|
||||
import java.nio.channels.Channels
|
||||
import java.nio.channels.ReadableByteChannel
|
||||
import java.nio.channels.SeekableByteChannel
|
||||
import java.nio.file.Files
|
||||
import java.nio.file.Path
|
||||
|
||||
/**
 * [SupportedDataFrameFormat] implementation registering Arrow Feather (random access)
 * files with the DataFrame format-detection machinery, matched by the "feather" extension.
 */
public class ArrowFeather : SupportedDataFrameFormat {
    override fun readDataFrame(stream: InputStream, header: List<String>): AnyFrame =
        DataFrame.readArrowFeather(stream, NullabilityOptions.Widening)

    override fun readDataFrame(path: Path, header: List<String>): AnyFrame =
        DataFrame.readArrowFeather(path, NullabilityOptions.Widening)

    override fun acceptsExtension(ext: String): Boolean = ext == "feather"

    override fun acceptsSample(sample: SupportedFormatSample): Boolean = true // Extension is enough

    // NOTE(review): 50000 presumably orders this format late during content-based
    // auto-detection — confirm against SupportedDataFrameFormat's contract.
    override val testOrder: Int = 50000

    override fun createDefaultReadMethod(pathRepresentation: String?): DefaultReadDfMethod =
        DefaultReadArrowMethod(pathRepresentation)
}
|
||||
|
||||
// Name of the generated default read method for Feather files.
private const val READ_ARROW_FEATHER = "readArrowFeather"

// Default number of rows per batch when scanning via Arrow Dataset (e.g. Parquet).
internal const val ARROW_PARQUET_DEFAULT_BATCH_SIZE = 32768L

// Code-generation descriptor producing a `readArrowFeather(path)` call with no extra arguments.
private class DefaultReadArrowMethod(path: String?) :
    AbstractDefaultReadMethod(path, MethodArguments.EMPTY, READ_ARROW_FEATHER)

/** Shared, lazily initialized root allocator with an unbounded (Long.MAX_VALUE) limit. */
internal object Allocator {
    val ROOT by lazy {
        RootAllocator(Long.MAX_VALUE)
    }
}
|
||||
|
||||
/**
 * Read [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format) data from existing [channel]
 *
 * @param allocator memory allocator; defaults to the shared unbounded [Allocator.ROOT]
 * @param nullability how column nullability is resolved; defaults to [NullabilityOptions.Infer]
 */
public fun DataFrame.Companion.readArrowIPC(
    channel: ReadableByteChannel,
    allocator: RootAllocator = Allocator.ROOT,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = readArrowIPCImpl(channel, allocator, nullability)

/**
 * Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files) data from existing [channel]
 *
 * @param allocator memory allocator; defaults to the shared unbounded [Allocator.ROOT]
 * @param nullability how column nullability is resolved; defaults to [NullabilityOptions.Infer]
 */
public fun DataFrame.Companion.readArrowFeather(
    channel: SeekableByteChannel,
    allocator: RootAllocator = Allocator.ROOT,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = readArrowFeatherImpl(channel, allocator, nullability)
|
||||
|
||||
// IPC reading block

/**
 * Read [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format) data from existing [file]
 */
public fun DataFrame.Companion.readArrowIPC(
    file: File,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = readArrowIPC(file.toPath(), nullability)

/**
 * Read [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format)
 * data from existing file on the given [path].
 */
public fun DataFrame.Companion.readArrowIPC(
    path: Path,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = Files.newByteChannel(path).use { readArrowIPC(it, nullability = nullability) }

/**
 * Read [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format) data from existing [byteArray]
 */
public fun DataFrame.Companion.readArrowIPC(
    byteArray: ByteArray,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = SeekableInMemoryByteChannel(byteArray).use { readArrowIPC(it, nullability = nullability) }

/**
 * Read [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format) data from existing [stream]
 */
public fun DataFrame.Companion.readArrowIPC(
    stream: InputStream,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = Channels.newChannel(stream).use { readArrowIPC(it, nullability = nullability) }

/**
 * Read [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format) data from existing [url]
 *
 * @throws IllegalArgumentException when [url] is neither a local file nor a supported stream protocol
 */
public fun DataFrame.Companion.readArrowIPC(
    url: URL,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame =
    when {
        isFile(url) -> readArrowIPC(urlAsFile(url), nullability)

        isProtocolSupported(url) -> url.openStream().use { readArrowIPC(it, nullability) }

        else -> {
            throw IllegalArgumentException("Invalid protocol for url $url")
        }
    }

/**
 * Read [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format)
 * data from [path], treated as a URL when it looks like one and as a local file path otherwise.
 */
public fun DataFrame.Companion.readArrowIPC(
    path: String,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame =
    if (isUrl(path)) {
        readArrowIPC(URI(path).toURL(), nullability)
    } else {
        readArrowIPC(File(path), nullability)
    }
|
||||
|
||||
// Feather reading block

/**
 * Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files) data from existing [file]
 */
public fun DataFrame.Companion.readArrowFeather(
    file: File,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = readArrowFeather(file.toPath(), nullability)

/**
 * Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files)
 * data from an existing file on the given [path].
 */
public fun DataFrame.Companion.readArrowFeather(
    path: Path,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = Files.newByteChannel(path).use { readArrowFeather(it, nullability = nullability) }

/**
 * Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files) data from existing [byteArray]
 */
public fun DataFrame.Companion.readArrowFeather(
    byteArray: ByteArray,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = SeekableInMemoryByteChannel(byteArray).use { readArrowFeather(it, nullability = nullability) }

/**
 * Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files) data from existing [stream]
 *
 * The entire stream is buffered into a byte array first, because Feather reading
 * goes through a seekable channel.
 */
public fun DataFrame.Companion.readArrowFeather(
    stream: InputStream,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = readArrowFeather(stream.readBytes(), nullability)

/**
 * Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files) data from existing [url]
 *
 * @throws IllegalArgumentException when [url] is neither a local file nor a supported protocol
 */
public fun DataFrame.Companion.readArrowFeather(
    url: URL,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame =
    when {
        isFile(url) -> readArrowFeather(urlAsFile(url), nullability)

        isProtocolSupported(url) -> readArrowFeather(url.readBytes(), nullability)

        else -> {
            throw IllegalArgumentException("Invalid protocol for url $url")
        }
    }

/**
 * Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files) data from existing [path],
 * treated as a URL when it looks like one and as a local file path otherwise.
 */
public fun DataFrame.Companion.readArrowFeather(
    path: String,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame =
    if (isUrl(path)) {
        readArrowFeather(URI(path).toURL(), nullability)
    } else {
        readArrowFeather(File(path), nullability)
    }
|
||||
|
||||
/**
 * Read [Arrow any format](https://arrow.apache.org/docs/java/ipc.html#reading-writing-ipc-formats) data from existing [reader]
 */
public fun DataFrame.Companion.readArrow(
    reader: ArrowReader,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = readArrowImpl(reader, nullability)

/**
 * Read [Arrow any format](https://arrow.apache.org/docs/java/ipc.html#reading-writing-ipc-formats) data from existing [ArrowReader].
 * Extension-function form of [DataFrame.Companion.readArrow].
 */
public fun ArrowReader.toDataFrame(nullability: NullabilityOptions = NullabilityOptions.Infer): AnyFrame =
    DataFrame.Companion.readArrowImpl(this, nullability)
|
||||
|
||||
/**
 * Read [Parquet](https://parquet.apache.org/) data from existing [urls] by using [Arrow Dataset](https://arrow.apache.org/docs/java/dataset.html)
 *
 * @param batchSize rows read per scanned batch; defaults to [ARROW_PARQUET_DEFAULT_BATCH_SIZE]
 */
public fun DataFrame.Companion.readParquet(
    vararg urls: URL,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
    batchSize: Long = ARROW_PARQUET_DEFAULT_BATCH_SIZE,
): AnyFrame =
    readArrowDatasetImpl(
        // URLs are handed down as their string form and normalized later.
        urls.map {
            it.toString()
        }.toTypedArray(),
        FileFormat.PARQUET,
        nullability,
        batchSize,
    )

/**
 * Read [Parquet](https://parquet.apache.org/) data from existing [strUrls] by using [Arrow Dataset](https://arrow.apache.org/docs/java/dataset.html)
 *
 * @param batchSize rows read per scanned batch; defaults to [ARROW_PARQUET_DEFAULT_BATCH_SIZE]
 */
public fun DataFrame.Companion.readParquet(
    vararg strUrls: String,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
    batchSize: Long = ARROW_PARQUET_DEFAULT_BATCH_SIZE,
): AnyFrame = readArrowDatasetImpl(arrayOf(*strUrls), FileFormat.PARQUET, nullability, batchSize)

/**
 * Read [Parquet](https://parquet.apache.org/) data from existing [paths] by using [Arrow Dataset](https://arrow.apache.org/docs/java/dataset.html)
 *
 * @param batchSize rows read per scanned batch; defaults to [ARROW_PARQUET_DEFAULT_BATCH_SIZE]
 */
public fun DataFrame.Companion.readParquet(
    vararg paths: Path,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
    batchSize: Long = ARROW_PARQUET_DEFAULT_BATCH_SIZE,
): AnyFrame =
    readArrowDatasetImpl(
        // Paths become "file:" URIs, the form the dataset layer accepts.
        paths.map {
            it.toUri().toString()
        }.toTypedArray(),
        FileFormat.PARQUET,
        nullability,
        batchSize,
    )

/**
 * Read [Parquet](https://parquet.apache.org/) data from existing [files] by using [Arrow Dataset](https://arrow.apache.org/docs/java/dataset.html)
 *
 * @param batchSize rows read per scanned batch; defaults to [ARROW_PARQUET_DEFAULT_BATCH_SIZE]
 */
public fun DataFrame.Companion.readParquet(
    vararg files: File,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
    batchSize: Long = ARROW_PARQUET_DEFAULT_BATCH_SIZE,
): AnyFrame =
    readArrowDatasetImpl(
        // Files become "file:" URIs, the form the dataset layer accepts.
        files.map {
            it.toURI().toString()
        }.toTypedArray(),
        FileFormat.PARQUET,
        nullability,
        batchSize,
    )
|
||||
Vendored
+481
@@ -0,0 +1,481 @@
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import kotlinx.datetime.LocalDate
|
||||
import kotlinx.datetime.LocalDateTime
|
||||
import kotlinx.datetime.LocalTime
|
||||
import kotlinx.datetime.toKotlinLocalDate
|
||||
import kotlinx.datetime.toKotlinLocalDateTime
|
||||
import kotlinx.datetime.toKotlinLocalTime
|
||||
import org.apache.arrow.dataset.file.FileFormat
|
||||
import org.apache.arrow.dataset.file.FileSystemDatasetFactory
|
||||
import org.apache.arrow.dataset.jni.DirectReservationListener
|
||||
import org.apache.arrow.dataset.jni.NativeMemoryPool
|
||||
import org.apache.arrow.dataset.scanner.ScanOptions
|
||||
import org.apache.arrow.memory.RootAllocator
|
||||
import org.apache.arrow.vector.BigIntVector
|
||||
import org.apache.arrow.vector.BitVector
|
||||
import org.apache.arrow.vector.DateDayVector
|
||||
import org.apache.arrow.vector.DateMilliVector
|
||||
import org.apache.arrow.vector.Decimal256Vector
|
||||
import org.apache.arrow.vector.DecimalVector
|
||||
import org.apache.arrow.vector.DurationVector
|
||||
import org.apache.arrow.vector.FieldVector
|
||||
import org.apache.arrow.vector.Float4Vector
|
||||
import org.apache.arrow.vector.Float8Vector
|
||||
import org.apache.arrow.vector.IntVector
|
||||
import org.apache.arrow.vector.LargeVarBinaryVector
|
||||
import org.apache.arrow.vector.LargeVarCharVector
|
||||
import org.apache.arrow.vector.NullVector
|
||||
import org.apache.arrow.vector.SmallIntVector
|
||||
import org.apache.arrow.vector.TimeMicroVector
|
||||
import org.apache.arrow.vector.TimeMilliVector
|
||||
import org.apache.arrow.vector.TimeNanoVector
|
||||
import org.apache.arrow.vector.TimeSecVector
|
||||
import org.apache.arrow.vector.TimeStampMicroVector
|
||||
import org.apache.arrow.vector.TimeStampMilliVector
|
||||
import org.apache.arrow.vector.TimeStampNanoVector
|
||||
import org.apache.arrow.vector.TimeStampSecVector
|
||||
import org.apache.arrow.vector.TinyIntVector
|
||||
import org.apache.arrow.vector.UInt1Vector
|
||||
import org.apache.arrow.vector.UInt2Vector
|
||||
import org.apache.arrow.vector.UInt4Vector
|
||||
import org.apache.arrow.vector.UInt8Vector
|
||||
import org.apache.arrow.vector.VarBinaryVector
|
||||
import org.apache.arrow.vector.VarCharVector
|
||||
import org.apache.arrow.vector.VectorSchemaRoot
|
||||
import org.apache.arrow.vector.ViewVarBinaryVector
|
||||
import org.apache.arrow.vector.ViewVarCharVector
|
||||
import org.apache.arrow.vector.complex.StructVector
|
||||
import org.apache.arrow.vector.ipc.ArrowFileReader
|
||||
import org.apache.arrow.vector.ipc.ArrowReader
|
||||
import org.apache.arrow.vector.ipc.ArrowStreamReader
|
||||
import org.apache.arrow.vector.types.pojo.Field
|
||||
import org.apache.arrow.vector.util.DateUtility
|
||||
import org.jetbrains.kotlinx.dataframe.AnyBaseCol
|
||||
import org.jetbrains.kotlinx.dataframe.AnyFrame
|
||||
import org.jetbrains.kotlinx.dataframe.DataColumn
|
||||
import org.jetbrains.kotlinx.dataframe.DataFrame
|
||||
import org.jetbrains.kotlinx.dataframe.api.Infer
|
||||
import org.jetbrains.kotlinx.dataframe.api.NullabilityException
|
||||
import org.jetbrains.kotlinx.dataframe.api.NullabilityOptions
|
||||
import org.jetbrains.kotlinx.dataframe.api.applyNullability
|
||||
import org.jetbrains.kotlinx.dataframe.api.cast
|
||||
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
|
||||
import org.jetbrains.kotlinx.dataframe.api.emptyDataFrame
|
||||
import org.jetbrains.kotlinx.dataframe.api.getColumn
|
||||
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
|
||||
import org.jetbrains.kotlinx.dataframe.impl.asList
|
||||
import java.io.File
|
||||
import java.math.BigDecimal
|
||||
import java.math.BigInteger
|
||||
import java.net.URI
|
||||
import java.nio.channels.ReadableByteChannel
|
||||
import java.nio.channels.SeekableByteChannel
|
||||
import java.nio.file.Files
|
||||
import kotlin.reflect.KType
|
||||
import kotlin.reflect.full.withNullability
|
||||
import kotlin.reflect.typeOf
|
||||
import kotlin.time.Duration
|
||||
import kotlin.time.toKotlinDuration
|
||||
import java.time.LocalTime as JavaLocalTime
|
||||
|
||||
/**
 * Same as `Iterable<DataFrame<T>>.concat()` but without internal type guessing:
 * all batches are assumed to share one schema, and each result column reuses the
 * name and type of the corresponding column of the first frame.
 */
internal fun <T> Iterable<DataFrame<T>>.concatKeepingSchema(): DataFrame<T> {
    val dataFrames = asList()
    // Fast paths: nothing to concatenate.
    when (dataFrames.size) {
        0 -> return emptyDataFrame()
        1 -> return dataFrames[0]
    }

    // Column names (and, below, types) come from the first frame only.
    val columnNames = dataFrames.first().columnNames()

    val columns = columnNames.map { name ->
        val values = dataFrames.flatMap { it.getColumn(name).values() }
        DataColumn.createValueColumn(name, values, dataFrames.first().getColumn(name).type())
    }
    return dataFrameOf(columns).cast()
}
|
||||
|
||||
// Scalar extraction helpers: materialize an Arrow vector's slots over [range] as Kotlin
// values; null slots surface as null through getObject/getObjectNoOverflow.

private fun BitVector.values(range: IntRange): List<Boolean?> = range.map { getObject(it) }

// Unsigned integer vectors are widened to the next larger signed type ("NoOverflow" accessors).
private fun UInt1Vector.values(range: IntRange): List<Short?> = range.map { getObjectNoOverflow(it) }

// UInt2 is exposed by Arrow as Char; its integer code is stored instead.
private fun UInt2Vector.values(range: IntRange): List<Int?> = range.map { getObject(it)?.code }

private fun UInt4Vector.values(range: IntRange): List<Long?> = range.map { getObjectNoOverflow(it) }

private fun UInt8Vector.values(range: IntRange): List<BigInteger?> = range.map { getObjectNoOverflow(it) }

private fun TinyIntVector.values(range: IntRange): List<Byte?> = range.map { getObject(it) }

private fun SmallIntVector.values(range: IntRange): List<Short?> = range.map { getObject(it) }

private fun IntVector.values(range: IntRange): List<Int?> = range.map { getObject(it) }

private fun BigIntVector.values(range: IntRange): List<Long?> = range.map { getObject(it) }

private fun DecimalVector.values(range: IntRange): List<BigDecimal?> = range.map { getObject(it) }

private fun Decimal256Vector.values(range: IntRange): List<BigDecimal?> = range.map { getObject(it) }

private fun Float4Vector.values(range: IntRange): List<Float?> = range.map { getObject(it) }

private fun Float8Vector.values(range: IntRange): List<Double?> = range.map { getObject(it) }

// NOTE(review): unlike the other helpers there is no null check here, so a null slot
// would NPE inside toKotlinDuration() — confirm DurationVector slots can never be null,
// or guard with ?.let like DateMilliVector below.
private fun DurationVector.values(range: IntRange): List<Duration?> = range.map { getObject(it).toKotlinDuration() }
|
||||
|
||||
// Day-resolution dates: the stored day count is converted to epoch millis, then to LocalDate.
private fun DateDayVector.values(range: IntRange): List<LocalDate?> =
    range.map {
        if (getObject(it) == null) {
            null
        } else {
            DateUtility.getLocalDateTimeFromEpochMilli(getObject(it).toLong() * DateUtility.daysToStandardMillis)
                .toLocalDate()
                .toKotlinLocalDate()
        }
    }

private fun DateMilliVector.values(range: IntRange): List<LocalDateTime?> =
    range.map { getObject(it)?.toKotlinLocalDateTime() }

// Time-of-day vectors: get(...) returns a primitive, so null slots are detected via isNull.
// `i` is the position within [range] and `it` the vector index; the ranges built by
// readField always start at 0, so the two coincide.
private fun TimeNanoVector.values(range: IntRange): List<LocalTime?> =
    range.mapIndexed { i, it ->
        if (isNull(i)) {
            null
        } else {
            JavaLocalTime.ofNanoOfDay(get(it)).toKotlinLocalTime()
        }
    }

// Microseconds scaled to nanoseconds (x1000).
private fun TimeMicroVector.values(range: IntRange): List<LocalTime?> =
    range.mapIndexed { i, it ->
        if (isNull(i)) {
            null
        } else {
            JavaLocalTime.ofNanoOfDay(getObject(it) * 1000).toKotlinLocalTime()
        }
    }

// Milliseconds scaled to nanoseconds (x1_000_000).
private fun TimeMilliVector.values(range: IntRange): List<LocalTime?> =
    range.mapIndexed { i, it ->
        if (isNull(i)) {
            null
        } else {
            JavaLocalTime.ofNanoOfDay(get(it).toLong() * 1000_000).toKotlinLocalTime()
        }
    }

private fun TimeSecVector.values(range: IntRange): List<LocalTime?> =
    range.map { getObject(it)?.let { JavaLocalTime.ofSecondOfDay(it.toLong()).toKotlinLocalTime() } }
|
||||
|
||||
// Timestamp vectors: getObject yields a java.time.LocalDateTime; null slots are detected
// via isNull(i), where `i` coincides with the vector index because readField's ranges
// always start at 0.
private fun TimeStampNanoVector.values(range: IntRange): List<LocalDateTime?> =
    range.mapIndexed { i, it ->
        if (isNull(i)) {
            null
        } else {
            getObject(it).toKotlinLocalDateTime()
        }
    }

private fun TimeStampMicroVector.values(range: IntRange): List<LocalDateTime?> =
    range.mapIndexed { i, it ->
        if (isNull(i)) {
            null
        } else {
            getObject(it).toKotlinLocalDateTime()
        }
    }

private fun TimeStampMilliVector.values(range: IntRange): List<LocalDateTime?> =
    range.mapIndexed { i, it ->
        if (isNull(i)) {
            null
        } else {
            getObject(it).toKotlinLocalDateTime()
        }
    }

private fun TimeStampSecVector.values(range: IntRange): List<LocalDateTime?> =
    range.mapIndexed { i, it ->
        if (isNull(i)) {
            null
        } else {
            getObject(it).toKotlinLocalDateTime()
        }
    }
|
||||
|
||||
// Struct slots as name->value maps. (readField turns StructVector into a column group
// instead; this helper covers the generic extraction path.)
private fun StructVector.values(range: IntRange): List<Map<String, Any?>?> =
    range.map {
        getObject(it)
    }

// The all-null vector: every slot is null by definition.
private fun NullVector.values(range: IntRange): List<Nothing?> =
    range.map {
        getObject(it) as Nothing?
    }

// Variable-length text vectors: get(...) returns the raw bytes, decoded via String(...).
private fun VarCharVector.values(range: IntRange): List<String?> =
    range.map {
        if (isNull(it)) {
            null
        } else {
            String(get(it))
        }
    }

private fun LargeVarCharVector.values(range: IntRange): List<String?> =
    range.map {
        if (isNull(it)) {
            null
        } else {
            String(get(it))
        }
    }

private fun ViewVarCharVector.values(range: IntRange): List<String?> =
    range.map {
        if (isNull(it)) {
            null
        } else {
            String(get(it))
        }
    }

// Variable-length binary vectors: slots surface as raw ByteArrays.
private fun VarBinaryVector.values(range: IntRange): List<ByteArray?> =
    range.map {
        if (isNull(it)) {
            null
        } else {
            get(it)
        }
    }

private fun LargeVarBinaryVector.values(range: IntRange): List<ByteArray?> =
    range.map {
        if (isNull(it)) {
            null
        } else {
            get(it)
        }
    }

private fun ViewVarBinaryVector.values(range: IntRange): List<ByteArray?> =
    range.map {
        if (isNull(it)) {
            null
        } else {
            get(it)
        }
    }
|
||||
|
||||
/**
 * Build the `Nothing` / `Nothing?` [KType] used for columns with no non-null values.
 * `typeOf<Nothing>()` cannot be written directly, so the element type is pulled out
 * of a `List<Nothing>` / `List<Nothing?>` type instead.
 */
internal fun nothingType(nullable: Boolean): KType {
    val listOfNothing = when {
        nullable -> typeOf<List<Nothing?>>()
        else -> typeOf<List<Nothing>>()
    }
    return listOfNothing.arguments.first().type!!
}
|
||||
|
||||
/**
 * Pairs this value list with the element [KType], whose nullability is resolved by
 * [NullabilityOptions.applyNullability] from the field's declared [expectedNulls]
 * and the actual values (may throw [NullabilityException], caught by readField).
 */
private inline fun <reified T> List<T?>.withTypeNullable(
    expectedNulls: Boolean,
    nullabilityOptions: NullabilityOptions,
): Pair<List<T?>, KType> {
    val nullable = nullabilityOptions.applyNullability(this, expectedNulls)
    return this to typeOf<T>().withNullability(nullable)
}

// Overload for the Nothing element type: typeOf<Nothing>() is not denotable, so the
// element type is produced via nothingType() instead.
@JvmName("withTypeNullableNothingList")
private fun List<Nothing?>.withTypeNullable(
    expectedNulls: Boolean,
    nullabilityOptions: NullabilityOptions,
): Pair<List<Nothing?>, KType> {
    val nullable = nullabilityOptions.applyNullability(this, expectedNulls)
    return this to nothingType(nullable)
}
|
||||
|
||||
/**
 * Convert one Arrow [vector] (described by [field]) into a DataFrame column.
 * Struct vectors recurse into their children and become a column group; scalar vectors
 * are materialized into value columns whose element type and nullability are resolved
 * by [withTypeNullable].
 *
 * @throws IllegalArgumentException when [nullability] forbids nulls but the data has them
 * @throws NotImplementedError for Arrow vector types not handled below
 */
private fun readField(vector: FieldVector, field: Field, nullability: NullabilityOptions): AnyBaseCol {
    try {
        val range = 0 until vector.valueCount
        // Nested struct: build a column group from the child fields.
        if (vector is StructVector) {
            val columns = field.children.map { childField ->
                readField(vector.getChild(childField.name), childField, nullability)
            }
            return DataColumn.createColumnGroup(field.name, columns.toDataFrame())
        }
        // Scalar vectors: each branch extracts values and resolves the column type.
        val (list, type) = when (vector) {
            is VarCharVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is LargeVarCharVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is ViewVarCharVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is VarBinaryVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is LargeVarBinaryVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is ViewVarBinaryVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is BitVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is SmallIntVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is TinyIntVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is UInt1Vector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is UInt2Vector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is UInt4Vector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is UInt8Vector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is IntVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is BigIntVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is DecimalVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is Decimal256Vector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is Float8Vector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is Float4Vector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is DurationVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is DateDayVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is DateMilliVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is TimeNanoVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is TimeMicroVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is TimeMilliVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is TimeSecVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is TimeStampNanoVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is TimeStampMicroVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is TimeStampMilliVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is TimeStampSecVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            is NullVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)

            else -> {
                throw NotImplementedError("reading from ${vector.javaClass.canonicalName} is not implemented")
            }
        }
        // Infer.None: trust the type computed above; do not re-infer from the values.
        return DataColumn.createValueColumn(field.name, list, type, Infer.None)
    } catch (unexpectedNull: NullabilityException) {
        throw IllegalArgumentException("Column `${field.name}` should be not nullable but has nulls")
    }
}
|
||||
|
||||
// Convenience overload: look the vector up in the schema root by its field.
private fun readField(root: VectorSchemaRoot, field: Field, nullability: NullabilityOptions): AnyBaseCol =
    readField(root.getVector(field), field, nullability)
|
||||
|
||||
/**
 * Read [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format) data from existing [channel]
 * by wrapping it in an [ArrowStreamReader].
 */
internal fun DataFrame.Companion.readArrowIPCImpl(
    channel: ReadableByteChannel,
    allocator: RootAllocator = Allocator.ROOT,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = readArrowImpl(ArrowStreamReader(channel, allocator), nullability)

/**
 * Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files) data from existing [channel]
 * by wrapping it in an [ArrowFileReader].
 */
internal fun DataFrame.Companion.readArrowFeatherImpl(
    channel: SeekableByteChannel,
    allocator: RootAllocator = Allocator.ROOT,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame = readArrowImpl(ArrowFileReader(channel, allocator), nullability)
|
||||
|
||||
/**
 * Read [Arrow any format](https://arrow.apache.org/docs/java/ipc.html#reading-writing-ipc-formats) data from existing [reader].
 * Each record batch becomes one intermediate DataFrame; the batches are concatenated
 * with [concatKeepingSchema] (no per-batch type re-inference). The [reader] is closed on exit.
 */
internal fun DataFrame.Companion.readArrowImpl(
    reader: ArrowReader,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame {
    reader.use {
        val flattened = buildList {
            when (reader) {
                // File format: batches are addressed through record blocks.
                is ArrowFileReader -> {
                    reader.recordBlocks.forEach { block ->
                        reader.loadRecordBatch(block)
                        val root = reader.vectorSchemaRoot
                        val schema = root.schema
                        val df = schema.fields.map { f -> readField(root, f, nullability) }.toDataFrame()
                        add(df)
                    }
                }

                // Stream (and other) readers: iterate batches sequentially.
                else -> {
                    val root = reader.vectorSchemaRoot
                    val schema = root.schema
                    while (reader.loadNextBatch()) {
                        val df = schema.fields.map { f -> readField(root, f, nullability) }.toDataFrame()
                        add(df)
                    }
                }
            }
        }
        return flattened.concatKeepingSchema()
    }
}
|
||||
|
||||
/**
 * Normalize [fileUris] into forms the Arrow Dataset native layer can open:
 * - http/https URLs are first downloaded to a temporary local file (removed on JVM exit),
 *   since FileSystemDatasetFactory cannot fetch remote URLs itself;
 * - bare local paths that exist on disk are rewritten as "file:" URIs;
 * - everything else is passed through unchanged.
 */
private fun resolveArrowDatasetUris(fileUris: Array<String>): Array<String> =
    fileUris.map { uri ->
        when {
            // Fix 1: also match "https:"; the previous check matched only the literal
            // "http:" scheme, so https URLs fell through unhandled.
            uri.startsWith("http:", true) || uri.startsWith("https:", true) -> {
                val url = URI.create(uri).toURL()
                val tempFile = File.createTempFile("kdf", ".parquet")
                tempFile.deleteOnExit()
                // Fix 2: createTempFile already created the (empty) target file, so the
                // previous Files.copy(input, path) always threw FileAlreadyExistsException;
                // stream-copy into the existing file instead.
                url.openStream().use { input ->
                    tempFile.outputStream().use { output -> input.copyTo(output) }
                }
                tempFile.toURI().toString()
            }

            !uri.startsWith("file:", true) && File(uri).exists() -> {
                File(uri).toURI().toString()
            }

            else -> uri
        }
    }.toTypedArray()
|
||||
|
||||
/**
 * Read [Arrow Dataset](https://arrow.apache.org/docs/java/dataset.html) from [fileUris].
 * Native resources (allocator, factory, dataset, scanner, batch reader) are nested in
 * `use` blocks so each is released, in reverse order, even on failure.
 *
 * @param fileFormat format of the referenced files (e.g. Parquet)
 * @param batchSize rows read per scanned batch
 */
internal fun DataFrame.Companion.readArrowDatasetImpl(
    fileUris: Array<String>,
    fileFormat: FileFormat,
    nullability: NullabilityOptions = NullabilityOptions.Infer,
    batchSize: Long = ARROW_PARQUET_DEFAULT_BATCH_SIZE,
): AnyFrame {
    val scanOptions = ScanOptions(batchSize)
    RootAllocator().use { allocator ->
        FileSystemDatasetFactory(
            allocator,
            NativeMemoryPool.createListenable(DirectReservationListener.instance()),
            fileFormat,
            resolveArrowDatasetUris(fileUris),
        ).use { datasetFactory ->
            datasetFactory.finish().use { dataset ->
                dataset.newScan(scanOptions).use { scanner ->
                    scanner.scanBatches().use { reader ->
                        return readArrowImpl(reader, nullability)
                    }
                }
            }
        }
    }
}
|
||||
Vendored
+136
@@ -0,0 +1,136 @@
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import kotlinx.datetime.LocalDate
|
||||
import kotlinx.datetime.LocalDateTime
|
||||
import kotlinx.datetime.LocalTime
|
||||
import org.apache.arrow.vector.types.DateUnit
|
||||
import org.apache.arrow.vector.types.FloatingPointPrecision
|
||||
import org.apache.arrow.vector.types.TimeUnit
|
||||
import org.apache.arrow.vector.types.pojo.ArrowType
|
||||
import org.apache.arrow.vector.types.pojo.Field
|
||||
import org.apache.arrow.vector.types.pojo.FieldType
|
||||
import org.apache.arrow.vector.types.pojo.Schema
|
||||
import org.jetbrains.kotlinx.dataframe.AnyCol
|
||||
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
|
||||
import org.jetbrains.kotlinx.dataframe.typeClass
|
||||
import kotlin.reflect.full.isSubtypeOf
|
||||
import kotlin.reflect.typeOf
|
||||
import java.time.LocalDate as JavaLocalDate
|
||||
import java.time.LocalDateTime as JavaLocalDateTime
|
||||
import java.time.LocalTime as JavaLocalTime
|
||||
|
||||
/**
 * Create Arrow [Field] (note: this is part of [Schema], does not contain data itself) that has the same
 * name, type and nullable as [this].
 *
 * Column groups are mapped to Struct fields with one child per nested column. Columns whose types
 * have no Arrow mapping fall back to a nullable Utf8 (String) field; each such fallback is reported
 * to [mismatchSubscriber] as [ConvertingMismatch.SavedAsString].
 */
public fun AnyCol.toArrowField(mismatchSubscriber: (ConvertingMismatch) -> Unit = ignoreMismatchMessage): Field {
    val column = this
    val columnType = column.type()
    val nullable = columnType.isMarkedNullable
    return when {
        // Nested columns: recurse into each child to build a Struct field.
        column is ColumnGroup<*> -> {
            val childFields = column.columns().map { it.toArrowField(mismatchSubscriber) }
            Field(
                column.name(),
                FieldType(nullable, ArrowType.Struct(), null),
                childFields,
            )
        }

        columnType.isSubtypeOf(typeOf<String?>()) ->
            Field(
                column.name(),
                FieldType(nullable, ArrowType.Utf8(), null),
                emptyList(),
            )

        columnType.isSubtypeOf(typeOf<Boolean?>()) ->
            Field(
                column.name(),
                FieldType(nullable, ArrowType.Bool(), null),
                emptyList(),
            )

        columnType.isSubtypeOf(typeOf<Byte?>()) ->
            Field(
                column.name(),
                FieldType(nullable, ArrowType.Int(8, true), null),
                emptyList(),
            )

        columnType.isSubtypeOf(typeOf<Short?>()) ->
            Field(
                column.name(),
                FieldType(nullable, ArrowType.Int(16, true), null),
                emptyList(),
            )

        columnType.isSubtypeOf(typeOf<Int?>()) ->
            Field(
                column.name(),
                FieldType(nullable, ArrowType.Int(32, true), null),
                emptyList(),
            )

        columnType.isSubtypeOf(typeOf<Long?>()) ->
            Field(
                column.name(),
                FieldType(nullable, ArrowType.Int(64, true), null),
                emptyList(),
            )

        columnType.isSubtypeOf(typeOf<Float?>()) ->
            Field(
                column.name(),
                FieldType(nullable, ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE), null),
                emptyList(),
            )

        columnType.isSubtypeOf(typeOf<Double?>()) ->
            Field(
                column.name(),
                FieldType(nullable, ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE), null),
                emptyList(),
            )

        columnType.isSubtypeOf(typeOf<JavaLocalDate?>()) ||
            columnType.isSubtypeOf(typeOf<LocalDate?>()) ->
            Field(
                column.name(),
                FieldType(nullable, ArrowType.Date(DateUnit.DAY), null),
                emptyList(),
            )

        columnType.isSubtypeOf(typeOf<JavaLocalDateTime?>()) ||
            columnType.isSubtypeOf(typeOf<LocalDateTime?>()) ->
            Field(
                column.name(),
                FieldType(nullable, ArrowType.Date(DateUnit.MILLISECOND), null),
                emptyList(),
            )

        // Fix: was typeOf<LocalTime>() (non-null). Every other branch checks against the
        // nullable type; with the non-null form a kotlinx LocalTime? column was not a
        // subtype, so it silently fell through to the String fallback below.
        columnType.isSubtypeOf(typeOf<JavaLocalTime?>()) ||
            columnType.isSubtypeOf(typeOf<LocalTime?>()) ->
            Field(
                column.name(),
                FieldType(nullable, ArrowType.Time(TimeUnit.NANOSECOND, 64), null),
                emptyList(),
            )

        else -> {
            // No Arrow mapping for this type: persist as String and report the mismatch.
            mismatchSubscriber(ConvertingMismatch.SavedAsString(column.name(), column.typeClass.java))
            Field(column.name(), FieldType(true, ArrowType.Utf8(), null), emptyList())
        }
    }
}
|
||||
|
||||
/**
 * Build an Arrow [Schema] matching [this] actual data.
 * Columns with unsupported types are interpreted as String; each such fallback
 * is reported through [mismatchSubscriber].
 */
public fun List<AnyCol>.toArrowSchema(
    mismatchSubscriber: (ConvertingMismatch) -> Unit = ignoreMismatchMessage,
): Schema = Schema(map { column -> column.toArrowField(mismatchSubscriber) })
|
||||
Vendored
+117
@@ -0,0 +1,117 @@
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import org.apache.arrow.vector.types.pojo.Schema
|
||||
import org.jetbrains.kotlinx.dataframe.AnyFrame
|
||||
import java.io.File
|
||||
import java.io.OutputStream
|
||||
import java.nio.channels.WritableByteChannel
|
||||
import java.nio.file.Path
|
||||
|
||||
/**
 * Create [ArrowWriter] for [this] DataFrame; the target schema is derived from the actual data.
 */
public fun AnyFrame.arrowWriter(): ArrowWriter = arrowWriter(columns().toArrowSchema())
|
||||
|
||||
/**
 * Create [ArrowWriter] for [this] DataFrame with explicit [targetSchema].
 * If DataFrame does not match with [targetSchema], behaviour is specified by [mode],
 * mismatches would be sent to [mismatchSubscriber] (ignored by default).
 */
public fun AnyFrame.arrowWriter(
    targetSchema: Schema,
    mode: ArrowWriter.Mode = ArrowWriter.Mode.STRICT,
    mismatchSubscriber: (ConvertingMismatch) -> Unit = ignoreMismatchMessage,
): ArrowWriter = ArrowWriter.create(this, targetSchema, mode, mismatchSubscriber)
|
||||
|
||||
// IPC saving block with default parameters
|
||||
|
||||
/**
 * Save data to [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format), write to opened [channel].
 */
public fun AnyFrame.writeArrowIPC(channel: WritableByteChannel) {
    arrowWriter().use { it.writeArrowIPC(channel) }
}
|
||||
|
||||
/**
 * Save data to [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format), write to opened [stream].
 */
public fun AnyFrame.writeArrowIPC(stream: OutputStream) {
    arrowWriter().use { it.writeArrowIPC(stream) }
}
|
||||
|
||||
/**
 * Save data to [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format), write to new or existing [file].
 * If [file] exists, it can be recreated or expanded (see [append]).
 * Delegates to the [Path]-based overload.
 */
public fun AnyFrame.writeArrowIPC(file: File, append: Boolean = true) {
    writeArrowIPC(file.toPath(), append)
}
|
||||
|
||||
/**
 * Save data to [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format),
 * write to new or existing file on the given [path].
 * If the file exists, it can be recreated or expanded (see [append]).
 */
public fun AnyFrame.writeArrowIPC(path: Path, append: Boolean = true) {
    arrowWriter().use { it.writeArrowIPC(path, append) }
}
|
||||
|
||||
/**
 * Save data to [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format), write to new [ByteArray].
 */
public fun AnyFrame.saveArrowIPCToByteArray(): ByteArray =
    arrowWriter().use { it.saveArrowIPCToByteArray() }
|
||||
|
||||
// Feather saving block with default parameters
|
||||
|
||||
/**
 * Save data to [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files), write to opened [channel].
 */
public fun AnyFrame.writeArrowFeather(channel: WritableByteChannel) {
    arrowWriter().use { it.writeArrowFeather(channel) }
}
|
||||
|
||||
/**
 * Save data to [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files), write to opened [stream].
 */
public fun AnyFrame.writeArrowFeather(stream: OutputStream) {
    arrowWriter().use { it.writeArrowFeather(stream) }
}
|
||||
|
||||
/**
 * Save data to [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files), write to new or existing [file].
 * If file exists, it would be recreated.
 * Delegates to the [Path]-based overload.
 */
public fun AnyFrame.writeArrowFeather(file: File) {
    writeArrowFeather(file.toPath())
}
|
||||
|
||||
/**
 * Save data to [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files),
 * write to new or existing file on the given [path].
 * If the file exists, it would be recreated.
 */
public fun AnyFrame.writeArrowFeather(path: Path) {
    arrowWriter().use { it.writeArrowFeather(path) }
}
|
||||
|
||||
/**
 * Save data to [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files), write to new [ByteArray].
 */
public fun AnyFrame.saveArrowFeatherToByteArray(): ByteArray =
    arrowWriter().use { it.saveArrowFeatherToByteArray() }
|
||||
+1
@@ -0,0 +1 @@
|
||||
org.jetbrains.kotlinx.dataframe.io.ArrowFeather
|
||||
Vendored
+813
@@ -0,0 +1,813 @@
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import io.kotest.assertions.throwables.shouldThrow
|
||||
import io.kotest.matchers.collections.shouldContain
|
||||
import io.kotest.matchers.shouldBe
|
||||
import kotlinx.datetime.LocalDate
|
||||
import kotlinx.datetime.LocalDateTime
|
||||
import kotlinx.datetime.UtcOffset
|
||||
import kotlinx.datetime.toInstant
|
||||
import org.apache.arrow.memory.RootAllocator
|
||||
import org.apache.arrow.vector.TimeStampMicroVector
|
||||
import org.apache.arrow.vector.TimeStampMilliVector
|
||||
import org.apache.arrow.vector.TimeStampNanoVector
|
||||
import org.apache.arrow.vector.TimeStampSecVector
|
||||
import org.apache.arrow.vector.VectorSchemaRoot
|
||||
import org.apache.arrow.vector.ipc.ArrowFileReader
|
||||
import org.apache.arrow.vector.ipc.ArrowFileWriter
|
||||
import org.apache.arrow.vector.ipc.ArrowReader
|
||||
import org.apache.arrow.vector.ipc.ArrowStreamReader
|
||||
import org.apache.arrow.vector.ipc.ArrowStreamWriter
|
||||
import org.apache.arrow.vector.types.FloatingPointPrecision
|
||||
import org.apache.arrow.vector.types.TimeUnit
|
||||
import org.apache.arrow.vector.types.pojo.ArrowType
|
||||
import org.apache.arrow.vector.types.pojo.Field
|
||||
import org.apache.arrow.vector.types.pojo.FieldType
|
||||
import org.apache.arrow.vector.types.pojo.Schema
|
||||
import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel
|
||||
import org.duckdb.DuckDBConnection
|
||||
import org.duckdb.DuckDBResultSet
|
||||
import org.jetbrains.kotlinx.dataframe.AnyFrame
|
||||
import org.jetbrains.kotlinx.dataframe.DataColumn
|
||||
import org.jetbrains.kotlinx.dataframe.DataFrame
|
||||
import org.jetbrains.kotlinx.dataframe.api.NullabilityOptions
|
||||
import org.jetbrains.kotlinx.dataframe.api.add
|
||||
import org.jetbrains.kotlinx.dataframe.api.columnOf
|
||||
import org.jetbrains.kotlinx.dataframe.api.convertToBoolean
|
||||
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
|
||||
import org.jetbrains.kotlinx.dataframe.api.map
|
||||
import org.jetbrains.kotlinx.dataframe.api.pathOf
|
||||
import org.jetbrains.kotlinx.dataframe.api.remove
|
||||
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
|
||||
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConverterNotFoundException
|
||||
import org.junit.Assert
|
||||
import org.junit.Test
|
||||
import java.io.ByteArrayInputStream
|
||||
import java.io.ByteArrayOutputStream
|
||||
import java.io.File
|
||||
import java.net.URL
|
||||
import java.nio.channels.Channels
|
||||
import java.sql.DriverManager
|
||||
import java.util.Locale
|
||||
import kotlin.io.path.toPath
|
||||
import kotlin.reflect.typeOf
|
||||
import kotlin.time.toJavaInstant
|
||||
|
||||
internal class ArrowKtTest {
|
||||
|
||||
fun testResource(resourcePath: String): URL = ArrowKtTest::class.java.classLoader.getResource(resourcePath)!!
|
||||
|
||||
fun testArrowFeather(name: String) = testResource("$name.feather")
|
||||
|
||||
fun testArrowIPC(name: String) = testResource("$name.ipc")
|
||||
|
||||
@Test
|
||||
fun testReadingFromFile() {
|
||||
val feather = testArrowFeather("data-arrow_2.0.0_uncompressed")
|
||||
val df = DataFrame.readArrowFeather(feather)
|
||||
val a by columnOf("one")
|
||||
val b by columnOf(2.0)
|
||||
val c by columnOf(
|
||||
"c1" to columnOf("inner"),
|
||||
"c2" to columnOf(4.0),
|
||||
"c3" to columnOf(50.0),
|
||||
)
|
||||
val d by columnOf("four")
|
||||
val expected = dataFrameOf(a, b, c, d)
|
||||
df shouldBe expected
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testReadingAllTypesAsEstimated() {
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowFeather(testArrowFeather("test.arrow"), NullabilityOptions.Infer),
|
||||
expectedNullable = false,
|
||||
hasNulls = false,
|
||||
)
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowIPC(testArrowIPC("test.arrow"), NullabilityOptions.Infer),
|
||||
expectedNullable = false,
|
||||
hasNulls = false,
|
||||
)
|
||||
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowFeather(testArrowFeather("test.arrow"), NullabilityOptions.Checking),
|
||||
expectedNullable = true,
|
||||
hasNulls = false,
|
||||
)
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowIPC(testArrowIPC("test.arrow"), NullabilityOptions.Checking),
|
||||
expectedNullable = true,
|
||||
hasNulls = false,
|
||||
)
|
||||
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowFeather(testArrowFeather("test.arrow"), NullabilityOptions.Widening),
|
||||
expectedNullable = true,
|
||||
hasNulls = false,
|
||||
)
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowIPC(testArrowIPC("test.arrow"), NullabilityOptions.Widening),
|
||||
expectedNullable = true,
|
||||
hasNulls = false,
|
||||
)
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testReadingAllTypesAsEstimatedWithNulls() {
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowFeather(
|
||||
testArrowFeather("test-with-nulls.arrow"),
|
||||
NullabilityOptions.Infer,
|
||||
),
|
||||
expectedNullable = true,
|
||||
hasNulls = true,
|
||||
)
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowIPC(
|
||||
testArrowIPC("test-with-nulls.arrow"),
|
||||
NullabilityOptions.Infer,
|
||||
),
|
||||
expectedNullable = true,
|
||||
hasNulls = true,
|
||||
)
|
||||
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowFeather(
|
||||
testArrowFeather("test-with-nulls.arrow"),
|
||||
NullabilityOptions.Checking,
|
||||
),
|
||||
expectedNullable = true,
|
||||
hasNulls = true,
|
||||
)
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowIPC(
|
||||
testArrowIPC("test-with-nulls.arrow"),
|
||||
NullabilityOptions.Checking,
|
||||
),
|
||||
expectedNullable = true,
|
||||
hasNulls = true,
|
||||
)
|
||||
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowFeather(
|
||||
testArrowFeather("test-with-nulls.arrow"),
|
||||
NullabilityOptions.Widening,
|
||||
),
|
||||
expectedNullable = true,
|
||||
hasNulls = true,
|
||||
)
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowIPC(
|
||||
testArrowIPC("test-with-nulls.arrow"),
|
||||
NullabilityOptions.Widening,
|
||||
),
|
||||
expectedNullable = true,
|
||||
hasNulls = true,
|
||||
)
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testReadingAllTypesAsEstimatedNotNullable() {
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowFeather(
|
||||
testArrowFeather("test-not-nullable.arrow"),
|
||||
NullabilityOptions.Infer,
|
||||
),
|
||||
expectedNullable = false,
|
||||
hasNulls = false,
|
||||
)
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowIPC(
|
||||
testArrowIPC("test-not-nullable.arrow"),
|
||||
NullabilityOptions.Infer,
|
||||
),
|
||||
expectedNullable = false,
|
||||
hasNulls = false,
|
||||
)
|
||||
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowFeather(
|
||||
testArrowFeather("test-not-nullable.arrow"),
|
||||
NullabilityOptions.Checking,
|
||||
),
|
||||
expectedNullable = false,
|
||||
hasNulls = false,
|
||||
)
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowIPC(
|
||||
testArrowIPC("test-not-nullable.arrow"),
|
||||
NullabilityOptions.Checking,
|
||||
),
|
||||
expectedNullable = false,
|
||||
hasNulls = false,
|
||||
)
|
||||
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowFeather(
|
||||
testArrowFeather("test-not-nullable.arrow"),
|
||||
NullabilityOptions.Widening,
|
||||
),
|
||||
expectedNullable = false,
|
||||
hasNulls = false,
|
||||
)
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowIPC(
|
||||
testArrowIPC("test-not-nullable.arrow"),
|
||||
NullabilityOptions.Widening,
|
||||
),
|
||||
expectedNullable = false,
|
||||
hasNulls = false,
|
||||
)
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testReadingAllTypesAsEstimatedNotNullableWithNulls() {
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowFeather(
|
||||
testArrowFeather("test-illegal.arrow"),
|
||||
NullabilityOptions.Infer,
|
||||
),
|
||||
expectedNullable = true,
|
||||
hasNulls = true,
|
||||
)
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowIPC(
|
||||
testArrowIPC("test-illegal.arrow"),
|
||||
NullabilityOptions.Infer,
|
||||
),
|
||||
expectedNullable = true,
|
||||
hasNulls = true,
|
||||
)
|
||||
|
||||
shouldThrow<IllegalArgumentException> {
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowFeather(
|
||||
testArrowFeather("test-illegal.arrow"),
|
||||
NullabilityOptions.Checking,
|
||||
),
|
||||
expectedNullable = false,
|
||||
hasNulls = true,
|
||||
)
|
||||
}
|
||||
shouldThrow<IllegalArgumentException> {
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowIPC(
|
||||
testArrowIPC("test-illegal.arrow"),
|
||||
NullabilityOptions.Checking,
|
||||
),
|
||||
expectedNullable = false,
|
||||
hasNulls = true,
|
||||
)
|
||||
}
|
||||
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowFeather(
|
||||
testArrowFeather("test-illegal.arrow"),
|
||||
NullabilityOptions.Widening,
|
||||
),
|
||||
expectedNullable = true,
|
||||
hasNulls = true,
|
||||
)
|
||||
assertEstimations(
|
||||
exampleFrame = DataFrame.readArrowIPC(
|
||||
testArrowIPC("test-illegal.arrow"),
|
||||
NullabilityOptions.Widening,
|
||||
),
|
||||
expectedNullable = true,
|
||||
hasNulls = true,
|
||||
)
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testWritingGeneral() {
|
||||
fun assertEstimation(citiesDeserialized: DataFrame<*>) {
|
||||
citiesDeserialized["name"] shouldBe citiesExampleFrame["name"]
|
||||
citiesDeserialized["affiliation"] shouldBe citiesExampleFrame["affiliation"]
|
||||
citiesDeserialized["is_capital"] shouldBe citiesExampleFrame["is_capital"]
|
||||
citiesDeserialized["population"] shouldBe citiesExampleFrame["population"]
|
||||
citiesDeserialized["area"] shouldBe citiesExampleFrame["area"]
|
||||
// cities["settled"].type() refers to FlexibleTypeImpl(LocalDate..LocalDate?)
|
||||
// and does not match typeOf<LocalDate>()
|
||||
citiesDeserialized["settled"].type() shouldBe typeOf<LocalDate>()
|
||||
citiesDeserialized["settled"].values() shouldBe citiesExampleFrame["settled"].values()
|
||||
// cities["page_in_wiki"].type() is URI, not supported by Arrow directly
|
||||
citiesDeserialized["page_in_wiki"].type() shouldBe typeOf<String>()
|
||||
citiesDeserialized["page_in_wiki"].values() shouldBe
|
||||
citiesExampleFrame["page_in_wiki"].values().map { it.toString() }
|
||||
}
|
||||
|
||||
val testFile = File.createTempFile("cities", "arrow")
|
||||
citiesExampleFrame.writeArrowFeather(testFile)
|
||||
assertEstimation(DataFrame.readArrowFeather(testFile))
|
||||
|
||||
val testByteArray = citiesExampleFrame.saveArrowIPCToByteArray()
|
||||
assertEstimation(DataFrame.readArrowIPC(testByteArray))
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testWritingBySchema() {
|
||||
val testFile = File.createTempFile("cities", "arrow")
|
||||
citiesExampleFrame.arrowWriter(Schema.fromJSON(citiesExampleSchema)).use { it.writeArrowFeather(testFile) }
|
||||
val citiesDeserialized = DataFrame.readArrowFeather(testFile, NullabilityOptions.Checking)
|
||||
citiesDeserialized["population"].type() shouldBe typeOf<Long?>()
|
||||
citiesDeserialized["area"].type() shouldBe typeOf<Float>()
|
||||
citiesDeserialized["settled"].type() shouldBe typeOf<LocalDateTime>()
|
||||
shouldThrow<IllegalArgumentException> { citiesDeserialized["page_in_wiki"] }
|
||||
citiesDeserialized["film_in_youtube"] shouldBe
|
||||
DataColumn.createValueColumn(
|
||||
name = "film_in_youtube",
|
||||
values = arrayOfNulls<String>(citiesExampleFrame.rowsCount()).asList(),
|
||||
)
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testWidening() {
|
||||
val warnings = ArrayList<ConvertingMismatch>()
|
||||
val testRestrictWidening = citiesExampleFrame.arrowWriter(
|
||||
targetSchema = Schema.fromJSON(citiesExampleSchema),
|
||||
mode = ArrowWriter.Mode.STRICT,
|
||||
) { warning ->
|
||||
warnings.add(warning)
|
||||
}.use { it.saveArrowFeatherToByteArray() }
|
||||
warnings.shouldContain(ConvertingMismatch.WideningMismatch.RejectedColumn("page_in_wiki"))
|
||||
shouldThrow<IllegalArgumentException> { DataFrame.readArrowFeather(testRestrictWidening)["page_in_wiki"] }
|
||||
|
||||
val testAllowWidening = citiesExampleFrame.arrowWriter(
|
||||
targetSchema = Schema.fromJSON(citiesExampleSchema),
|
||||
mode = ArrowWriter.Mode(
|
||||
restrictWidening = false,
|
||||
restrictNarrowing = true,
|
||||
strictType = true,
|
||||
strictNullable = true,
|
||||
),
|
||||
).use { it.saveArrowFeatherToByteArray() }
|
||||
DataFrame.readArrowFeather(testAllowWidening)["page_in_wiki"].values() shouldBe
|
||||
citiesExampleFrame["page_in_wiki"]
|
||||
.values()
|
||||
.map { it.toString() }
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testNarrowing() {
|
||||
val frameWithoutRequiredField = citiesExampleFrame.remove("settled")
|
||||
|
||||
frameWithoutRequiredField.arrowWriter(
|
||||
targetSchema = Schema.fromJSON(citiesExampleSchema),
|
||||
mode = ArrowWriter.Mode.STRICT,
|
||||
).use {
|
||||
shouldThrow<ConvertingException> { it.saveArrowFeatherToByteArray() }
|
||||
}
|
||||
|
||||
val warnings = ArrayList<ConvertingMismatch>()
|
||||
val testAllowNarrowing = frameWithoutRequiredField.arrowWriter(
|
||||
targetSchema = Schema.fromJSON(citiesExampleSchema),
|
||||
mode = ArrowWriter.Mode(
|
||||
restrictWidening = true,
|
||||
restrictNarrowing = false,
|
||||
strictType = true,
|
||||
strictNullable = true,
|
||||
),
|
||||
) { warning ->
|
||||
warnings.add(warning)
|
||||
}.use { it.saveArrowFeatherToByteArray() }
|
||||
warnings.shouldContain(ConvertingMismatch.NarrowingMismatch.NotPresentedColumnIgnored("settled"))
|
||||
shouldThrow<IllegalArgumentException> { DataFrame.readArrowFeather(testAllowNarrowing)["settled"] }
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testStrictType() {
|
||||
val frameRenaming = citiesExampleFrame.remove("settled")
|
||||
val frameWithIncompatibleField =
|
||||
frameRenaming.add(
|
||||
frameRenaming["is_capital"]
|
||||
.map { value -> value ?: false }
|
||||
.rename("settled")
|
||||
.convertToBoolean(),
|
||||
)
|
||||
|
||||
frameWithIncompatibleField.arrowWriter(
|
||||
Schema.fromJSON(citiesExampleSchema),
|
||||
ArrowWriter.Mode.STRICT,
|
||||
).use {
|
||||
shouldThrow<ConvertingException> { it.saveArrowFeatherToByteArray() }
|
||||
}
|
||||
|
||||
val warnings = ArrayList<ConvertingMismatch>()
|
||||
val testLoyalType = frameWithIncompatibleField.arrowWriter(
|
||||
targetSchema = Schema.fromJSON(citiesExampleSchema),
|
||||
mode = ArrowWriter.Mode(
|
||||
restrictWidening = true,
|
||||
restrictNarrowing = true,
|
||||
strictType = false,
|
||||
strictNullable = true,
|
||||
),
|
||||
) { warning ->
|
||||
warnings.add(warning)
|
||||
}.use { it.saveArrowFeatherToByteArray() }
|
||||
warnings.map { it.toString() }.shouldContain(
|
||||
ConvertingMismatch.TypeConversionNotFound.ConversionNotFoundIgnored(
|
||||
"settled",
|
||||
TypeConverterNotFoundException(
|
||||
typeOf<Boolean>(),
|
||||
typeOf<kotlinx.datetime.LocalDateTime?>(),
|
||||
pathOf("settled"),
|
||||
),
|
||||
).toString(),
|
||||
)
|
||||
DataFrame.readArrowFeather(testLoyalType)["settled"].type() shouldBe typeOf<Boolean>()
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testStrictNullable() {
|
||||
val frameRenaming = citiesExampleFrame.remove("settled")
|
||||
val frameWithNulls = frameRenaming.add(
|
||||
DataColumn.createValueColumn(
|
||||
"settled",
|
||||
arrayOfNulls<LocalDate>(frameRenaming.rowsCount()).asList(),
|
||||
),
|
||||
)
|
||||
|
||||
frameWithNulls.arrowWriter(
|
||||
targetSchema = Schema.fromJSON(citiesExampleSchema),
|
||||
mode = ArrowWriter.Mode.STRICT,
|
||||
).use {
|
||||
shouldThrow<ConvertingException> { it.saveArrowFeatherToByteArray() }
|
||||
}
|
||||
|
||||
val warnings = ArrayList<ConvertingMismatch>()
|
||||
val testLoyalNullable = frameWithNulls.arrowWriter(
|
||||
targetSchema = Schema.fromJSON(citiesExampleSchema),
|
||||
mode = ArrowWriter.Mode(
|
||||
restrictWidening = true,
|
||||
restrictNarrowing = true,
|
||||
strictType = true,
|
||||
strictNullable = false,
|
||||
),
|
||||
) { warning ->
|
||||
warnings.add(warning)
|
||||
}.use { it.saveArrowFeatherToByteArray() }
|
||||
warnings.shouldContain(ConvertingMismatch.NullableMismatch.NullValueIgnored("settled", 0))
|
||||
DataFrame.readArrowFeather(testLoyalNullable)["settled"].type() shouldBe typeOf<LocalDateTime?>()
|
||||
DataFrame.readArrowFeather(testLoyalNullable)["settled"].values() shouldBe
|
||||
arrayOfNulls<LocalDate>(frameRenaming.rowsCount()).asList()
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testParsing() {
|
||||
val columnStringDot = columnOf("12.345", "67.890")
|
||||
val columnStringComma = columnOf("12,345", "67,890")
|
||||
val frameString = dataFrameOf("columnDot", "columnComma")(columnStringDot, columnStringComma)
|
||||
val columnDoubleFraction = columnOf(12.345, 67.890)
|
||||
val columnDoubleRound = columnOf(12345.0, 67890.0)
|
||||
val targetType = FieldType.notNullable(ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE))
|
||||
val targetSchema = Schema(
|
||||
listOf(
|
||||
Field("columnDot", targetType, emptyList()),
|
||||
Field("columnComma", targetType, emptyList()),
|
||||
),
|
||||
)
|
||||
|
||||
val currentLocale = Locale.getDefault()
|
||||
try {
|
||||
Locale.setDefault(Locale.forLanguageTag("en-US"))
|
||||
val serializedAsUs = frameString.arrowWriter(targetSchema).saveArrowFeatherToByteArray()
|
||||
DataFrame.readArrowFeather(serializedAsUs) shouldBe dataFrameOf("columnDot", "columnComma")(
|
||||
columnDoubleFraction,
|
||||
columnDoubleRound,
|
||||
)
|
||||
Locale.setDefault(Locale.forLanguageTag("ru-RU"))
|
||||
val serializedAsRu = frameString.arrowWriter(targetSchema).saveArrowFeatherToByteArray()
|
||||
DataFrame.readArrowFeather(serializedAsRu) shouldBe
|
||||
dataFrameOf("columnDot", "columnComma")(
|
||||
columnDoubleFraction,
|
||||
columnDoubleFraction,
|
||||
)
|
||||
} finally {
|
||||
Locale.setDefault(currentLocale)
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testBigStringColumn() {
|
||||
val dataFrame = dataFrameOf(bigStringColumn)
|
||||
val data = dataFrame.saveArrowFeatherToByteArray()
|
||||
DataFrame.readArrowFeather(data) shouldBe dataFrame
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testBigMixedColumn() {
|
||||
val dataFrame = dataFrameOf(bigMixedColumn)
|
||||
val warnings = ArrayList<ConvertingMismatch>()
|
||||
val writer = dataFrame.arrowWriter(
|
||||
targetSchema = Schema(
|
||||
listOf(
|
||||
Field("bigMixedColumn", FieldType.nullable(ArrowType.Int(64, true)), emptyList()),
|
||||
),
|
||||
),
|
||||
mode = ArrowWriter.Mode.LOYAL,
|
||||
) {
|
||||
warnings.add(it)
|
||||
}
|
||||
val stream = ByteArrayOutputStream()
|
||||
writer.writeArrowFeather(stream)
|
||||
val data = stream.toByteArray()
|
||||
|
||||
assert(warnings.filterIsInstance<ConvertingMismatch.TypeConversionFail.ConversionFailIgnored>().size == 1)
|
||||
assert(warnings.filterIsInstance<ConvertingMismatch.SavedAsString>().size == 1)
|
||||
|
||||
DataFrame.readArrowFeather(data)["bigMixedColumn"] shouldBe dataFrame[bigMixedColumn].map { it.toString() }
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testTimeStamp() {
|
||||
val dates = listOf(
|
||||
LocalDateTime(2023, 11, 23, 9, 30, 25),
|
||||
LocalDateTime(2015, 5, 25, 14, 20, 13),
|
||||
LocalDateTime(2013, 6, 19, 11, 20, 13),
|
||||
)
|
||||
|
||||
val dataFrame = dataFrameOf(
|
||||
"ts_nano" to dates,
|
||||
"ts_micro" to dates,
|
||||
"ts_milli" to dates,
|
||||
"ts_sec" to dates,
|
||||
)
|
||||
|
||||
DataFrame.readArrowFeather(writeArrowTimestamp(dates)) shouldBe dataFrame
|
||||
DataFrame.readArrowIPC(writeArrowTimestamp(dates, true)) shouldBe dataFrame
|
||||
}
|
||||
|
||||
private fun writeArrowTimestamp(dates: List<LocalDateTime>, streaming: Boolean = false): ByteArray {
|
||||
RootAllocator().use { allocator ->
|
||||
val timeStampMilli = Field(
|
||||
"ts_milli",
|
||||
FieldType.nullable(ArrowType.Timestamp(TimeUnit.MILLISECOND, null)),
|
||||
null,
|
||||
)
|
||||
|
||||
val timeStampMicro = Field(
|
||||
"ts_micro",
|
||||
FieldType.nullable(ArrowType.Timestamp(TimeUnit.MICROSECOND, null)),
|
||||
null,
|
||||
)
|
||||
|
||||
val timeStampNano = Field(
|
||||
"ts_nano",
|
||||
FieldType.nullable(ArrowType.Timestamp(TimeUnit.NANOSECOND, null)),
|
||||
null,
|
||||
)
|
||||
|
||||
val timeStampSec = Field(
|
||||
"ts_sec",
|
||||
FieldType.nullable(ArrowType.Timestamp(TimeUnit.SECOND, null)),
|
||||
null,
|
||||
)
|
||||
val schemaTimeStamp = Schema(
|
||||
listOf(timeStampNano, timeStampMicro, timeStampMilli, timeStampSec),
|
||||
)
|
||||
VectorSchemaRoot.create(schemaTimeStamp, allocator).use { vectorSchemaRoot ->
|
||||
val timeStampMilliVector = vectorSchemaRoot.getVector("ts_milli") as TimeStampMilliVector
|
||||
val timeStampNanoVector = vectorSchemaRoot.getVector("ts_nano") as TimeStampNanoVector
|
||||
val timeStampMicroVector = vectorSchemaRoot.getVector("ts_micro") as TimeStampMicroVector
|
||||
val timeStampSecVector = vectorSchemaRoot.getVector("ts_sec") as TimeStampSecVector
|
||||
timeStampMilliVector.allocateNew(dates.size)
|
||||
timeStampNanoVector.allocateNew(dates.size)
|
||||
timeStampMicroVector.allocateNew(dates.size)
|
||||
timeStampSecVector.allocateNew(dates.size)
|
||||
|
||||
dates.forEachIndexed { index, localDateTime ->
|
||||
val instant = localDateTime.toInstant(UtcOffset.ZERO).toJavaInstant()
|
||||
timeStampNanoVector[index] = instant.toEpochMilli() * 1_000_000L + instant.nano
|
||||
timeStampMicroVector[index] = instant.toEpochMilli() * 1_000L
|
||||
timeStampMilliVector[index] = instant.toEpochMilli()
|
||||
timeStampSecVector[index] = instant.toEpochMilli() / 1_000L
|
||||
}
|
||||
vectorSchemaRoot.setRowCount(dates.size)
|
||||
val bos = ByteArrayOutputStream()
|
||||
bos.use { out ->
|
||||
val arrowWriter = if (streaming) {
|
||||
ArrowStreamWriter(vectorSchemaRoot, null, Channels.newChannel(out))
|
||||
} else {
|
||||
ArrowFileWriter(vectorSchemaRoot, null, Channels.newChannel(out))
|
||||
}
|
||||
arrowWriter.use { writer ->
|
||||
writer.start()
|
||||
writer.writeBatch()
|
||||
}
|
||||
}
|
||||
return bos.toByteArray()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private fun expectedSimpleDataFrame(): AnyFrame {
|
||||
val dates = listOf(
|
||||
LocalDateTime(2020, 11, 23, 9, 30, 25),
|
||||
LocalDateTime(2015, 5, 25, 14, 20, 13),
|
||||
LocalDateTime(2013, 6, 19, 11, 20, 13),
|
||||
LocalDateTime(2000, 1, 1, 0, 0, 0),
|
||||
)
|
||||
|
||||
return dataFrameOf(
|
||||
"string" to listOf("a", "b", "c", "d"),
|
||||
"int" to listOf(1, 2, 3, 4),
|
||||
"float" to listOf(1.0f, 2.0f, 3.0f, 4.0f),
|
||||
"double" to listOf(1.0, 2.0, 3.0, 4.0),
|
||||
"datetime" to dates,
|
||||
)
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testArrowReaderExtension() {
|
||||
val expected = expectedSimpleDataFrame()
|
||||
val featherChannel = ByteArrayReadableSeekableByteChannel(expected.saveArrowFeatherToByteArray())
|
||||
val arrowFileReader = ArrowFileReader(featherChannel, RootAllocator())
|
||||
arrowFileReader.toDataFrame() shouldBe expected
|
||||
|
||||
val ipcInputStream = ByteArrayInputStream(expected.saveArrowIPCToByteArray())
|
||||
val arrowStreamReader = ArrowStreamReader(ipcInputStream, RootAllocator())
|
||||
arrowStreamReader.toDataFrame() shouldBe expected
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testDuckDBArrowIntegration() {
|
||||
val expected = expectedSimpleDataFrame()
|
||||
val query =
|
||||
"""
|
||||
select 'a' as string, 1 as int, CAST(1.0 as FLOAT) as float, CAST(1.0 as DOUBLE) as double, TIMESTAMP '2020-11-23 09:30:25' as datetime
|
||||
UNION ALL SELECT 'b', 2, 2.0, 2.0, TIMESTAMP '2015-05-25 14:20:13'
|
||||
UNION ALL SELECT 'c', 3, 3.0, 3.0, TIMESTAMP '2013-06-19 11:20:13'
|
||||
UNION ALL SELECT 'd', 4, 4.0, 4.0, TIMESTAMP '2000-01-01 00:00:00'
|
||||
""".trimIndent()
|
||||
|
||||
Class.forName("org.duckdb.DuckDBDriver")
|
||||
val conn = DriverManager.getConnection("jdbc:duckdb:") as DuckDBConnection
|
||||
conn.use {
|
||||
val resultSet = it.createStatement().executeQuery(query) as DuckDBResultSet
|
||||
val dbArrowReader = resultSet.arrowExportStream(RootAllocator(), 256) as ArrowReader
|
||||
Assert.assertTrue(dbArrowReader.javaClass.name.equals("org.apache.arrow.c.ArrowArrayStreamReader"))
|
||||
DataFrame.readArrow(dbArrowReader) shouldBe expected
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testReadParquetPath() {
|
||||
val resourceUrl = testResource("test.arrow.parquet")
|
||||
val resourcePath = resourceUrl.toURI().toPath()
|
||||
|
||||
val dataFrame = DataFrame.readParquet(resourcePath)
|
||||
|
||||
dataFrame.rowsCount() shouldBe 300
|
||||
assertEstimations(
|
||||
exampleFrame = dataFrame,
|
||||
expectedNullable = false,
|
||||
hasNulls = false,
|
||||
fromParquet = true,
|
||||
)
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testReadParquetFile() {
|
||||
val resourceUrl = testResource("test.arrow.parquet")
|
||||
val resourcePath = resourceUrl.toURI().toPath()
|
||||
|
||||
val dataFrame = DataFrame.readParquet(resourcePath.toFile())
|
||||
|
||||
dataFrame.rowsCount() shouldBe 300
|
||||
assertEstimations(
|
||||
exampleFrame = dataFrame,
|
||||
expectedNullable = false,
|
||||
hasNulls = false,
|
||||
fromParquet = true,
|
||||
)
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testReadParquetStringPath() {
|
||||
val resourceUrl = testResource("test.arrow.parquet")
|
||||
val resourcePath = resourceUrl.toURI().toPath()
|
||||
|
||||
val dataFrame = DataFrame.readParquet("$resourcePath")
|
||||
|
||||
dataFrame.rowsCount() shouldBe 300
|
||||
assertEstimations(
|
||||
exampleFrame = dataFrame,
|
||||
expectedNullable = false,
|
||||
hasNulls = false,
|
||||
fromParquet = true,
|
||||
)
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testReadParquetUrl() {
|
||||
val resourceUrl = testResource("test.arrow.parquet")
|
||||
val resourcePath = resourceUrl.toURI().toPath()
|
||||
val fileUrl = resourcePath.toUri().toURL()
|
||||
|
||||
val dataFrame = DataFrame.readParquet(fileUrl)
|
||||
|
||||
dataFrame.rowsCount() shouldBe 300
|
||||
assertEstimations(
|
||||
exampleFrame = dataFrame,
|
||||
expectedNullable = false,
|
||||
hasNulls = false,
|
||||
fromParquet = true,
|
||||
)
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testReadMultipleParquetFiles() {
|
||||
val resourceUrl = testResource("test.arrow.parquet")
|
||||
val resourcePath = resourceUrl.toURI().toPath()
|
||||
|
||||
val dataFrame = DataFrame.readParquet(resourcePath, resourcePath, resourcePath)
|
||||
|
||||
dataFrame.rowsCount() shouldBe 900
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testColumnGroupRoundtrip() {
|
||||
val original = dataFrameOf(
|
||||
"outer" to columnOf("x", "y", "z"),
|
||||
"inner" to columnOf(
|
||||
"nested1" to columnOf("a", "b", "c"),
|
||||
"nested2" to columnOf(1, 2, 3),
|
||||
),
|
||||
)
|
||||
|
||||
val featherBytes = original.saveArrowFeatherToByteArray()
|
||||
val fromFeather = DataFrame.readArrowFeather(featherBytes)
|
||||
fromFeather shouldBe original
|
||||
|
||||
val ipcBytes = original.saveArrowIPCToByteArray()
|
||||
val fromIpc = DataFrame.readArrowIPC(ipcBytes)
|
||||
fromIpc shouldBe original
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testNestedColumnGroupRoundtrip() {
|
||||
val deeplyNested by columnOf(
|
||||
"level2" to columnOf(
|
||||
"level3" to columnOf(1, 2, 3),
|
||||
),
|
||||
)
|
||||
val original = dataFrameOf(deeplyNested)
|
||||
|
||||
val bytes = original.saveArrowFeatherToByteArray()
|
||||
val restored = DataFrame.readArrowFeather(bytes)
|
||||
|
||||
restored shouldBe original
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testColumnGroupWithNulls() {
|
||||
val group by columnOf(
|
||||
"a" to columnOf("x", null, "z"),
|
||||
"b" to columnOf(1, 2, null),
|
||||
)
|
||||
val original = dataFrameOf(group)
|
||||
|
||||
val bytes = original.saveArrowFeatherToByteArray()
|
||||
val restored = DataFrame.readArrowFeather(bytes)
|
||||
|
||||
restored shouldBe original
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testReadParquetWithNestedStruct() {
|
||||
val resourceUrl = testResource("books.parquet")
|
||||
val resourcePath = resourceUrl.toURI().toPath()
|
||||
|
||||
val df = DataFrame.readParquet(resourcePath)
|
||||
|
||||
df.columnNames() shouldBe listOf("id", "title", "author", "genre", "publisher")
|
||||
|
||||
val authorGroup = df["author"] as ColumnGroup<*>
|
||||
authorGroup.columnNames() shouldBe listOf("id", "firstName", "lastName")
|
||||
|
||||
df["id"].type() shouldBe typeOf<Int>()
|
||||
df["title"].type() shouldBe typeOf<String>()
|
||||
df["genre"].type() shouldBe typeOf<String>()
|
||||
df["publisher"].type() shouldBe typeOf<String>()
|
||||
authorGroup["id"].type() shouldBe typeOf<Int>()
|
||||
authorGroup["firstName"].type() shouldBe typeOf<String>()
|
||||
authorGroup["lastName"].type() shouldBe typeOf<String>()
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testParquetNestedStructRoundtrip() {
|
||||
val resourceUrl = testResource("books.parquet")
|
||||
val resourcePath = resourceUrl.toURI().toPath()
|
||||
|
||||
val original = DataFrame.readParquet(resourcePath)
|
||||
|
||||
val featherBytes = original.saveArrowFeatherToByteArray()
|
||||
val fromFeather = DataFrame.readArrowFeather(featherBytes)
|
||||
fromFeather shouldBe original
|
||||
|
||||
val ipcBytes = original.saveArrowIPCToByteArray()
|
||||
val fromIpc = DataFrame.readArrowIPC(ipcBytes)
|
||||
fromIpc shouldBe original
|
||||
}
|
||||
}
|
||||
+208
@@ -0,0 +1,208 @@
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import io.kotest.matchers.shouldBe
|
||||
import kotlinx.datetime.LocalDate
|
||||
import kotlinx.datetime.LocalDateTime
|
||||
import kotlinx.datetime.LocalTime
|
||||
import kotlinx.datetime.toKotlinLocalDate
|
||||
import kotlinx.datetime.toKotlinLocalDateTime
|
||||
import kotlinx.datetime.toKotlinLocalTime
|
||||
import org.jetbrains.kotlinx.dataframe.AnyFrame
|
||||
import org.jetbrains.kotlinx.dataframe.DataColumn
|
||||
import org.jetbrains.kotlinx.dataframe.api.forEachIndexed
|
||||
import java.math.BigInteger
|
||||
import java.time.ZoneOffset
|
||||
import kotlin.math.absoluteValue
|
||||
import kotlin.math.pow
|
||||
import kotlin.reflect.full.withNullability
|
||||
import kotlin.reflect.typeOf
|
||||
import java.time.LocalDate as JavaLocalDate
|
||||
import java.time.LocalDateTime as JavaLocalDateTime
|
||||
import java.time.LocalTime as JavaLocalTime
|
||||
|
||||
/**
|
||||
* Assert that we have got the same data that was originally saved on example creation.
|
||||
* Example generation project is currently located at https://github.com/Kopilov/arrow_example
|
||||
*/
|
||||
internal fun assertEstimations(
|
||||
exampleFrame: AnyFrame,
|
||||
expectedNullable: Boolean,
|
||||
hasNulls: Boolean,
|
||||
fromParquet: Boolean = false,
|
||||
) {
|
||||
/**
|
||||
* In [exampleFrame] we get two concatenated batches. To assert the estimations, we should transform frame row number to batch row number
|
||||
*/
|
||||
fun iBatch(iFrame: Int): Int {
|
||||
val firstBatchSize = 100
|
||||
return if (iFrame < firstBatchSize) iFrame else iFrame - firstBatchSize
|
||||
}
|
||||
|
||||
fun expectedNull(rowNumber: Int): Boolean = (rowNumber + 1) % 5 == 0
|
||||
|
||||
fun assertValueOrNull(rowNumber: Int, actual: Any?, expected: Any) {
|
||||
if (hasNulls && expectedNull(rowNumber)) {
|
||||
actual shouldBe null
|
||||
} else {
|
||||
actual shouldBe expected
|
||||
}
|
||||
}
|
||||
|
||||
val asciiStringCol = exampleFrame["asciiString"] as DataColumn<String?>
|
||||
asciiStringCol.type() shouldBe typeOf<String>().withNullability(expectedNullable)
|
||||
asciiStringCol.forEachIndexed { i, element ->
|
||||
assertValueOrNull(iBatch(i), element, "Test Example ${iBatch(i)}")
|
||||
}
|
||||
|
||||
val utf8StringCol = exampleFrame["utf8String"] as DataColumn<String?>
|
||||
utf8StringCol.type() shouldBe typeOf<String>().withNullability(expectedNullable)
|
||||
utf8StringCol.forEachIndexed { i, element ->
|
||||
assertValueOrNull(iBatch(i), element, "Тестовый пример ${iBatch(i)}")
|
||||
}
|
||||
|
||||
val largeStringCol = exampleFrame["largeString"] as DataColumn<String?>
|
||||
largeStringCol.type() shouldBe typeOf<String>().withNullability(expectedNullable)
|
||||
largeStringCol.forEachIndexed { i, element ->
|
||||
assertValueOrNull(iBatch(i), element, "Test Example Should Be Large ${iBatch(i)}")
|
||||
}
|
||||
|
||||
val booleanCol = exampleFrame["boolean"] as DataColumn<Boolean?>
|
||||
booleanCol.type() shouldBe typeOf<Boolean>().withNullability(expectedNullable)
|
||||
booleanCol.forEachIndexed { i, element ->
|
||||
assertValueOrNull(iBatch(i), element, iBatch(i) % 2 == 0)
|
||||
}
|
||||
|
||||
val byteCol = exampleFrame["byte"] as DataColumn<Byte?>
|
||||
byteCol.type() shouldBe typeOf<Byte>().withNullability(expectedNullable)
|
||||
byteCol.forEachIndexed { i, element ->
|
||||
assertValueOrNull(iBatch(i), element, (iBatch(i) * 10).toByte())
|
||||
}
|
||||
|
||||
val shortCol = exampleFrame["short"] as DataColumn<Short?>
|
||||
shortCol.type() shouldBe typeOf<Short>().withNullability(expectedNullable)
|
||||
shortCol.forEachIndexed { i, element ->
|
||||
assertValueOrNull(iBatch(i), element, (iBatch(i) * 1000).toShort())
|
||||
}
|
||||
|
||||
val intCol = exampleFrame["int"] as DataColumn<Int?>
|
||||
intCol.type() shouldBe typeOf<Int>().withNullability(expectedNullable)
|
||||
intCol.forEachIndexed { i, element ->
|
||||
assertValueOrNull(iBatch(i), element, iBatch(i) * 100000000)
|
||||
}
|
||||
|
||||
val longCol = exampleFrame["longInt"] as DataColumn<Long?>
|
||||
longCol.type() shouldBe typeOf<Long>().withNullability(expectedNullable)
|
||||
longCol.forEachIndexed { i, element ->
|
||||
assertValueOrNull(iBatch(i), element, iBatch(i) * 100000000000000000L)
|
||||
}
|
||||
|
||||
val unsignedByteCol = exampleFrame["unsigned_byte"] as DataColumn<Short?>
|
||||
unsignedByteCol.type() shouldBe typeOf<Short>().withNullability(expectedNullable)
|
||||
unsignedByteCol.forEachIndexed { i, element ->
|
||||
assertValueOrNull(iBatch(i), element, (iBatch(i) * 10 % (Byte.MIN_VALUE.toShort() * 2).absoluteValue).toShort())
|
||||
}
|
||||
|
||||
val unsignedShortCol = exampleFrame["unsigned_short"] as DataColumn<Int?>
|
||||
unsignedShortCol.type() shouldBe typeOf<Int>().withNullability(expectedNullable)
|
||||
unsignedShortCol.forEachIndexed { i, element ->
|
||||
assertValueOrNull(iBatch(i), element, iBatch(i) * 1000 % (Short.MIN_VALUE.toInt() * 2).absoluteValue)
|
||||
}
|
||||
|
||||
val unsignedIntCol = exampleFrame["unsigned_int"] as DataColumn<Long?>
|
||||
unsignedIntCol.type() shouldBe typeOf<Long>().withNullability(expectedNullable)
|
||||
unsignedIntCol.forEachIndexed { i, element ->
|
||||
assertValueOrNull(
|
||||
rowNumber = iBatch(i),
|
||||
actual = element,
|
||||
expected = iBatch(i).toLong() * 100000000 % (Int.MIN_VALUE.toLong() * 2).absoluteValue,
|
||||
)
|
||||
}
|
||||
|
||||
val unsignedLongIntCol = exampleFrame["unsigned_longInt"] as DataColumn<BigInteger?>
|
||||
unsignedLongIntCol.type() shouldBe typeOf<BigInteger>().withNullability(expectedNullable)
|
||||
unsignedLongIntCol.forEachIndexed { i, element ->
|
||||
assertValueOrNull(
|
||||
rowNumber = iBatch(i),
|
||||
actual = element,
|
||||
expected = iBatch(i).toBigInteger() * 100000000000000000L.toBigInteger() %
|
||||
(Long.MIN_VALUE.toBigInteger() * 2.toBigInteger()).abs(),
|
||||
)
|
||||
}
|
||||
|
||||
val floatCol = exampleFrame["float"] as DataColumn<Float?>
|
||||
floatCol.type() shouldBe typeOf<Float>().withNullability(expectedNullable)
|
||||
floatCol.forEachIndexed { i, element ->
|
||||
assertValueOrNull(iBatch(i), element, 2.0f.pow(iBatch(i).toFloat()))
|
||||
}
|
||||
|
||||
val doubleCol = exampleFrame["double"] as DataColumn<Double?>
|
||||
doubleCol.type() shouldBe typeOf<Double>().withNullability(expectedNullable)
|
||||
doubleCol.forEachIndexed { i, element ->
|
||||
assertValueOrNull(iBatch(i), element, 2.0.pow(iBatch(i)))
|
||||
}
|
||||
|
||||
val dateCol = exampleFrame["date32"] as DataColumn<LocalDate?>
|
||||
dateCol.type() shouldBe typeOf<LocalDate>().withNullability(expectedNullable)
|
||||
dateCol.forEachIndexed { i, element ->
|
||||
assertValueOrNull(iBatch(i), element, JavaLocalDate.ofEpochDay(iBatch(i).toLong() * 30).toKotlinLocalDate())
|
||||
}
|
||||
|
||||
if (fromParquet) {
|
||||
// parquet format have only one type of date: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#date without time
|
||||
val datetimeCol = exampleFrame["date64"] as DataColumn<LocalDate?>
|
||||
datetimeCol.type() shouldBe typeOf<LocalDate>().withNullability(expectedNullable)
|
||||
datetimeCol.forEachIndexed { i, element ->
|
||||
assertValueOrNull(iBatch(i), element, JavaLocalDate.ofEpochDay(iBatch(i).toLong() * 30).toKotlinLocalDate())
|
||||
}
|
||||
} else {
|
||||
val datetimeCol = exampleFrame["date64"] as DataColumn<LocalDateTime?>
|
||||
datetimeCol.type() shouldBe typeOf<LocalDateTime>().withNullability(expectedNullable)
|
||||
datetimeCol.forEachIndexed { i, element ->
|
||||
assertValueOrNull(
|
||||
rowNumber = iBatch(i),
|
||||
actual = element,
|
||||
expected = JavaLocalDateTime.ofEpochSecond(
|
||||
iBatch(i).toLong() * 60 * 60 * 24 * 30,
|
||||
0,
|
||||
ZoneOffset.UTC,
|
||||
).toKotlinLocalDateTime(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
val timeSecCol = exampleFrame["time32_seconds"] as DataColumn<LocalTime?>
|
||||
timeSecCol.type() shouldBe typeOf<LocalTime>().withNullability(expectedNullable)
|
||||
timeSecCol.forEachIndexed { i, element ->
|
||||
assertValueOrNull(iBatch(i), element, JavaLocalTime.ofSecondOfDay(iBatch(i).toLong()).toKotlinLocalTime())
|
||||
}
|
||||
|
||||
val timeMilliCol = exampleFrame["time32_milli"] as DataColumn<LocalTime?>
|
||||
timeMilliCol.type() shouldBe typeOf<LocalTime>().withNullability(expectedNullable)
|
||||
timeMilliCol.forEachIndexed { i, element ->
|
||||
assertValueOrNull(
|
||||
rowNumber = iBatch(i),
|
||||
actual = element,
|
||||
expected = JavaLocalTime.ofNanoOfDay(iBatch(i).toLong() * 1000_000).toKotlinLocalTime(),
|
||||
)
|
||||
}
|
||||
|
||||
val timeMicroCol = exampleFrame["time64_micro"] as DataColumn<LocalTime?>
|
||||
timeMicroCol.type() shouldBe typeOf<LocalTime>().withNullability(expectedNullable)
|
||||
timeMicroCol.forEachIndexed { i, element ->
|
||||
assertValueOrNull(iBatch(i), element, JavaLocalTime.ofNanoOfDay(iBatch(i).toLong() * 1000).toKotlinLocalTime())
|
||||
}
|
||||
|
||||
val timeNanoCol = exampleFrame["time64_nano"] as DataColumn<LocalTime?>
|
||||
timeNanoCol.type() shouldBe typeOf<LocalTime>().withNullability(expectedNullable)
|
||||
timeNanoCol.forEachIndexed { i, element ->
|
||||
assertValueOrNull(iBatch(i), element, JavaLocalTime.ofNanoOfDay(iBatch(i).toLong()).toKotlinLocalTime())
|
||||
}
|
||||
|
||||
exampleFrame.getColumnOrNull("nulls")?.let { nullCol ->
|
||||
nullCol.type() shouldBe nothingType(hasNulls)
|
||||
assert(hasNulls)
|
||||
nullCol.values().forEach {
|
||||
assert(it == null)
|
||||
}
|
||||
}
|
||||
}
|
||||
Vendored
+206
@@ -0,0 +1,206 @@
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import kotlinx.datetime.LocalDate
|
||||
import org.jetbrains.kotlinx.dataframe.DataColumn
|
||||
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
|
||||
import java.net.URI
|
||||
|
||||
/**
|
||||
* DataFrame to be saved in Apache Arrow
|
||||
*/
|
||||
val citiesExampleFrame = dataFrameOf(
|
||||
DataColumn.createValueColumn(
|
||||
"name",
|
||||
listOf(
|
||||
"Berlin",
|
||||
"Hamburg",
|
||||
"New York",
|
||||
"Washington",
|
||||
"Saint Petersburg",
|
||||
"Vatican",
|
||||
),
|
||||
),
|
||||
DataColumn.createValueColumn(
|
||||
"affiliation",
|
||||
listOf(
|
||||
"Germany",
|
||||
"Germany",
|
||||
"The USA",
|
||||
"The USA",
|
||||
"Russia",
|
||||
null,
|
||||
),
|
||||
),
|
||||
DataColumn.createValueColumn(
|
||||
"is_capital",
|
||||
listOf(
|
||||
true,
|
||||
false,
|
||||
false,
|
||||
true,
|
||||
false,
|
||||
null,
|
||||
),
|
||||
),
|
||||
DataColumn.createValueColumn(
|
||||
"population",
|
||||
listOf(
|
||||
3_769_495,
|
||||
1_845_229,
|
||||
8_467_513,
|
||||
689_545,
|
||||
5_377_503,
|
||||
825,
|
||||
),
|
||||
),
|
||||
DataColumn.createValueColumn(
|
||||
"area",
|
||||
listOf(
|
||||
891.7,
|
||||
755.22,
|
||||
1223.59,
|
||||
177.0,
|
||||
1439.0,
|
||||
0.44,
|
||||
),
|
||||
),
|
||||
DataColumn.createValueColumn(
|
||||
"settled",
|
||||
listOf(
|
||||
LocalDate(1237, 1, 1),
|
||||
LocalDate(1189, 5, 7),
|
||||
LocalDate(1624, 1, 1),
|
||||
LocalDate(1790, 7, 16),
|
||||
LocalDate(1703, 5, 27),
|
||||
LocalDate(1929, 2, 11),
|
||||
),
|
||||
),
|
||||
DataColumn.createValueColumn(
|
||||
"page_in_wiki",
|
||||
listOf(
|
||||
URI("https://en.wikipedia.org/wiki/Berlin").toURL(),
|
||||
URI("https://en.wikipedia.org/wiki/Hamburg").toURL(),
|
||||
URI("https://en.wikipedia.org/wiki/New_York_City").toURL(),
|
||||
URI("https://en.wikipedia.org/wiki/Washington,_D.C.").toURL(),
|
||||
URI("https://en.wikipedia.org/wiki/Saint_Petersburg").toURL(),
|
||||
URI("https://en.wikipedia.org/wiki/Vatican_City").toURL(),
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
/**
|
||||
* [citiesExampleFrame] Apache Arrow schema with some changes.
|
||||
* Originally generated by `citiesExampleFrame.columns().toArrowSchema().toJson()`
|
||||
* Changes made to test converting and schema matching:
|
||||
* field "population" changed to nullable Long;
|
||||
* field "area" changed to single Float;
|
||||
* field "settled" changed to datetime (date with millisecond precision);
|
||||
* field "page_in_wiki" removed, nullable field "film_in_youtube" added.
|
||||
*/
|
||||
val citiesExampleSchema =
|
||||
"""
|
||||
{
|
||||
"fields" : [ {
|
||||
"name" : "name",
|
||||
"nullable" : false,
|
||||
"type" : {
|
||||
"name" : "utf8"
|
||||
},
|
||||
"children" : [ ]
|
||||
}, {
|
||||
"name" : "affiliation",
|
||||
"nullable" : true,
|
||||
"type" : {
|
||||
"name" : "utf8"
|
||||
},
|
||||
"children" : [ ]
|
||||
}, {
|
||||
"name" : "is_capital",
|
||||
"nullable" : true,
|
||||
"type" : {
|
||||
"name" : "bool"
|
||||
},
|
||||
"children" : [ ]
|
||||
}, {
|
||||
"name" : "population",
|
||||
"nullable" : true,
|
||||
"type" : {
|
||||
"name" : "int",
|
||||
"bitWidth" : 64,
|
||||
"isSigned" : true
|
||||
},
|
||||
"children" : [ ]
|
||||
}, {
|
||||
"name" : "area",
|
||||
"nullable" : false,
|
||||
"type" : {
|
||||
"name" : "floatingpoint",
|
||||
"precision" : "SINGLE"
|
||||
},
|
||||
"children" : [ ]
|
||||
}, {
|
||||
"name" : "settled",
|
||||
"nullable" : false,
|
||||
"type" : {
|
||||
"name" : "date",
|
||||
"unit" : "MILLISECOND"
|
||||
},
|
||||
"children" : [ ]
|
||||
}, {
|
||||
"name" : "film_in_youtube",
|
||||
"nullable" : true,
|
||||
"type" : {
|
||||
"name" : "utf8"
|
||||
},
|
||||
"children" : [ ]
|
||||
} ]
|
||||
}
|
||||
""".trimIndent()
|
||||
|
||||
/**
|
||||
* String column (variable length vector) with size >1 MiB
|
||||
*/
|
||||
val bigStringColumn = run {
|
||||
val list = ArrayList<String>()
|
||||
for (i in 0 until 1024) {
|
||||
val row = StringBuilder()
|
||||
for (j in 0 until 64) {
|
||||
row.append("abcd")
|
||||
}
|
||||
list.add(row.toString())
|
||||
}
|
||||
for (i in 0 until 1024) {
|
||||
val row = StringBuilder()
|
||||
for (j in 0 until 64) {
|
||||
row.append("гдёж")
|
||||
}
|
||||
list.add(row.toString())
|
||||
}
|
||||
for (i in 0 until 1024) {
|
||||
val row = StringBuilder()
|
||||
for (j in 0 until 64) {
|
||||
row.append("αβγδ")
|
||||
}
|
||||
list.add(row.toString())
|
||||
}
|
||||
for (i in 0 until 1024) {
|
||||
val row = StringBuilder()
|
||||
for (j in 0 until 64) {
|
||||
row.append("正体字")
|
||||
}
|
||||
list.add(row.toString())
|
||||
}
|
||||
DataColumn.createValueColumn("bigStringColumn", list)
|
||||
}
|
||||
|
||||
val bigMixedColumn = run {
|
||||
val list = ArrayList<Any>()
|
||||
for (i in 0..32768) {
|
||||
list.add(i * i)
|
||||
}
|
||||
list.add("Dirty data")
|
||||
for (i in 32768 downTo 0) {
|
||||
list.add(i * i)
|
||||
}
|
||||
DataColumn.createValueColumn("bigMixedColumn", list)
|
||||
}
|
||||
Binary file not shown.
BIN
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user