init research
@@ -0,0 +1,100 @@
package tech.v3.libs;

import static tech.v3.Clj.*;
import clojure.lang.IFn;
import java.util.Map;

/**
 * Bindings to save/load datasets in the Apache Arrow streaming format. These bindings support
 * JDK-17, memory mapping, and per-column compression.
 *
 * Required dependencies:
 *
 *```clojure
 *[org.apache.arrow/arrow-vector "6.0.0"]
 *[org.lz4/lz4-java "1.8.0"]
 *[com.github.luben/zstd-jni "1.5.1-1"]
 *```
 */
public class Arrow {

  private Arrow(){}

  static final IFn dsToStreamFn = requiringResolve("tech.v3.libs.arrow", "dataset->stream!");
  static final IFn streamToDsFn = requiringResolve("tech.v3.libs.arrow", "stream->dataset");
  static final IFn dsSeqToStreamFn = requiringResolve("tech.v3.libs.arrow", "dataset-seq->stream!");
  static final IFn streamToDsSeqFn = requiringResolve("tech.v3.libs.arrow", "stream->dataset-seq");

  /**
   * Save a dataset to the Apache Arrow streaming format.
   *
   * Options:
   *
   * * `:strings-as-text?` - defaults to false - Save strings out to arrow files as text,
   *   without dictionaries. This works well if you want to load an arrow file in-place, or
   *   if you know the strings in your dataset are either really large or should not be in
   *   string tables.
   *
   * * `:compression` - Either `:zstd` or `:lz4`; defaults to no compression (nil).
   *   Per-column compression of the data can result in significant size savings
   *   (2x+) and thus significant time savings when transferring over the network.
   *   Using compression makes loading via mmap non-in-place - if you are going to use
   *   compression, mmap probably doesn't make sense on load and will most likely
   *   result in slower loading times. Zstd can also be passed in map form with an
   *   additional parameter, `:level`, which defaults to 3.
   *
   *```java
   * // Slightly higher compression than the default.
   *datasetToStream(ds, "data.arrow-ipc", hashmap(kw("compression"),
   *                                              hashmap(kw("compression-type"), kw("zstd"),
   *                                                      kw("level"), 5)));
   *```
   */
  public static void datasetToStream(Object ds, Object pathOrOutputStream, Object options) {
    dsToStreamFn.invoke(ds, pathOrOutputStream, options);
  }

  /**
   * Save a sequence of datasets to a single stream file. Datasets must either have matching
   * schemas, or downstream dataset column datatypes must be widenable to the initial
   * dataset column datatypes.
   *
   * For options see `datasetToStream`.
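   *
   * A minimal usage sketch (`ds1` and `ds2` are hypothetical datasets; `hashmap` and `kw`
   * come from `tech.v3.Clj` as in the example above):
   *
   *```java
   *datasetSeqToStream(java.util.Arrays.asList(ds1, ds2), "many.arrow-ipc",
   *                   hashmap(kw("compression"), kw("lz4")));
   *```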
   */
  public static void datasetSeqToStream(Iterable dsSeq, Object pathOrOutputStream, Object options) {
    dsSeqToStreamFn.invoke(dsSeq, pathOrOutputStream, options);
  }

  /**
   * Load an Apache Arrow streaming file, returning a single dataset. The file must contain
   * only a single record batch.
   *
   * Options:
   *
   * * `:open-type` - Either `:mmap` or `:input-stream`, defaulting to the slower but more
   *   robust `:input-stream` pathway. When using `:mmap`, resources will be released when the
   *   resource system dictates - see the documentation for [tech.v3.DType.stackResourceContext](https://cnuernber.github.io/dtype-next/javadoc/index.html).
   *   When using `:input-stream`, the stream will be closed when the lazy sequence is either
   *   fully realized or an exception is thrown.
   *
   * * `:close-input-stream?` - When using the `:input-stream` `:open-type`, close the input
   *   stream upon exception or when the stream is fully realized. Defaults to true.
   *
   * * `:integer-datetime-types?` - When true, arrow columns in the appropriate packed
   *   datatypes will be represented as their integer types as opposed to their respective
   *   packed types. For example, columns stored as `:epoch-days` will be returned to the user
   *   with datatype `:epoch-days` as opposed to `:packed-local-date`. This means reading
   *   values will return integers as opposed to `java.time.LocalDate`s.
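   *
   * A minimal usage sketch ("data.arrow-ipc" is a hypothetical file written by
   * `datasetToStream`):
   *
   *```java
   *Map ds = streamToDataset("data.arrow-ipc",
   *                         hashmap(kw("integer-datetime-types?"), true));
   *```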
   */
  public static Map streamToDataset(Object pathOrInputStream, Object options) {
    return (Map)streamToDsFn.invoke(pathOrInputStream, options);
  }

  /**
   * Load an Apache Arrow streaming file, returning a sequence of datasets, one for each
   * record batch. For options see `streamToDataset`.
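   *
   * A minimal sketch of iterating record batches (assuming "many.arrow-ipc" was written
   * by `datasetSeqToStream` and that an empty options map is acceptable):
   *
   *```java
   *for (Object ds : streamToDatasetSeq("many.arrow-ipc", hashmap())) {
   *  // each element is one record batch realized as a dataset
   *}
   *```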
   */
  public static Iterable streamToDatasetSeq(Object pathOrInputStream, Object options) {
    return (Iterable)streamToDsSeqFn.invoke(pathOrInputStream, options);
  }
}
@@ -0,0 +1,59 @@
package tech.v3.libs;

import static tech.v3.Clj.*;
import clojure.lang.IFn;
import java.util.Map;

/**
 * Read/write parquet files. Uses the standard hadoop parquet library. One aspect that
 * may be confusing is that when writing files the parquet system decides when to end
 * a record batch, so a single dataset may end up as a single parquet file with many
 * record batches.
 *
 * Note that in the required dependencies below I exclude the slf4j-log4j12 binding, which
 * would otherwise pull in log4j. tmd comes with logback-classic by default, which is less
 * featureful but far less of a security disaster than log4j. If you have a setup that
 * already uses a different slf4j backend then you should exclude logback-classic from
 * tmd's dependencies.
 *
 * You must disable debug logging, else the parquet system is unreasonably slow. See the
 * logging section of the [parquet namespace](https://techascent.github.io/tech.ml.dataset/tech.v3.libs.parquet.html).
 *
 * Required dependencies:
 *
 *```clojure
 *org.apache.parquet/parquet-hadoop {:mvn/version "1.12.0"
 *                                   :exclusions [org.slf4j/slf4j-log4j12]}
 *org.apache.hadoop/hadoop-common {:mvn/version "3.3.0"
 *                                 :exclusions [org.slf4j/slf4j-log4j12]}
 *;; We literally need this for 1 POJO formatting object.
 *org.apache.hadoop/hadoop-mapreduce-client-core {:mvn/version "3.3.0"
 *                                                :exclusions [org.slf4j/slf4j-log4j12]}
 *```
 */
public class Parquet {

  private Parquet(){}

  static final IFn dsToParquetFn = requiringResolve("tech.v3.libs.parquet", "ds->parquet");
  static final IFn dsSeqToParquetFn = requiringResolve("tech.v3.libs.parquet", "ds-seq->parquet");
  static final IFn parquetToDsSeqFn = requiringResolve("tech.v3.libs.parquet", "parquet->ds-seq");
  static final IFn parquetToDsFn = requiringResolve("tech.v3.libs.parquet", "parquet->ds");
  static final IFn parquetToMetadataSeq = requiringResolve("tech.v3.libs.parquet", "parquet->metadata-seq");

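  /**
   * Read a parquet file's metadata without loading the data, returning a sequence of
   * metadata maps (a sketch of the contract; the exact map contents come from the
   * underlying `parquet->metadata-seq` fn).
   */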
  public static Iterable parquetMetadata(String path) {
    return (Iterable)parquetToMetadataSeq.invoke(path);
  }

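  /**
   * Load a parquet file into a single dataset. For options see the parquet namespace
   * documentation linked above.
   *
   * A minimal usage sketch ("data.parquet" is a hypothetical local file; an empty options
   * map is assumed to be acceptable):
   *
   *```java
   *Map ds = parquetToDataset("data.parquet", hashmap());
   *```
   */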
  public static Map parquetToDataset(String path, Object options) {
    return (Map)parquetToDsFn.invoke(path, options);
  }

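  /**
   * Load a parquet file, returning a sequence of datasets, one per record batch.
   * For options see the parquet namespace documentation linked above.
   */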
  public static Iterable parquetToDatasetSeq(String path, Object options) {
    return (Iterable)parquetToDsSeqFn.invoke(path, options);
  }

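  /**
   * Save a dataset to a single parquet file. As noted in the class documentation, the
   * parquet system decides when to end record batches, so the file may contain many
   * record batches.
   */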
  public static void datasetToParquet(Object ds, String path, Object options) {
    dsToParquetFn.invoke(ds, path, options);
  }

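  /**
   * Save a sequence of datasets to a single parquet file. For options see the parquet
   * namespace documentation linked above.
   */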
  public static void datasetSeqToParquet(Iterable dsSeq, String path, Object options) {
    dsSeqToParquetFn.invoke(dsSeq, path, options);
  }
}