// df-research/tech.ml.dataset/java_public_api/tech/v3/libs/Arrow.java

package tech.v3.libs;

import static tech.v3.Clj.*;
import clojure.lang.IFn;
import java.util.Map;


/**
 * Bindings to save and load datasets in the Apache Arrow streaming format.  These
 * bindings support JDK-17, memory mapping, and per-column compression.
 *
 * Every method is a thin static wrapper that forwards its arguments unchanged to the
 * function of the `tech.v3.libs.arrow` Clojure namespace named on the matching field.
 *
 * Required Dependencies:
 *
 *```clojure
 *[org.apache.arrow/arrow-vector "6.0.0"]
 *[org.lz4/lz4-java "1.8.0"]
 *[com.github.luben/zstd-jni "1.5.1-1"]
 *```
 */
public final class Arrow {

  // Static-only utility class - never instantiated.
  private Arrow(){}

  // Clojure functions looked up once, when this class is initialized.
  // NOTE(review): `requiringResolve` comes from tech.v3.Clj; presumably it loads the
  // namespace before resolving the var - confirm there.
  static final IFn dsToStreamFn = requiringResolve("tech.v3.libs.arrow", "dataset->stream!");
  static final IFn streamToDsFn = requiringResolve("tech.v3.libs.arrow", "stream->dataset");
  static final IFn dsSeqToStreamFn = requiringResolve("tech.v3.libs.arrow", "dataset-seq->stream!");
  static final IFn streamToDsSeqFn = requiringResolve("tech.v3.libs.arrow", "stream->dataset-seq");

  /**
   * Save a dataset in the Apache Arrow streaming format.
   *
   * Options:
   *
   * * `:strings-as-text?` - defaults to false - Save out strings into arrow files without
   *  dictionaries.  This works well if you want to load an arrow file in-place or if
   *  you know the strings in your dataset are either really large or should not be in
   *  string tables.
   *
   * * `:compression` - Either `:zstd` or `:lz4`, defaults to no compression (nil).
   * Per-column compression of the data can result in some significant size savings
   * (2x+) and thus some significant time savings when transferring over the network.
   * Using compression makes loading via mmap non-in-place - if you are going to use
   * compression, mmap probably doesn't make sense on load and most likely will
   * result in slower loading times.  Zstd can also be passed in map form with an
   * additional parameter, `:level`, which defaults to 3.
   *
   *
   *```java
   * //Slightly higher compression than the default.
   *datasetToStream(ds, "data.arrow-ipc", hashmap(kw("compression"),
   *                                              hashmap(kw("compression-type"), kw("zstd"),
   *                                                      kw("level"), 5)));
   *```
   *
   * @param ds the dataset to save.
   * @param pathOrInputStream despite its name, this is the destination that is written
   *        to - a file path as in the example above, or presumably an output stream
   *        (confirm against `tech.v3.libs.arrow/dataset->stream!`).
   * @param options map of the options described above; passed through unchanged.
   */
  public static void datasetToStream(Object ds, Object pathOrInputStream, Object options) {
    dsToStreamFn.invoke(ds, pathOrInputStream, options);
  }
  /**
   * Save a sequence of datasets to a single stream file.  Datasets must either have matching
   * schemas or downstream dataset column datatypes must be able to be widened to the initial
   * dataset column datatypes.
   *
   * For options see `datasetToStream`.
   *
   * @param dsSeq the datasets to save, in order.
   * @param pathOrInputStream despite its name, this is the destination that is written
   *        to - see `datasetToStream`.
   * @param options map of options; passed through unchanged.
   */
  public static void datasetSeqToStream(Iterable<?> dsSeq, Object pathOrInputStream, Object options) {
    dsSeqToStreamFn.invoke(dsSeq, pathOrInputStream, options);
  }
  /**
   * Load an Apache Arrow streaming file returning a single dataset.  File must only contain a
   * single record batch.
   *
   * Options:
   *
   * * `:open-type` - Either `:mmap` or `:input-stream` defaulting to the slower but more robust
   * `:input-stream` pathway.  When using `:mmap` resources will be released when the resource
   * system dictates - see documentation for [tech.v3.DType.stackResourceContext](https://cnuernber.github.io/dtype-next/javadoc/index.html).
   * When using `:input-stream` the stream will be closed when the lazy sequence is either fully
   * realized or an exception is thrown.
   *
   * * `:close-input-stream?` - When using `:input-stream` `:open-type`, close the input
   * stream upon exception or when stream is fully realized.  Defaults to true.
   *
   * * `:integer-datetime-types?` - when true arrow columns in the appropriate packed
   * datatypes will be represented as their integer types as opposed to their respective
   * packed types.  For example columns of type `:epoch-days` will be returned to the user
   * as datatype `:epoch-days` as opposed to `:packed-local-date`.  This means reading values
   * will return integers as opposed to `java.time.LocalDate`s.
   *
   * @param pathOrInputStream the source to read from - a path or an input stream.
   * @param options map of the options described above; passed through unchanged.
   * @return the loaded dataset, cast to `Map`.
   */
  public static Map streamToDataset(Object pathOrInputStream, Object options) {
    return (Map)streamToDsFn.invoke(pathOrInputStream, options);
  }

  /**
   * Load an Apache Arrow streaming file returning a sequence of datasets, one for each
   * record batch.  For options see `streamToDataset`.
   *
   * @param pathOrInputStream the source to read from - a path or an input stream.
   * @param options map of options; passed through unchanged.
   * @return the datasets, cast to `Iterable`.  The return type is left raw so that
   *         existing callers assigning it to a parameterized `Iterable` keep compiling.
   */
  public static Iterable streamToDatasetSeq(Object pathOrInputStream, Object options) {
    return (Iterable)streamToDsSeqFn.invoke(pathOrInputStream, options);
  }
}