init research
@@ -0,0 +1,100 @@
package tech.v3.libs;

import static tech.v3.Clj.*;
import clojure.lang.IFn;
import java.util.Map;

/**
 * Bindings to save/load datasets in the Apache Arrow streaming format. These bindings support
 * JDK-17, memory mapping, and per-column compression.
 *
 * Required dependencies:
 *
 *```clojure
 *[org.apache.arrow/arrow-vector "6.0.0"]
 *[org.lz4/lz4-java "1.8.0"]
 *[com.github.luben/zstd-jni "1.5.1-1"]
 *```
 */
public class Arrow {

  private Arrow(){}

  static final IFn dsToStreamFn = requiringResolve("tech.v3.libs.arrow", "dataset->stream!");
  static final IFn streamToDsFn = requiringResolve("tech.v3.libs.arrow", "stream->dataset");
  static final IFn dsSeqToStreamFn = requiringResolve("tech.v3.libs.arrow", "dataset-seq->stream!");
  static final IFn streamToDsSeqFn = requiringResolve("tech.v3.libs.arrow", "stream->dataset-seq");

  /**
   * Save a dataset to the Apache Arrow streaming format.
   *
   * Options:
   *
   * * `:strings-as-text?` - defaults to false - Save strings out to arrow files as text,
   *   without dictionaries. This works well if you want to load an arrow file in-place, or
   *   if you know the strings in your dataset are either really large or should not be in
   *   string tables.
   *
   * * `:compression` - Either `:zstd` or `:lz4`; defaults to no compression (nil).
   *   Per-column compression of the data can result in significant size savings
   *   (2x+) and thus significant time savings when transferring over the network.
   *   Using compression makes loading via mmap non-in-place - if you are going to use
   *   compression, mmap probably doesn't make sense on load and will most likely
   *   result in slower loading times. Zstd can also be passed in map form with an
   *   additional parameter, `:level`, which defaults to 3.
   *
   *```java
   * // Slightly higher compression than the default.
   *datasetToStream(ds, "data.arrow-ipc", hashmap(kw("compression"),
   *                                              hashmap(kw("compression-type"), kw("zstd"),
   *                                                      kw("level"), 5)));
   *```
   */
  public static void datasetToStream(Object ds, Object pathOrOutputStream, Object options) {
    dsToStreamFn.invoke(ds, pathOrOutputStream, options);
  }

  /**
   * Save a sequence of datasets to a single stream file. Datasets must either have matching
   * schemas, or downstream dataset column datatypes must be widenable to the initial
   * dataset column datatypes.
   *
   * For options see `datasetToStream`.
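   *
   * A minimal usage sketch (`ds1` and `ds2` are hypothetical datasets; `hashmap` and `kw`
   * come from `tech.v3.Clj` as in the example above):
   *
   *```java
   *datasetSeqToStream(java.util.Arrays.asList(ds1, ds2), "many.arrow-ipc",
   *                   hashmap(kw("compression"), kw("lz4")));
   *```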
   */
  public static void datasetSeqToStream(Iterable dsSeq, Object pathOrOutputStream, Object options) {
    dsSeqToStreamFn.invoke(dsSeq, pathOrOutputStream, options);
  }

  /**
   * Load an Apache Arrow streaming file, returning a single dataset. The file must contain
   * only a single record batch.
   *
   * Options:
   *
   * * `:open-type` - Either `:mmap` or `:input-stream`, defaulting to the slower but more
   *   robust `:input-stream` pathway. When using `:mmap`, resources will be released when the
   *   resource system dictates - see the documentation for [tech.v3.DType.stackResourceContext](https://cnuernber.github.io/dtype-next/javadoc/index.html).
   *   When using `:input-stream`, the stream will be closed when the lazy sequence is either
   *   fully realized or an exception is thrown.
   *
   * * `:close-input-stream?` - When using the `:input-stream` `:open-type`, close the input
   *   stream upon exception or when the stream is fully realized. Defaults to true.
   *
   * * `:integer-datetime-types?` - When true, arrow columns in the appropriate packed
   *   datatypes will be represented as their integer types as opposed to their respective
   *   packed types. For example, columns stored as `:epoch-days` will be returned to the user
   *   with datatype `:epoch-days` as opposed to `:packed-local-date`. This means reading
   *   values will return integers as opposed to `java.time.LocalDate`s.
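   *
   * A minimal usage sketch ("data.arrow-ipc" is a hypothetical file written by
   * `datasetToStream`):
   *
   *```java
   *Map ds = streamToDataset("data.arrow-ipc",
   *                         hashmap(kw("integer-datetime-types?"), true));
   *```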
   */
  public static Map streamToDataset(Object pathOrInputStream, Object options) {
    return (Map)streamToDsFn.invoke(pathOrInputStream, options);
  }

  /**
   * Load an Apache Arrow streaming file, returning a sequence of datasets, one for each
   * record batch. For options see `streamToDataset`.
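   *
   * A minimal sketch of iterating record batches (assuming "many.arrow-ipc" was written
   * by `datasetSeqToStream` and that an empty options map is acceptable):
   *
   *```java
   *for (Object ds : streamToDatasetSeq("many.arrow-ipc", hashmap())) {
   *  // each element is one record batch realized as a dataset
   *}
   *```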
   */
  public static Iterable streamToDatasetSeq(Object pathOrInputStream, Object options) {
    return (Iterable)streamToDsSeqFn.invoke(pathOrInputStream, options);
  }
}
@@ -0,0 +1,59 @@
package tech.v3.libs;

import static tech.v3.Clj.*;
import clojure.lang.IFn;
import java.util.Map;

/**
 * Read/write parquet files. Uses the standard hadoop parquet library. One aspect that
 * may be confusing is that when writing files the parquet system decides when to end
 * a record batch, so a single dataset may end up as a single parquet file with many
 * record batches.
 *
 * Note that in the required dependencies below I exclude the slf4j-log4j12 binding, which
 * would otherwise pull in log4j. tmd comes with logback-classic by default, which is less
 * featureful but far less of a security disaster than log4j. If you have a setup that
 * already uses a different slf4j backend then you should exclude logback-classic from
 * tmd's dependencies.
 *
 * You must disable debug logging, else the parquet system is unreasonably slow. See the
 * logging section of the [parquet namespace](https://techascent.github.io/tech.ml.dataset/tech.v3.libs.parquet.html).
 *
 * Required dependencies:
 *
 *```clojure
 *org.apache.parquet/parquet-hadoop {:mvn/version "1.12.0"
 *                                   :exclusions [org.slf4j/slf4j-log4j12]}
 *org.apache.hadoop/hadoop-common {:mvn/version "3.3.0"
 *                                 :exclusions [org.slf4j/slf4j-log4j12]}
 *;; We literally need this for 1 POJO formatting object.
 *org.apache.hadoop/hadoop-mapreduce-client-core {:mvn/version "3.3.0"
 *                                                :exclusions [org.slf4j/slf4j-log4j12]}
 *```
 */
public class Parquet {

  private Parquet(){}

  static final IFn dsToParquetFn = requiringResolve("tech.v3.libs.parquet", "ds->parquet");
  static final IFn dsSeqToParquetFn = requiringResolve("tech.v3.libs.parquet", "ds-seq->parquet");
  static final IFn parquetToDsSeqFn = requiringResolve("tech.v3.libs.parquet", "parquet->ds-seq");
  static final IFn parquetToDsFn = requiringResolve("tech.v3.libs.parquet", "parquet->ds");
  static final IFn parquetToMetadataSeq = requiringResolve("tech.v3.libs.parquet", "parquet->metadata-seq");

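  /**
   * Read a parquet file's metadata without loading the data, returning a sequence of
   * metadata maps (a sketch of the contract; the exact map contents come from the
   * underlying `parquet->metadata-seq` fn).
   */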
  public static Iterable parquetMetadata(String path) {
    return (Iterable)parquetToMetadataSeq.invoke(path);
  }

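  /**
   * Load a parquet file into a single dataset. For options see the parquet namespace
   * documentation linked above.
   *
   * A minimal usage sketch ("data.parquet" is a hypothetical local file; an empty options
   * map is assumed to be acceptable):
   *
   *```java
   *Map ds = parquetToDataset("data.parquet", hashmap());
   *```
   */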
  public static Map parquetToDataset(String path, Object options) {
    return (Map)parquetToDsFn.invoke(path, options);
  }

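  /**
   * Load a parquet file, returning a sequence of datasets, one per record batch.
   * For options see the parquet namespace documentation linked above.
   */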
  public static Iterable parquetToDatasetSeq(String path, Object options) {
    return (Iterable)parquetToDsSeqFn.invoke(path, options);
  }

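  /**
   * Save a dataset to a single parquet file. As noted in the class documentation, the
   * parquet system decides when to end record batches, so the file may contain many
   * record batches.
   */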
  public static void datasetToParquet(Object ds, String path, Object options) {
    dsToParquetFn.invoke(ds, path, options);
  }

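  /**
   * Save a sequence of datasets to a single parquet file. For options see the parquet
   * namespace documentation linked above.
   */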
  public static void datasetSeqToParquet(Iterable dsSeq, String path, Object options) {
    dsSeqToParquetFn.invoke(dsSeq, path, options);
  }
}