Files
2026-02-08 11:20:43 -10:00

60 lines
2.6 KiB
Java
Vendored

package tech.v3.libs;
import static tech.v3.Clj.*;
import clojure.lang.IFn;
import java.util.Map;
/**
* Read/write parquet files. Uses the standard hadoop parquet library. One aspect that
* may be confusing is that when writing files the parquet system decides when to end
* the record batch so a single dataset may end up as a single parquet file with many
* record batches.
*
* Note that the required dependencies listed below exclude the slf4j-log4j12 binding. tmd
* comes with logback-classic by default, which is less featureful but far less of a security
* disaster than slf4j. If you have a setup that already uses slf4j then you should exclude
* logback-classic from tmd's dependencies.
*
* You must disable debug logging, otherwise the parquet system is unreasonably slow. See the
* logging section of the [parquet namespace](https://techascent.github.io/tech.ml.dataset/tech.v3.libs.parquet.html).
*
* Required dependencies:
*
*```clojure
*org.apache.parquet/parquet-hadoop {:mvn/version "1.12.0"
* :exclusions [org.slf4j/slf4j-log4j12]}
*org.apache.hadoop/hadoop-common {:mvn/version "3.3.0"
* :exclusions [org.slf4j/slf4j-log4j12]}
*;; We literally need this for 1 POJO formatting object.
*org.apache.hadoop/hadoop-mapreduce-client-core {:mvn/version "3.3.0"
* :exclusions [org.slf4j/slf4j-log4j12]}
*```
*/
public class Parquet
{
  // Clojure namespace that owns every function this class delegates to.
  private static final String PARQUET_NS = "tech.v3.libs.parquet";

  // Handles to the Clojure functions, looked up once during class initialization
  // through the statically imported requiringResolve helper.
  static final IFn dsToParquetFn = requiringResolve(PARQUET_NS, "ds->parquet");
  static final IFn dsSeqToParquetFn = requiringResolve(PARQUET_NS, "ds-seq->parquet");
  static final IFn parquetToDsSeqFn = requiringResolve(PARQUET_NS, "parquet->ds-seq");
  static final IFn parquetToDsFn = requiringResolve(PARQUET_NS, "parquet->ds");
  static final IFn parquetToMetadataSeq = requiringResolve(PARQUET_NS, "parquet->metadata-seq");

  // Only static members are exposed; instances are never needed.
  private Parquet(){}

  /**
   * Calls the Clojure function {@code tech.v3.libs.parquet/parquet->metadata-seq} with
   * the given path.
   *
   * @param path handed unchanged to the Clojure function
   * @return the value produced by the Clojure function, cast to {@link Iterable}
   */
  public static Iterable parquetMetadata(String path) {
    final Object metadata = parquetToMetadataSeq.invoke(path);
    return (Iterable)metadata;
  }

  /**
   * Calls the Clojure function {@code tech.v3.libs.parquet/parquet->ds} with the given
   * path and options.
   *
   * @param path handed unchanged to the Clojure function
   * @param options handed unchanged to the Clojure function
   * @return the value produced by the Clojure function, cast to {@link Map}
   */
  public static Map parquetToDataset(String path, Object options) {
    final Object dataset = parquetToDsFn.invoke(path, options);
    return (Map)dataset;
  }

  /**
   * Calls the Clojure function {@code tech.v3.libs.parquet/parquet->ds-seq} with the
   * given path and options.
   *
   * @param path handed unchanged to the Clojure function
   * @param options handed unchanged to the Clojure function
   * @return the value produced by the Clojure function, cast to {@link Iterable}
   */
  public static Iterable parquetToDatasetSeq(String path, Object options) {
    final Object datasets = parquetToDsSeqFn.invoke(path, options);
    return (Iterable)datasets;
  }

  /**
   * Calls the Clojure function {@code tech.v3.libs.parquet/ds->parquet} with the dataset,
   * path and options, in that order. Whatever the Clojure function returns is discarded.
   *
   * @param ds handed unchanged to the Clojure function
   * @param path handed unchanged to the Clojure function
   * @param options handed unchanged to the Clojure function
   */
  public static void datasetToParquet(Object ds, String path, Object options) {
    dsToParquetFn.invoke(ds, path, options);
  }

  /**
   * Calls the Clojure function {@code tech.v3.libs.parquet/ds-seq->parquet} with the
   * dataset sequence, path and options, in that order. Whatever the Clojure function
   * returns is discarded.
   *
   * @param dsSeq handed unchanged to the Clojure function
   * @param path handed unchanged to the Clojure function
   * @param options handed unchanged to the Clojure function
   */
  public static void datasetSeqToParquet(Iterable dsSeq, String path, Object options) {
    dsSeqToParquetFn.invoke(dsSeq, path, options);
  }
}