init research
This commit is contained in:
+892
@@ -0,0 +1,892 @@
|
||||
package tech.v3;
|
||||
|
||||
|
||||
import static tech.v3.Clj.*;
|
||||
import clojure.lang.Keyword;
|
||||
import clojure.lang.IFn;
|
||||
import java.util.Map;
|
||||
import java.util.Comparator;
|
||||
import org.roaringbitmap.RoaringBitmap;
|
||||
import tech.v3.datatype.Buffer;
|
||||
import tech.v3.datatype.NDBuffer;
|
||||
import ham_fisted.IFnDef;
|
||||
|
||||
|
||||
/**
|
||||
* `tech.ml.dataset` is a high performance library for processing columnar data similar to
|
||||
* pandas or R's data.table. Datasets are `maps` of their columns and columns derive from
|
||||
* various Clojure interfaces such as IIndexed and IFn to make accessing their data as easy
|
||||
* as possible.
|
||||
*
|
||||
* Columns have a conversion to a `tech.v3.datatype.Buffer` object accessible via
|
||||
* `tech.v3.DType.toBuffer()` so if you want higher performance non-boxing access that is
|
||||
* also available. Any bit of sequential data can be turned into a column. The best way
|
||||
* is if the data is already in a primitive array or nio buffer use that as a column - it
|
||||
* will be used in place. It is also possible to directly instantiate a Buffer object in
|
||||
* a read-only pathway to create a virtualized column:
|
||||
*
|
||||
*```java
|
||||
*println(head(assoc(colmapDs, kw("c"), new tech.v3.datatype.LongReader() {
|
||||
* public long lsize() { return 10; }
|
||||
* public long readLong( long idx) {
|
||||
* return 2*idx;
|
||||
* }
|
||||
* })));
|
||||
* //testds [5 3]:
|
||||
*
|
||||
* //| :b | :a | :c |
|
||||
* //|----:|---:|---:|
|
||||
* //| 9.0 | 0 | 0 |
|
||||
* //| 8.0 | 1 | 2 |
|
||||
* //| 7.0 | 2 | 4 |
|
||||
* //| 6.0 | 3 | 6 |
|
||||
* //| 5.0 | 4 | 8 |
|
||||
*```
|
||||
*
|
||||
* Datasets implement a subset of java.util.Map and clojure's persistent map interfaces.
|
||||
* This means you can use various `java.util.Map` functions and you can also use
|
||||
* `clojure.core/assoc`, `clojure.core/dissoc`, and `clojure.core/merge` in order to add and
|
||||
* remove columns from the dataset. These are exposed in `tech.v3.Clj` as equivalently named
|
||||
* functions. In combination with the fact that columns implement Clojure.lang.IIndexed
|
||||
* providing `nth` as well as the single arity IFn invoke method you can do a surprising
|
||||
* amount of dataset processing without using bespoke TMD functions at all.
|
||||
*
|
||||
* All of the functions in `tech.v3.datatype.VecMath` will work with column although most
|
||||
* of those functions require the columns to have no missing data. The recommendation is to
|
||||
* do your missing-value processing first and then move into the various elemwise functions.
|
||||
* Integer columns with missing values will upcast themselves to double columns for any math
|
||||
* operation so the result keeps consistent w/r/t NaN behavior. Again, ideally missing values
|
||||
* should be dealt with before doing operations in the `VecMath` namespace.
|
||||
*
|
||||
* Most of the functions of the dataset (filter, sort, groupBy) will auto-parallelize but
|
||||
* there are many times where the most efficient use of machine resources is to
|
||||
* parallelize at the outermost level using `pmapDs`. The parallelization primitives check and
|
||||
* run in serial mode if the current thread is already in a parallelization pathway.
|
||||
*
|
||||
*/
|
||||
public class TMD {
|
||||
// Private constructor: TMD is a static utility class and is never instantiated.
private TMD(){}
|
||||
|
||||
// --- Lazily resolved Clojure vars backing the public static wrappers below. ---
// requiringResolve (static import from tech.v3.Clj) loads the named namespace
// on demand and returns the var, which is invoked through `call`.

// Dataset construction, predicates, and basic accessors.
static final IFn toDatasetFn = requiringResolve("tech.v3.dataset-api", "->dataset");
static final IFn isDatasetFn = requiringResolve("tech.v3.dataset-api", "dataset?");
static final IFn rowCountFn = requiringResolve("tech.v3.dataset.base", "row-count");
static final IFn columnCountFn = requiringResolve("tech.v3.dataset.base", "column-count");
static final IFn columnFn = requiringResolve("tech.v3.dataset", "column");
static final IFn missingFn = requiringResolve("tech.v3.dataset-api", "missing");
// Row/column selection, renaming, and missing-value replacement.
static final IFn selectFn = requiringResolve("tech.v3.dataset-api", "select");
static final IFn selectRowsFn = requiringResolve("tech.v3.dataset-api", "select-rows");
static final IFn dropRowsFn = requiringResolve("tech.v3.dataset-api", "drop-rows");
static final IFn selectColumnsFn = requiringResolve("tech.v3.dataset-api", "select-columns");
static final IFn dropColumnsFn = requiringResolve("tech.v3.dataset-api", "drop-columns");
static final IFn renameColumnsFn = requiringResolve("tech.v3.dataset-api", "rename-columns");
static final IFn replaceMissingFn = requiringResolve("tech.v3.dataset.missing", "replace-missing");
// Row-major views and simple slicing/sampling of the dataset.
static final IFn rowsFn = requiringResolve("tech.v3.dataset", "rows");
static final IFn rowvecsFn = requiringResolve("tech.v3.dataset", "rowvecs");
static final IFn headFn = requiringResolve("tech.v3.dataset", "head");
static final IFn tailFn = requiringResolve("tech.v3.dataset", "tail");
static final IFn sampleFn = requiringResolve("tech.v3.dataset", "sample");
static final IFn shuffleFn = requiringResolve("tech.v3.dataset", "shuffle");
static final IFn reverseFn = requiringResolve("tech.v3.dataset", "reverse-rows");

// Column-wise and row-wise mapping, plus dataset-level parallel map.
static final IFn columnMapFn = requiringResolve("tech.v3.dataset", "column-map");
static final IFn rowMapFn = requiringResolve("tech.v3.dataset", "row-map");
static final IFn rowMapcatFn = requiringResolve("tech.v3.dataset", "row-mapcat");
static final IFn pmapDsFn = requiringResolve("tech.v3.dataset", "pmap-ds");

// Sorting, filtering, grouping, concatenation, and uniqueness.
static final IFn sortByFn = requiringResolve("tech.v3.dataset", "sort-by");
static final IFn sortByColumnFn = requiringResolve("tech.v3.dataset", "sort-by-column");
static final IFn filterFn = requiringResolve("tech.v3.dataset", "filter");
static final IFn filterColumnFn = requiringResolve("tech.v3.dataset", "filter-column");
static final IFn groupByFn = requiringResolve("tech.v3.dataset", "group-by");
static final IFn groupByColumnFn = requiringResolve("tech.v3.dataset", "group-by-column");
static final IFn concatCopyingFn = requiringResolve("tech.v3.dataset", "concat-copying");
static final IFn concatInplaceFn = requiringResolve("tech.v3.dataset", "concat-inplace");
static final IFn uniqueByFn = requiringResolve("tech.v3.dataset", "unique-by");
static final IFn uniqueByColumnFn = requiringResolve("tech.v3.dataset", "unique-by-column");

// Summary statistics.
static final IFn descriptiveStatsFn = requiringResolve("tech.v3.dataset", "descriptive-stats");

// Join operations.
static final IFn pdMergeFn = requiringResolve("tech.v3.dataset.join", "pd-merge");
static final IFn joinAsof = requiringResolve("tech.v3.dataset.join", "left-join-asof");

// Delays defer the (potentially expensive) load of optional integrations —
// neanderthal and tensor support — until first use.
static final Object toNeanderthalDelay = delay(new IFnDef() {
    public Object invoke() {
      //Bindings to make as-tensor work with neanderthal
      require("tech.v3.libs.neanderthal");
      //Actual function to convert a dataset into a neanderthal double or float matrix.
      return requiringResolve("tech.v3.dataset.neanderthal", "dataset->dense");
    }
  });
static final Object neanderthalToDatasetDelay = delay(new IFnDef() {
    public Object invoke() {
      //tensor bindings
      require("tech.v3.libs.neanderthal");
      return requiringResolve("tech.v3.dataset.neanderthal", "dense->dataset");
    }
  });
static final Object toTensorDelay = delay(new IFnDef() {
    public Object invoke() {
      return requiringResolve("tech.v3.dataset.tensor", "dataset->tensor");
    }
  });
static final Object tensorToDatasetDelay = delay(new IFnDef() {
    public Object invoke() {
      return requiringResolve("tech.v3.dataset.tensor", "tensor->dataset");
    }
  });

// Dataset serialization.
static final IFn writeFn = requiringResolve("tech.v3.dataset", "write!");
|
||||
|
||||
|
||||
/**
|
||||
* Basic pathway to take data and get back a datasets. If dsData is a string
|
||||
* a built in system can parse csv, tsv, csv.gz, tsv.gz, .json, json.gz and .nippy format files.
|
||||
* Specific other formats such as xlsx, apache arrow and parquet formats are provided
|
||||
* in other classes.
|
||||
*
|
||||
* Aside from string data formats, you can explicitly provide either a sequence of maps
|
||||
* or a map of columns with the map of columns being by far the most efficient.
|
||||
* In the map-of-columns approach arrays of primitive numeric data and native buffers will
|
||||
* be used in-place.
|
||||
*
|
||||
* The options for parsing a dataset are extensive and documented at
|
||||
* [->dataset](https://techascent.github.io/tech.ml.dataset/tech.v3.dataset.html#var--.3Edataset).
|
||||
*
|
||||
* Example:
|
||||
*
|
||||
*```java
|
||||
* Map ds = makeDataset("https://github.com/techascent/tech.ml.dataset/raw/master/test/data/stocks.csv");
|
||||
* tech.v3.Clj.println(head(ds));
|
||||
* // https://github.com/techascent/tech.ml.dataset/raw/master/test/data/stocks.csv [5 3]:
|
||||
* // | symbol | date | price |
|
||||
* // |--------|------------|------:|
|
||||
* // | MSFT | 2000-01-01 | 39.81 |
|
||||
* // | MSFT | 2000-02-01 | 36.35 |
|
||||
* // | MSFT | 2000-03-01 | 43.22 |
|
||||
* // | MSFT | 2000-04-01 | 28.37 |
|
||||
* // | MSFT | 2000-05-01 | 25.45 |
|
||||
*Map colmapDs = makeDataset(hashmap(kw("a"), range(10),
|
||||
* kw("b"), toDoubleArray(range(9,-1,-1))),
|
||||
* hashmap(kw("dataset-name"), "testds"));
|
||||
*println(colmapDs);
|
||||
* // testds [10 2]:
|
||||
*
|
||||
* // | :b | :a |
|
||||
* // |----:|---:|
|
||||
* // | 9.0 | 0 |
|
||||
* // | 8.0 | 1 |
|
||||
* // | 7.0 | 2 |
|
||||
* // | 6.0 | 3 |
|
||||
* // | 5.0 | 4 |
|
||||
* // | 4.0 | 5 |
|
||||
* // | 3.0 | 6 |
|
||||
* // | 2.0 | 7 |
|
||||
* // | 1.0 | 8 |
|
||||
* // | 0.0 | 9 |
|
||||
*```
|
||||
*/
|
||||
public static Map makeDataset(Object dsData, Map options) {
|
||||
return (Map)call(toDatasetFn, dsData, options);
|
||||
}
|
||||
/**
|
||||
* Make a dataset. See 2-arity form of function.
|
||||
*/
|
||||
public static Map makeDataset(Object dsData) {
|
||||
return (Map)call(toDatasetFn, dsData, null);
|
||||
}
|
||||
/**
|
||||
* Returns true if this object is a dataset.
|
||||
*/
|
||||
public static boolean isDataset(Object ds) {
|
||||
return (boolean)call(isDatasetFn, ds);
|
||||
}
|
||||
/** Return the number of rows. */
|
||||
public static long rowCount(Object ds) {
|
||||
return (long)call(rowCountFn, ds);
|
||||
}
|
||||
/** Return the number of columns. */
|
||||
public static long columnCount(Object ds) {
|
||||
return (long)call(columnCountFn, ds);
|
||||
}
|
||||
/** Return the column named `cname` else throw exception. */
|
||||
public static Object column(Object ds, Object cname) {
|
||||
return call(columnFn, ds, cname);
|
||||
}
|
||||
/**
|
||||
* Efficiently create a column definition explicitly specifying name and data.
|
||||
* Typed data will be scanned for missing values and untyped data will be read
|
||||
* element by element to discern datatype and missing information.
|
||||
* The result can be `assoc`d back into the dataset.
|
||||
*/
|
||||
public static Map columnDef(Object name, Object data) {
|
||||
return hashmap(keyword("tech.v3.dataset", "name"), name,
|
||||
keyword("tech.v3.dataset", "data"), data);
|
||||
}
|
||||
/**
|
||||
* Efficiently create a column definition explicitly specifying name, data, and missing.
|
||||
* The result can be `assoc`d back into the dataset. Missing will be converted to a RoaringBitmap
|
||||
* but can additionally be an integer array, a java set, or a sequence of integers.
|
||||
*/
|
||||
public static Map columnDef(Object name, Object data, Object missing) {
|
||||
return hashmap(keyword("tech.v3.dataset", "name"), name,
|
||||
keyword("tech.v3.dataset", "data"), data,
|
||||
keyword("tech.v3.dataset", "missing"), missing);
|
||||
}
|
||||
/**
|
||||
* Efficiently create a column definition explicitly specifying name, data, missing, and
|
||||
* metadata. The result can be `assoc`d back into the dataset and saves the system the
|
||||
* time required to scan for missing elements. Missing will be converted to a RoaringBitmap
|
||||
* but can additionally be an integer array, a java set, or a sequence of integers.
|
||||
*/
|
||||
public static Map columnDef(Object name, Object data, Object missing, Object metadata) {
|
||||
return hashmap(keyword("tech.v3.dataset", "name"), name,
|
||||
keyword("tech.v3.dataset", "data"), data,
|
||||
keyword("tech.v3.dataset", "missing"), missing,
|
||||
keyword("tech.v3.dataset", "metadata"), metadata);
|
||||
}
|
||||
/**
|
||||
* Select a sub-rect of the dataset. Dataset names is a sequence of column names that must
|
||||
* exist in the dataset. Rows is a sequence, list, array, or bitmap of integer row
|
||||
* indexes to select. Dataset returned has column in the order specified
|
||||
* by `columnNames`.
|
||||
*/
|
||||
public static Map select(Object ds, Object columnNames, Object rows) {
|
||||
return (Map)call(selectFn, ds, columnNames, rows);
|
||||
}
|
||||
/**
|
||||
* Select columns by name. All names must exist in the dataset.
|
||||
*/
|
||||
public static Map selectColumns(Object ds, Object columnNames ) {
|
||||
return (Map)call(selectColumnsFn, ds, columnNames);
|
||||
}
|
||||
/**
|
||||
* Drop columns by name. All names must exist in the dataset.
|
||||
* Another option is to use the Clojure function `dissoc`.
|
||||
*/
|
||||
public static Map dropColumns(Object ds, Object columnNames ) {
|
||||
return (Map)call(dropColumnsFn, ds, columnNames);
|
||||
}
|
||||
/**
|
||||
* Rename columns providing a map of oldname to newname.
|
||||
*/
|
||||
public static Map renameColumns(Object ds, Map renameMap) {
|
||||
return (Map)call(renameColumnsFn, ds, renameMap);
|
||||
}
|
||||
/**
|
||||
* Select rows by index.
|
||||
*/
|
||||
public static Map selectRows(Object ds, Object rowIndexes) {
|
||||
return (Map)call(selectRowsFn, ds, rowIndexes);
|
||||
}
|
||||
/**
|
||||
* Drop rows by index.
|
||||
*/
|
||||
public static Map dropRows(Object ds, Object rowIndexes) {
|
||||
return (Map)call(dropRowsFn, ds, rowIndexes);
|
||||
}
|
||||
/**
|
||||
* Return the missing set of a dataset or a column in the form of a RoaringBitmap.
|
||||
*/
|
||||
public static RoaringBitmap missing(Object dsOrColumn) {
|
||||
return (RoaringBitmap)call(missingFn, dsOrColumn);
|
||||
}
|
||||
/**
|
||||
* Replace the missing values from a column or set of columns. To replace across
|
||||
* all columns use the keyword :all.
|
||||
*
|
||||
* Strategy can be:
|
||||
*
|
||||
* * `:up` - take next value
|
||||
* * `:down` - take previous value
|
||||
* * `:lerp` - linearly interpolate across values. Datetime objects will have
|
||||
* interpolation in done in millisecond space.
|
||||
* * `vector(:value, val)` - Provide this value explicity to replace entries.
|
||||
* * `:nearest` - use the nearest value.
|
||||
* * `:midpoint` - use the mean of the range.
|
||||
* * `:abb` - impute missing values using approximate bayesian bootstrap.
|
||||
*
|
||||
* Further documentation is located at [replace-missing](https://techascent.github.io/tech.ml.dataset/tech.v3.dataset.html#var-replace-missing).
|
||||
*/
|
||||
public static Map replaceMissing(Object ds, Object strategy, Object columns) {
|
||||
Keyword actualStrat;
|
||||
Object value;
|
||||
if (isVector(strategy)) {
|
||||
actualStrat = (Keyword)call(strategy,0);
|
||||
value = call(strategy,1);
|
||||
}
|
||||
else {
|
||||
actualStrat = (Keyword)strategy;
|
||||
value = null;
|
||||
}
|
||||
if (value != null ) {
|
||||
return (Map)call(replaceMissingFn, ds, columns, actualStrat, value);
|
||||
} else {
|
||||
return (Map)call(replaceMissingFn, ds, columns, actualStrat);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Replace missing values. See 3-arity form of function for documentation.
|
||||
*/
|
||||
public static Map replaceMissing(Object ds, Object strategy) {
|
||||
return replaceMissing(ds, strategy, kw("all"));
|
||||
}
|
||||
/**
|
||||
* Return the rows of the dataset in a flyweight map format. Maps share keys
|
||||
* and read their data lazily from the base dataset.
|
||||
*/
|
||||
public static Buffer rows(Object ds) {
|
||||
return (Buffer)call(rowsFn, ds);
|
||||
}
|
||||
/**
|
||||
* Return the rows of the dataset where each row is just a flat Buffer of data.
|
||||
*
|
||||
* When copying is true data is copied upon each access from the underlying dataset. This
|
||||
* makes doing something like using each row as the key in a map more efficient.
|
||||
*/
|
||||
public static Buffer rowvecs(Object ds, boolean copying) {
|
||||
return (Buffer)call(rowvecsFn, ds, hashmap(kw("copying?"), copying));
|
||||
}
|
||||
/** Return the rows of the dataset where each row is just a flat Buffer of data. */
|
||||
public static Buffer rowvecs(Object ds) {
|
||||
return (Buffer)call(rowvecsFn, ds);
|
||||
}
|
||||
|
||||
/** Return the first 5 rows of the dataset */
|
||||
public static Map head(Object ds) {
|
||||
return (Map)call(headFn, ds);
|
||||
}
|
||||
/** Return the first N rows of the dataset */
|
||||
public static Map head(Object ds, long nRows) {
|
||||
return (Map)call(headFn, ds, nRows);
|
||||
}
|
||||
/** Return the last 5 rows of the dataset */
|
||||
public static Map tail(Object ds) {
|
||||
return (Map)call(tailFn, ds);
|
||||
}
|
||||
/** Return the last N rows of the dataset */
|
||||
public static Map tail(Object ds, long nRows) {
|
||||
return (Map)call(tailFn, ds, nRows);
|
||||
}
|
||||
/** Return a random sampling of 5 rows without replacement of the data */
|
||||
public static Map sample(Object ds) {
|
||||
return (Map)call(sampleFn, ds);
|
||||
}
|
||||
/** Return a random sampling of N rows without replacement of the data */
|
||||
public static Map sample(Object ds, long nRows) {
|
||||
return (Map)call(sampleFn, ds, nRows);
|
||||
}
|
||||
/**
|
||||
* Return a random sampling of N rows of the data.
|
||||
*
|
||||
* Options:
|
||||
*
|
||||
* * `:replacement?` - Do sampling with replacement. Defaults to false.
|
||||
* * `:seed` - Either an integer or an implementation of java.util.Random.
|
||||
*/
|
||||
public static Map sample(Object ds, long nRows, Map options) {
|
||||
return (Map)call(sampleFn, ds, nRows, options);
|
||||
}
|
||||
/** Randomly shuffle the dataset rows. */
|
||||
public static Map shuffle(Object ds) {
|
||||
return (Map)call(shuffleFn, ds);
|
||||
}
|
||||
/**
|
||||
* Randomly shuffle the dataset rows.
|
||||
*
|
||||
* Options:
|
||||
*
|
||||
* * `:seed` - Either an integer or an implementation of java.util.Random.
|
||||
*/
|
||||
public static Map shuffle(Object ds, Map options) {
|
||||
return (Map)call(shuffleFn, ds, options);
|
||||
}
|
||||
/** Reverse the rows of the dataset */
|
||||
public static Map reverseRows(Object ds) {
|
||||
return (Map)call(reverseFn, ds);
|
||||
}
|
||||
|
||||
/**
|
||||
* Map a function across 1 or more columns to produce a new column. The new column is
|
||||
* serially scanned to detect datatype and its missing set.
|
||||
*/
|
||||
public static Map columnMap(Object ds, Object resultCname, IFn mapFn, Object srcCnames) {
|
||||
return (Map)call(columnMapFn, ds, resultCname, mapFn, srcCnames);
|
||||
}
|
||||
/**
|
||||
* Map a function across the rows of the dataset with each row in map form. Function must
|
||||
* return a new map for each row. Result is generated in parallel so, when used with a map
|
||||
* factory, this is a surprisingly efficient strategy to create multiple columns at once from
|
||||
* each row.
|
||||
*/
|
||||
public static Map rowMap(Object ds, IFn mapFn) {
|
||||
return (Map)call(rowMapFn, ds, mapFn);
|
||||
}
|
||||
/**
|
||||
* Map a function across the rows of the dataset with each row in map form. Function must
|
||||
* return a new map for each row. Result is generated in parallel so, when used with a map
|
||||
* factory, this is a surprisingly efficient strategy to create multiple columns at once from
|
||||
* each row.
|
||||
*
|
||||
* See options for pmapDs. Especially note `:max-batch-size` and `:result-type`. In
|
||||
* order to conserve memory it may be much more efficient to return a sequence of
|
||||
* datasets rather than one large dataset. If returning sequences of datasets
|
||||
* perhaps consider a transducing pathway across them or the
|
||||
* tech.v3.dataset.reductions namespace.
|
||||
*/
|
||||
public static Object rowMap(Object ds, IFn mapFn, Object options) {
|
||||
return call(rowMapFn, ds, mapFn, options);
|
||||
}
|
||||
/**
|
||||
* Map a function across the rows of the dataset with each row in map form. Function must
|
||||
* return either null or a sequence of maps and thus can produce many new rows for
|
||||
* each input row. Function is called in a parallelized context. Maps returned
|
||||
* must be an implementation of clojure's IPersistentMap. See [tech.v3.Clj.mapFactory](https://cnuernber.github.io/dtype-next/javadoc/tech/v3/DType.html#mapFactory-java.util.List-)
|
||||
* for an efficient way to create those in bulk.
|
||||
*
|
||||
* See options for pmapDs. Especially note `:max-batch-size` and `:result-type`. In
|
||||
* order to conserve memory it may be much more efficient to return a sequence of
|
||||
* datasets rather than one large dataset. If returning sequences of datasets
|
||||
* perhaps consider a transducing pathway across them or the
|
||||
* tech.v3.dataset.reductions namespace.
|
||||
*/
|
||||
public static Object rowMapcat(Object ds, IFn mapFn, Object options) {
|
||||
return call(rowMapcatFn, ds, mapFn, options);
|
||||
}
|
||||
/**
|
||||
* Parallelize mapping a function from dataset->dataset across a dataset. Function may
|
||||
* return null. The original dataset is simply sliced into n-core results and
|
||||
* map-fn is called n-core times with the results either concatenated into a new dataset or
|
||||
* returned as an Iterable.
|
||||
*
|
||||
* Most of the functions of the dataset (filter, sort, groupBy) will auto-parallelize but
|
||||
* there are many times where the most efficient use of machine resources is to
|
||||
* parallelize at the outermost level. The parallelization primitives check and run in
|
||||
* serial mode if the current thread is already in a parallelization pathway.
|
||||
*
|
||||
* @param mapFn a function from dataset->dataset although it may return null.
|
||||
*
|
||||
* Options:
|
||||
*
|
||||
* * `:max-batch-size` - Defaults to 64000. This controls the size of each parallelized
|
||||
* chunk.
|
||||
* * `:result-type` - Either `:as-seq` in which case the output of this function is a
|
||||
* sequence of datasets or `:as-ds` in which case the output is a single dataset. The
|
||||
* default is `:as-ds`.
|
||||
*/
|
||||
public static Object pmapDS(Object ds, IFn mapFn, Object options) {
|
||||
return call(pmapDsFn, ds, mapFn, options);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sort a dataset by first mapping `sortFn` over it and then sorting over the result.
|
||||
* `sortFn` is passed each row in map form and the return value is used to sort the
|
||||
* dataset.
|
||||
*
|
||||
* @param sortFn function taking a single argument which is the row-map and returns the value
|
||||
* to sort on.
|
||||
* @param compareFn Comparison operator or comparator. Some examples are the Clojure '<' or
|
||||
* '>' operators - tech.v3.Clj.lessThanFn, tech.v3.Clj.greaterThanFn. The clojure keywords
|
||||
* `:tech.numerics/<` and `:tech.numerics/>` can be used for somewhat higher performance
|
||||
* unboxed primitive comparisons or the Clojure function `compare` - tech.v3.Clj.compareFn -
|
||||
* which is similar to .compareTo except it works with null and the input must implement
|
||||
* Comparable. Finally you can instantiate an instance of java.util.Comparator.
|
||||
*
|
||||
* Options:
|
||||
*
|
||||
* * `:nan-strategy` - General missing strategy. Options are `:first`, `:last`, and
|
||||
* `:exception`.
|
||||
* * `:parallel?` - Uses parallel quicksort when true and regular quicksort when false.
|
||||
*/
|
||||
public static Map sortBy(Object ds, IFn sortFn, Object compareFn, Object options) {
|
||||
return (Map)call(sortByFn, ds, sortFn, compareFn, options);
|
||||
}
|
||||
/** Sort a dataset. See documentation of 4-arity version.*/
|
||||
public static Map sortBy(Object ds, IFn sortFn, Object compareFn) {
|
||||
return (Map)call(sortByFn, ds, sortFn, compareFn, null);
|
||||
}
|
||||
/** Sort a dataset. See documentation of 4-arity version.*/
|
||||
public static Map sortBy(Object ds, IFn sortFn) {
|
||||
return (Map)call(sortByFn, ds, sortFn);
|
||||
}
|
||||
|
||||
/**
|
||||
 * Sort a dataset by using the values from column `cname`.
|
||||
* @param compareFn Comparison operator or comparator. Some examples are the Clojure '<' or
|
||||
* '>' operators - tech.v3.Clj.lessThanFn, tech.v3.Clj.greaterThanFn. The clojure keywords
|
||||
* `:tech.numerics/<` and `:tech.numerics/>` can be used for somewhat higher performance
|
||||
* unboxed primitive comparisons or the Clojure function `compare` - tech.v3.Clj.compareFn -
|
||||
* which is similar to .compareTo except it works with null and the input must implement
|
||||
* Comparable. Finally you can instantiate an instance of java.util.Comparator.
|
||||
*
|
||||
* Options:
|
||||
*
|
||||
* * `:nan-strategy` - General missing strategy. Options are `:first`, `:last`, and
|
||||
* `:exception`.
|
||||
* * `:parallel?` - Uses parallel quicksort when true and regular quicksort when false.
|
||||
*/
|
||||
public static Map sortByColumn(Object ds, Object cname, Object compareFn, Object options) {
|
||||
return (Map)call(sortByColumnFn, ds, cname, compareFn, options);
|
||||
}
|
||||
/** Sort a dataset by a specific column. See documentation on 4-arity version.*/
|
||||
public static Map sortByColumn(Object ds, Object cname, Object compareFn) {
|
||||
return (Map)call(sortByColumnFn, ds, cname, compareFn, null);
|
||||
}
|
||||
/** Sort a dataset by a specific column. See documentation on 4-arity version.*/
|
||||
public static Map sortByColumn(Object ds, Object cname) {
|
||||
return (Map)call(sortByColumnFn, ds, cname);
|
||||
}
|
||||
/**
|
||||
* Filter a dataset. Predicate gets passed all rows and must return a `truthy` value.
|
||||
*/
|
||||
public static Map filter(Object ds, IFn predicate) {
|
||||
return (Map)call(filterFn, ds, predicate);
|
||||
}
|
||||
/**
|
||||
 * Filter a dataset. Predicate gets passed values from column cname and must
 * return a `truthy` value.
|
||||
*/
|
||||
public static Map filterColumn(Object ds, Object cname, IFn predicate) {
|
||||
return (Map)call(filterColumnFn, ds, cname, predicate);
|
||||
}
|
||||
/**
|
||||
* Group a dataset returning a Map of keys to dataset.
|
||||
*
|
||||
* @param groupFn Gets passed each row in map format and must return the desired key.
|
||||
*
|
||||
* @return a map of key to dataset.
|
||||
*/
|
||||
public static Map groupBy(Object ds, IFn groupFn) {
|
||||
return (Map)call(groupByFn, ds, groupFn);
|
||||
}
|
||||
/**
|
||||
* Group a dataset by a specific column returning a Map of keys to dataset.
|
||||
*
|
||||
* @return a map of key to dataset.
|
||||
*/
|
||||
public static Map groupByColumn(Object ds, Object cname) {
|
||||
return (Map)call(groupByColumnFn, ds, cname);
|
||||
}
|
||||
/**
|
||||
* Concatenate an Iterable of datasets into one dataset via copying data into one
|
||||
* dataset. This generally results in higher performance than an in-place concatenation
|
||||
* with the exception of small (< 3) numbers of datasets. Null datasets will be silently
|
||||
* ignored.
|
||||
*/
|
||||
public static Map concatCopying(Object datasets) {
|
||||
return (Map)call(applyFn, concatCopyingFn, datasets);
|
||||
}
|
||||
|
||||
/**
|
||||
* Concatenate an Iterable of datasets into one dataset via creating virtual buffers that
|
||||
* index into the previous datasets. This generally results in lower performance than a
|
||||
* copying concatenation with the exception of small (< 3) numbers of datasets. Null
|
||||
* datasets will be silently ignored.
|
||||
*/
|
||||
public static Map concatInplace(Object datasets) {
|
||||
return (Map)call(applyFn, concatInplaceFn, datasets);
|
||||
}
|
||||
/**
|
||||
* Create a dataset with no duplicates by taking first of duplicate values.
|
||||
*
|
||||
 * @param uniqueFn is passed a row and must return the uniqueness criteria. The simplest
 * uniqueFn is the identity function.
|
||||
*/
|
||||
public static Map uniqueBy(Object ds, IFn uniqueFn) {
|
||||
return (Map)call(uniqueByFn, ds, uniqueFn);
|
||||
}
|
||||
/**
|
||||
* Make a dataset unique using a particular column as the uniqueness criteria and taking
|
||||
* the first value.
|
||||
*/
|
||||
public static Map uniqueByColumn(Object ds, Object cname) {
|
||||
return (Map)call(uniqueByFn, ds, cname);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a dataset of the descriptive statistics of the input dataset. This works with
|
||||
* date-time columns, missing values, etc. and serves as very fast way to quickly get a feel
|
||||
* for a dataset.
|
||||
*
|
||||
* Options:
|
||||
*
|
||||
* * `:stat-names` - A set of desired stat names. Possible statistic operations are:
|
||||
* `[:col-name :datatype :n-valid :n-missing :min :quartile-1 :mean :mode :median
|
||||
* :quartile-3 :max :standard-deviation :skew :n-values :values :histogram :first
|
||||
* :last]`
|
||||
*/
|
||||
public static Map descriptiveStats(Object ds, Object options) {
|
||||
return (Map)call(descriptiveStatsFn, ds, options);
|
||||
}
|
||||
/**
|
||||
* Create a dataset of the descriptive statistics of the input dataset. This works with
|
||||
* date-time columns, missing values, etc. and serves as very fast way to quickly get a feel
|
||||
* for a dataset.
|
||||
*
|
||||
* Options:
|
||||
*
|
||||
* * `:stat-names` - A set of desired stat names. Possible statistic operations are:
|
||||
* `[:col-name :datatype :n-valid :n-missing :min :quartile-1 :mean :mode :median
|
||||
* :quartile-3 :max :standard-deviation :skew :n-values :values :histogram :first
|
||||
* :last]`
|
||||
*/
|
||||
public static Map descriptiveStats(Object ds) {
|
||||
return (Map)call(descriptiveStatsFn, ds);
|
||||
}
|
||||
/**
|
||||
* Perform a join operation between two datasets.
|
||||
*
|
||||
* Options:
|
||||
*
|
||||
* * `:on` - column name or list of columns names. Names must be found in both datasets.
|
||||
* * `:left-on` - Column name or list of column names
|
||||
* * `:right-on` - Column name or list of column names
|
||||
* * `:how` - `:left`, `:right` `:inner`, `:outer`, `:cross`. If `:cross`, then it is
|
||||
* an error to provide `:on`, `:left-on`, `:right-on`. Defaults to `:inner`.
|
||||
*
|
||||
* Examples:
|
||||
*
|
||||
*```java
|
||||
*Map dsa = makeDataset(hashmap("a", vector("a", "b", "b", "a", "c"),
|
||||
* "b", range(5),
|
||||
* "c", range(5)));
|
||||
*println(dsa);
|
||||
* //_unnamed [5 3]:
|
||||
*
|
||||
* //| a | b | c |
|
||||
* //|---|--:|--:|
|
||||
* //| a | 0 | 0 |
|
||||
* //| b | 1 | 1 |
|
||||
* //| b | 2 | 2 |
|
||||
* //| a | 3 | 3 |
|
||||
* //| c | 4 | 4 |
|
||||
*
|
||||
*
|
||||
*Map dsb = makeDataset(hashmap("a", vector("a", "b", "a", "b", "d"),
|
||||
* "b", range(5),
|
||||
* "c", range(6,11)));
|
||||
*println(dsb);
|
||||
* //_unnamed [5 3]:
|
||||
*
|
||||
* //| a | b | c |
|
||||
* //|---|--:|---:|
|
||||
* //| a | 0 | 6 |
|
||||
* //| b | 1 | 7 |
|
||||
* //| a | 2 | 8 |
|
||||
* //| b | 3 | 9 |
|
||||
* //| d | 4 | 10 |
|
||||
*
|
||||
* //Join on the columns a,b. Default join mode is inner
|
||||
* println(join(dsa, dsb, hashmap(kw("on"), vector("a", "b"))));
|
||||
* //inner-join [2 4]:
|
||||
*
|
||||
* //| a | b | c | right.c |
|
||||
* //|---|--:|--:|--------:|
|
||||
* //| a | 0 | 0 | 6 |
|
||||
* //| b | 1 | 1 | 7 |
|
||||
*
|
||||
*
|
||||
* //Outer join on same columns
|
||||
*println(join(dsa, dsb, hashmap(kw("on"), vector("a", "b"),
|
||||
* kw("how"), kw("outer"))));
|
||||
* //outer-join [8 4]:
|
||||
*
|
||||
* //| a | b | c | right.c |
|
||||
* //|---|--:|--:|--------:|
|
||||
* //| a | 0 | 0 | 6 |
|
||||
* //| b | 1 | 1 | 7 |
|
||||
* //| b | 2 | 2 | |
|
||||
* //| a | 3 | 3 | |
|
||||
* //| c | 4 | 4 | |
|
||||
* //| a | 2 | | 8 |
|
||||
* //| b | 3 | | 9 |
|
||||
* //| d | 4 | | 10 |
|
||||
*```
|
||||
*/
|
||||
public static Map join(Map leftDs, Map rightDs, Map options) {
|
||||
return (Map)pdMergeFn.invoke(leftDs, rightDs, options);
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform a left join but join on nearest value as opposed to matching value.
|
||||
* Both datasets must be sorted by the join column and the join column itself
|
||||
* must be either a datetime column or a numeric column. When the join column
|
||||
* is a datetime column the join happens in millisecond space.
|
||||
*
|
||||
* Options:
|
||||
*
|
||||
* * `:asof-op` - One of the keywords `[:< :<= :nearest :>= :>]`. Defaults to `:<=`.
|
||||
*
|
||||
* Examples:
|
||||
*
|
||||
*```java
|
||||
*println(head(googPrices, 200));
|
||||
* //GOOG [68 3]:
|
||||
* //| symbol | date | price |
|
||||
* //|--------|------------|-------:|
|
||||
* //| GOOG | 2004-08-01 | 102.37 |
|
||||
* //| GOOG | 2004-09-01 | 129.60 |
|
||||
* //| GOOG | 2005-03-01 | 180.51 |
|
||||
* //| GOOG | 2004-11-01 | 181.98 |
|
||||
* //| GOOG | 2005-02-01 | 187.99 |
|
||||
* //| GOOG | 2004-10-01 | 190.64 |
|
||||
* //| GOOG | 2004-12-01 | 192.79 |
|
||||
* //| GOOG | 2005-01-01 | 195.62 |
|
||||
* //| GOOG | 2005-04-01 | 220.00 |
|
||||
* //| GOOG | 2005-05-01 | 277.27 |
|
||||
* //| GOOG | 2005-08-01 | 286.00 |
|
||||
* //| GOOG | 2005-07-01 | 287.76 |
|
||||
* //| GOOG | 2008-11-01 | 292.96 |
|
||||
* //| GOOG | 2005-06-01 | 294.15 |
|
||||
* //| GOOG | 2008-12-01 | 307.65 |
|
||||
* //| GOOG | 2005-09-01 | 316.46 |
|
||||
* //| GOOG | 2009-02-01 | 337.99 |
|
||||
* //| GOOG | 2009-01-01 | 338.53 |
|
||||
* //| GOOG | 2009-03-01 | 348.06 |
|
||||
* //| GOOG | 2008-10-01 | 359.36 |
|
||||
* //| GOOG | 2006-02-01 | 362.62 |
|
||||
* //| GOOG | 2006-05-01 | 371.82 |
|
||||
* //| GOOG | 2005-10-01 | 372.14 |
|
||||
* //| GOOG | 2006-08-01 | 378.53 |
|
||||
* //| GOOG | 2006-07-01 | 386.60 |
|
||||
* //| GOOG | 2006-03-01 | 390.00 |
|
||||
* //| GOOG | 2009-04-01 | 395.97 |
|
||||
* //| GOOG | 2008-09-01 | 400.52 |
|
||||
* //| GOOG | 2006-09-01 | 401.90 |
|
||||
* //| GOOG | 2005-11-01 | 404.91 |
|
||||
* //| GOOG | 2005-12-01 | 414.86 |
|
||||
* //| GOOG | 2009-05-01 | 417.23 |
|
||||
* //| GOOG | 2006-04-01 | 417.94 |
|
||||
* //| GOOG | 2006-06-01 | 419.33 |
|
||||
* //| GOOG | 2009-06-01 | 421.59 |
|
||||
* //| GOOG | 2006-01-01 | 432.66 |
|
||||
* //| GOOG | 2008-03-01 | 440.47 |
|
||||
* //| GOOG | 2009-07-01 | 443.05 |
|
||||
* //| GOOG | 2007-02-01 | 449.45 |
|
||||
* //| GOOG | 2007-03-01 | 458.16 |
|
||||
* //| GOOG | 2006-12-01 | 460.48 |
|
||||
* //| GOOG | 2009-08-01 | 461.67 |
|
||||
* //| GOOG | 2008-08-01 | 463.29 |
|
||||
* //| GOOG | 2008-02-01 | 471.18 |
|
||||
* //| GOOG | 2007-04-01 | 471.38 |
|
||||
* //| GOOG | 2008-07-01 | 473.75 |
|
||||
* //| GOOG | 2006-10-01 | 476.39 |
|
||||
* //| GOOG | 2006-11-01 | 484.81 |
|
||||
* //| GOOG | 2009-09-01 | 495.85 |
|
||||
* //| GOOG | 2007-05-01 | 497.91 |
|
||||
* //| GOOG | 2007-01-01 | 501.50 |
|
||||
* //| GOOG | 2007-07-01 | 510.00 |
|
||||
* //| GOOG | 2007-08-01 | 515.25 |
|
||||
* //| GOOG | 2007-06-01 | 522.70 |
|
||||
* //| GOOG | 2008-06-01 | 526.42 |
|
||||
* //| GOOG | 2010-02-01 | 526.80 |
|
||||
* //| GOOG | 2010-01-01 | 529.94 |
|
||||
* //| GOOG | 2009-10-01 | 536.12 |
|
||||
* //| GOOG | 2010-03-01 | 560.19 |
|
||||
* //| GOOG | 2008-01-01 | 564.30 |
|
||||
* //| GOOG | 2007-09-01 | 567.27 |
|
||||
* //| GOOG | 2008-04-01 | 574.29 |
|
||||
* //| GOOG | 2009-11-01 | 583.00 |
|
||||
* //| GOOG | 2008-05-01 | 585.80 |
|
||||
* //| GOOG | 2009-12-01 | 619.98 |
|
||||
* //| GOOG | 2007-12-01 | 691.48 |
|
||||
* //| GOOG | 2007-11-01 | 693.00 |
|
||||
* //| GOOG | 2007-10-01 | 707.00 |
|
||||
|
||||
*Map targetPrices = makeDataset(hashmap("price", new Double[] { 200.0, 300.0, 400.0 }));
|
||||
|
||||
*println(leftJoinAsof("price", targetPrices, googPrices, hashmap(kw("asof-op"), kw("<="))));
|
||||
* //asof-<= [3 4]:
|
||||
* //| price | symbol | date | GOOG.price |
|
||||
* //|------:|--------|------------|-----------:|
|
||||
* //| 200.0 | GOOG | 2005-04-01 | 220.00 |
|
||||
* //| 300.0 | GOOG | 2008-12-01 | 307.65 |
|
||||
* //| 400.0 | GOOG | 2008-09-01 | 400.52 |
|
||||
* println(leftJoinAsof("price", targetPrices, googPrices, hashmap(kw("asof-op"), kw(">"))));
|
||||
* //asof-> [3 4]:
|
||||
* //| price | symbol | date | GOOG.price |
|
||||
* //|------:|--------|------------|-----------:|
|
||||
* //| 200.0 | GOOG | 2005-01-01 | 195.62 |
|
||||
* //| 300.0 | GOOG | 2005-06-01 | 294.15 |
|
||||
* //| 400.0 | GOOG | 2009-04-01 | 395.97 |
|
||||
*```
|
||||
*/
|
||||
public static Map leftJoinAsof(Object colname, Map lhs, Map rhs, Object options) {
|
||||
return (Map)joinAsof.invoke(colname, lhs, rhs, options);
|
||||
}
|
||||
public static Map leftJoinAsof(Object colname, Map lhs, Map rhs) {
|
||||
return (Map)joinAsof.invoke(colname, lhs, rhs);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a dataset to a neanderthal 2D matrix such that the columns of the dataset
|
||||
* become the columns of the matrix. This function dynamically loads the neanderthal
|
||||
* MKL bindings so there may be some pause when first called. If you would like to have
|
||||
* the pause somewhere else call `require("tech.v3.dataset.neanderthal");` at some
|
||||
 * previous point of the program.  You must have an up-to-date version of
|
||||
* neanderthal in your classpath such as `[uncomplicate/neanderthal "0.43.3"]`.
|
||||
*
|
||||
 * See the [neanderthal documentation](https://neanderthal.uncomplicate.org/)
|
||||
*
|
||||
* @param layout One of `:column` or `:row`.
|
||||
* @param datatype One of `:float32` or `:float64`.
|
||||
*
|
||||
* Note that you can get a tech tensor (tech.v3.datatype.NDBuffer) from a neanderthal
|
||||
* matrix using `tech.v3.DType.asTensor()`.
|
||||
*
|
||||
*/
|
||||
public static Object toNeanderthal(Object ds, Keyword layout, Keyword datatype) {
|
||||
return call(deref(toNeanderthalDelay), ds, layout, datatype);
|
||||
}
|
||||
/**
|
||||
* Convert a dataset to a neanderthal 2D matrix such that the columns of the dataset
|
||||
* become the columns of the matrix. See documentation for 4-arity version of
|
||||
* function. This function creates a column-major float64 (double) matrix.
|
||||
*/
|
||||
public static Object toNeanderthal(Object ds) {
|
||||
return call(deref(toNeanderthalDelay), ds);
|
||||
}
|
||||
/**
|
||||
* Convert a neanderthal matrix to a dataset such that the columns of the matrix
|
||||
* become the columns of the dataset. Column names are the indexes of the columns.
|
||||
*/
|
||||
public static Map neanderthalToDataset(Object denseMat) {
|
||||
return (Map)call(deref(neanderthalToDatasetDelay), denseMat);
|
||||
}
|
||||
/**
|
||||
* Convert a dataset to a jvm-heap based 2D tensor such that the columns of the
|
||||
* dataset become the columns of the tensor.
|
||||
*
|
||||
* @param datatype Any numeric datatype - `:int8`, `:uint8`, `:float32`, `:float64`, etc.
|
||||
*/
|
||||
public static NDBuffer toTensor(Object ds, Keyword datatype) {
|
||||
return (NDBuffer)call(deref(toTensorDelay), ds, datatype);
|
||||
}
|
||||
/**
|
||||
* Convert a dataset to a jvm-heap based 2D double (float64) tensor.
|
||||
*/
|
||||
public static NDBuffer toTensor(Object ds) {
|
||||
return (NDBuffer)call(deref(toTensorDelay), ds);
|
||||
}
|
||||
/**
|
||||
* Convert a tensor to a dataset such that the columns of the tensor
|
||||
* become the columns of the dataset named after their index.
|
||||
*/
|
||||
public static Map tensorToDataset(Object tens) {
|
||||
return (Map)call(deref(tensorToDatasetDelay), tens);
|
||||
}
|
||||
/**
|
||||
* Write a dataset to disc as csv, tsv, csv.gz, tsv.gz, json, json.gz or nippy.
|
||||
*
|
||||
 * Reading/writing to parquet or arrow is accessible via separate classes.
|
||||
*/
|
||||
public static void writeDataset(Object ds, String path, Object options) {
|
||||
call(writeFn, ds, path, options);
|
||||
}
|
||||
/**
|
||||
* Write a dataset to disc as csv, tsv, csv.gz, tsv.gz or nippy.
|
||||
*/
|
||||
public static void writeDataset(Object ds, String path) {
|
||||
writeDataset(ds, path, null);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,329 @@
|
||||
package tech.v3.dataset;
|
||||
|
||||
|
||||
import static tech.v3.Clj.*;
|
||||
import clojure.lang.IFn;
|
||||
import clojure.lang.Keyword;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* Functions related to training and evaluating ML models. The functions are grouped into
|
||||
* a few groups.
|
||||
*
|
||||
* For the purpose of this system, categorical data means a column of data that is not numeric.
|
||||
 * It could be strings, keywords, or arbitrary objects.
|
||||
*
|
||||
* Minimal example extra dependencies for PCA:
|
||||
*
|
||||
*```console
|
||||
* [uncomplicate/neanderthal "0.43.3"]
|
||||
*```
|
||||
*
|
||||
* It is also important to note that you can serialize the fit results to nippy automatically
|
||||
* as included in dtype-next are extensions to nippy that work with tensors.
|
||||
*/
|
||||
public class Modelling {
|
||||
private Modelling(){}
|
||||
|
||||
static final IFn fitCatFn = requiringResolve("tech.v3.dataset.categorical", "fit-categorical-map");
|
||||
static final IFn transCatFn = requiringResolve("tech.v3.dataset.categorical", "transform-categorical-map");
|
||||
static final IFn invCatFn = requiringResolve("tech.v3.dataset.categorical", "invert-categorical-map");
|
||||
static final IFn fitOneHotFn = requiringResolve("tech.v3.dataset.categorical", "fit-one-hot");
|
||||
static final IFn transOneHotFn = requiringResolve("tech.v3.dataset.categorical", "transform-one-hot");
|
||||
static final IFn invOneHotFn = requiringResolve("tech.v3.dataset.categorical", "invert-one-hot-map");
|
||||
|
||||
static final IFn corrTableFn = requiringResolve("tech.v3.dataset.math", "correlation-table");
|
||||
static final IFn fillRangeReplaceFn = requiringResolve("tech.v3.dataset.math", "fill-range-replace");
|
||||
static final IFn fitPCAFn = requiringResolve("tech.v3.dataset.math", "fit-pca");
|
||||
static final IFn fitStdScaleFn = requiringResolve("tech.v3.dataset.math", "fit-std-scale");
|
||||
static final IFn fitMinMaxFn = requiringResolve("tech.v3.dataset.math", "fit-minmax");
|
||||
static final IFn transPCAFn = requiringResolve("tech.v3.dataset.math", "transform-pca");
|
||||
static final IFn transStdScaleFn = requiringResolve("tech.v3.dataset.math", "transform-std-scale");
|
||||
static final IFn transMinMaxFn = requiringResolve("tech.v3.dataset.math", "transform-minmax");
|
||||
static final IFn interpolateLOESSFn = requiringResolve("tech.v3.dataset.math", "interpolate-loess");
|
||||
|
||||
static final IFn kFoldFn = requiringResolve("tech.v3.dataset.modelling", "k-fold-datasets");
|
||||
static final IFn trainTestFn = requiringResolve("tech.v3.dataset.modelling", "train-test-split");
|
||||
static final IFn setInfTargetFn = requiringResolve("tech.v3.dataset.modelling", "set-inference-target");
|
||||
static final IFn labelsFn = requiringResolve("tech.v3.dataset.modelling", "labels");
|
||||
static final IFn probDistToLabel = requiringResolve("tech.v3.dataset.modelling", "probability-distributions->label-column");
|
||||
static final IFn infTargetLabelMap = requiringResolve("tech.v3.dataset.modelling", "inference-target-label-map");
|
||||
|
||||
|
||||
/**
|
||||
* Fit an object->integer transform that takes each value and assigned an integer to it. The
|
||||
* returned value can be used in transformCategorical to transform the dataset.
|
||||
*
|
||||
* Options:
|
||||
*
|
||||
* * `:table-args` - Either a sequence of vectors [col-val, idx] or a sorted sequence
|
||||
* of column values where integers will be assigned as per the sorted sequence. Any values
|
||||
* found outside the the specified values will be auto-mapped to the next largest integer.
|
||||
* * `:res-dtype` - Datatype of result column. Defaults to `:float64`.
|
||||
*/
|
||||
public static Map fitCategorical(Object ds, Object cname, Object options) {
|
||||
return (Map)fitCatFn.invoke(ds, cname, options);
|
||||
}
|
||||
/**
|
||||
* Fit an object->integer transformation. Integers will be assigned in random order. For
|
||||
* more control over the transform see the 3-arity version of the function.
|
||||
*/
|
||||
public static Map fitCategorical(Object ds, Object cname) {
|
||||
return (Map)fitCatFn.invoke(ds, cname);
|
||||
}
|
||||
/**
|
||||
* Apply an object->integer transformation with data obtained from fitCategorical.
|
||||
*/
|
||||
public static Map transformCategorical(Object ds, Object catFitData) {
|
||||
return (Map)transCatFn.invoke(ds, catFitData);
|
||||
}
|
||||
/**
|
||||
* Reverse a previously transformed categorical mapping.
|
||||
*/
|
||||
public static Map invertCategorical(Object ds, Object catFitData) {
|
||||
return (Map)invCatFn.invoke(ds, catFitData);
|
||||
}
|
||||
/**
|
||||
* Fit a transformation from a single column of categorical values to a `one-hot` encoded
|
||||
* group of columns.
|
||||
* .
|
||||
*
|
||||
* Options:
|
||||
*
|
||||
* * `:table-args` - Either a sequence of vectors [col-val, idx] or a sorted sequence
|
||||
* of column values where integers will be assigned as per the sorted sequence. Any values
|
||||
* found outside the the specified values will be auto-mapped to the next largest integer.
|
||||
* * `:res-dtype` - Datatype of result column. Defaults to `:float64`.
|
||||
*
|
||||
*/
|
||||
public static Map fitOneHot(Object ds, Object cname, Object options) {
|
||||
return (Map)fitOneHotFn.invoke(ds, cname, options);
|
||||
}
|
||||
/**
|
||||
* Fit a mapping from a categorical column to a group of one-hot encoded columns.
|
||||
*/
|
||||
public static Map fitOneHot(Object ds, Object cname) {
|
||||
return (Map)fitOneHotFn.invoke(ds, cname);
|
||||
}
|
||||
/**
|
||||
* Transform a dataset using a fitted one-hot mapping.
|
||||
*/
|
||||
public static Map transformOneHot(Object ds, Object fitData) {
|
||||
return (Map)transOneHotFn.invoke(ds, fitData);
|
||||
}
|
||||
/**
|
||||
* Reverse a previously transformed one-hot mapping.
|
||||
*/
|
||||
public static Map invertOneHot(Object ds, Object fitData) {
|
||||
return (Map)invOneHotFn.invoke(ds, fitData);
|
||||
}
|
||||
/**
|
||||
* Return a map of column to inversely sorted from greatest to least sequence of tuples of
|
||||
* column name, coefficient.
|
||||
*
|
||||
* Options:
|
||||
*
|
||||
* * `:correlation-type` One of `:pearson`, `:spearman`, or `:kendall`. Defaults to
|
||||
* `:pearson`.
|
||||
*/
|
||||
public static Map correlationTable(Object ds, Object options) {
|
||||
return (Map)corrTableFn.invoke(ds, options);
|
||||
}
|
||||
/**
|
||||
* Return a map of column to inversely sorted from greatest to least sequence of tuples of
|
||||
* column name, pearson correlation coefficient.
|
||||
*/
|
||||
public static Map correlationTable(Object ds) {
|
||||
return (Map)corrTableFn.invoke(ds);
|
||||
}
|
||||
/**
|
||||
* Expand a dataset ensuring that the difference between two successive values is less than
|
||||
* `max-span`.
|
||||
*
|
||||
* @param maxSpan The minimal span value. For datetime types this is interpreted in
|
||||
* millisecond or epoch-millisecond space.
|
||||
* @param missingStrategy Same missing strategy types from `TMD.replaceMissing`.
|
||||
*/
|
||||
public static Map fillRangeReplace(Object ds, Object cname, double maxSpan, Object missingStrategy) {
|
||||
Keyword strat;
|
||||
Object value = null;
|
||||
if (isVector(missingStrategy)) {
|
||||
strat = (Keyword)call(missingStrategy, 0);
|
||||
value = call(missingStrategy, 1);
|
||||
} else {
|
||||
strat = (Keyword)missingStrategy;
|
||||
}
|
||||
return (Map) fillRangeReplaceFn.invoke(ds, cname, maxSpan, strat, value);
|
||||
}
|
||||
/**
|
||||
* Fit a PCA transformation on a dataset.
|
||||
*
|
||||
* @return map of `{:means, :eigenvalues, :eigenvectors}`.
|
||||
*
|
||||
* Options:
|
||||
*
|
||||
* * `:method` - either `:svd` or `:cov`. Use either SVD transformation or covariance-matrix
|
||||
* base PCA. `:cov` method is somewhat slower but returns accurate variances and thus
|
||||
* is the default.
|
||||
* * `:variance-amount` - Keep columns until variance is just less than variance-amount.
|
||||
* Defaults to 0.95.
|
||||
* * `:n-components` - Return a fixed number of components. Overrides `:variance-amount`
|
||||
* an returns a fixed number of components.
|
||||
* * `:covariance-bias` - When using `:cov` divide by `n-rows` if true and `n-rows - 1` if
|
||||
* false. Defaults to false.
|
||||
*/
|
||||
public static Object fitPCA(Object ds, Object options) {
|
||||
return fitPCAFn.invoke(ds, options);
|
||||
}
|
||||
/**
|
||||
* Fit a PCA transformation onto a dataset keeping 95% of the variance. See documentation
|
||||
* for 2-arity form.
|
||||
*/
|
||||
public static Object fitPCA(Object ds) {
|
||||
return fitPCAFn.invoke(ds);
|
||||
}
|
||||
/**
|
||||
* Transform a dataset by the PCA fit data.
|
||||
*/
|
||||
public static Map transformPCA(Object ds, Object fitData) {
|
||||
return (Map)transPCAFn.invoke(ds, fitData);
|
||||
}
|
||||
/**
|
||||
* Calculate per-column mean, stddev.
|
||||
*
|
||||
* Options:
|
||||
*
|
||||
* * `:mean?` - Produce per-column means. Defaults to true.
|
||||
* * `:stddev?` - Produce per-column standard deviation. Defaults to true.
|
||||
*/
|
||||
public static Object fitStdScale(Object ds) {
|
||||
return fitStdScaleFn.invoke(ds);
|
||||
}
|
||||
/**
|
||||
* Transform dataset to mean of zero and a standard deviation of 1.
|
||||
*/
|
||||
public static Map transformStdScale(Object ds, Object fitData) {
|
||||
return (Map)transStdScaleFn.invoke(ds, fitData);
|
||||
}
|
||||
/**
|
||||
* Fit a bias and scale the dataset that transforms each colum to a target min-max
|
||||
* value.
|
||||
*
|
||||
* Options:
|
||||
*
|
||||
* * `:min` - Target minimum value. Defaults it -0.5.
|
||||
* * `:max` - Target maximum value. Defaults to 0.5.
|
||||
*/
|
||||
public static Object fitMinMax(Object ds, Object options) {
|
||||
return fitMinMaxFn.invoke(ds, options);
|
||||
}
|
||||
/**
|
||||
* Fit a minmax transformation that will transform each column to a minimum of -0.5 and
|
||||
* a maximum of 0.5.
|
||||
*/
|
||||
public static Object fitMinMax(Object ds) {
|
||||
return fitMinMaxFn.invoke(ds);
|
||||
}
|
||||
/**
|
||||
* Transform a dataset using a previously fit minimax transformation.
|
||||
*/
|
||||
public static Map transformMinMax(Object ds, Object fitData) {
|
||||
return (Map)transMinMaxFn.invoke(ds, fitData);
|
||||
}
|
||||
/**
|
||||
* Map a LOESS-interpolation transformation onto a dataset. This can be used
|
||||
* to, among other things, smooth out a column before graphing. For the meaning
|
||||
* of the options, see documentation on the
|
||||
* org.apache.commons.math3.analysis.interpolationLoessInterpolator.
|
||||
*
|
||||
* Option defaults have been chosen to map somewhat closely to the R defaults.
|
||||
*
|
||||
* Options:
|
||||
*
|
||||
* * `:bandwidth` - Defaults to 0.75.
|
||||
* * `:iterations` - Defaults to 4.
|
||||
* * `:accuracy` - Defaults to LoessInterpolator/DEFAULT_ACCURACY.
|
||||
* * `:result-name` - Result column name. Defaults to `yColname.toString + "-loess"`.
|
||||
*/
|
||||
public static Map interpolateLOESS(Object ds, Object xColname, Object yColname, Object options) {
|
||||
return (Map)interpolateLOESSFn.invoke(ds, xColname, yColname, options);
|
||||
}
|
||||
/**
|
||||
* Perform a LOESS interpolation using the default parameters. For options see 4-arity
|
||||
* form of function.
|
||||
*/
|
||||
public static Map interpolateLOESS(Object ds, Object xColname, Object yColname) {
|
||||
return (Map)interpolateLOESSFn.invoke(ds, xColname, yColname);
|
||||
}
|
||||
/**
|
||||
* Produce 2*k datasets from 1 dataset using k-fold algorithm.
|
||||
 * Returns k maps of the form `{:test-ds :train-ds}`.
|
||||
*
|
||||
* Options:
|
||||
*
|
||||
* * `:randomize-dataset?` - When true, shuffle dataset. Defaults to true.
|
||||
* * `:seed` - When randomizing dataset, seed may be either an integer or an implementation
|
||||
* of `java.util.Random`.
|
||||
*/
|
||||
public static Iterable kFold(Object ds, long k, Object options) {
|
||||
return (Iterable)kFoldFn.invoke(ds, k, options);
|
||||
}
|
||||
/**
|
||||
* Return k maps of the form `{:test-ds :train-ds}`. For options see 3-arity form.
|
||||
*/
|
||||
public static Iterable kFold(Object ds, long k) {
|
||||
return (Iterable)kFoldFn.invoke(ds, k);
|
||||
}
|
||||
/**
|
||||
* Split the dataset returning a map of `{:train-ds :test-ds}`.
|
||||
*
|
||||
* Options:
|
||||
*
|
||||
* * `:randomize-dataset?` - Defaults to true.
|
||||
* * `:seed` - When provided must be an integer or an implementation `java.util.Random`.
|
||||
* * `:train-fraction` - Fraction of dataset to use as training set. Defaults to 0.7.
|
||||
*/
|
||||
public static Map trainTestSplit(Object ds, Object options) {
|
||||
return (Map)trainTestFn.invoke(ds, options);
|
||||
}
|
||||
/**
|
||||
* Randomize then split dataset using 70% of the data for training and the rest for testing.
|
||||
*/
|
||||
public static Map trainTestSplit(Object ds) {
|
||||
return (Map)trainTestFn.invoke(ds);
|
||||
}
|
||||
/**
|
||||
* Set a column in the dataset as the inference target. This information is stored in the
|
||||
* column metadata. This function is short form for:
|
||||
*
|
||||
*```java
|
||||
* Object col = column(ds, cname);
|
||||
* return assoc(ds, cname, varyMeta(col, assocFn, kw("inference-target?"), true));
|
||||
*```
|
||||
*/
|
||||
public static Map setInferenceTarget(Object ds, Object cname) {
|
||||
return (Map)setInfTargetFn.invoke(ds, cname);
|
||||
}
|
||||
/**
|
||||
* Find the inference column. If column was the result of a categorical mapping, reverse
|
||||
* that mapping. Return data in a form that can be efficiently converted to a Buffer.
|
||||
*/
|
||||
public static Object labels(Object ds) {
|
||||
return labelsFn.invoke(ds);
|
||||
}
|
||||
/**
|
||||
* Given a dataset where the column names are labels and the each row is a probabilitly
|
||||
* distribution across the labels, produce a Buffer of labels taking the highest probability
|
||||
* for each row to choose the label.
|
||||
*/
|
||||
public static Object probabilityDistributionToLabels(Object ds) {
|
||||
return probDistToLabel.invoke(ds);
|
||||
}
|
||||
/**
|
||||
* Return a map of val->idx for the inference target.
|
||||
*/
|
||||
public static Map inferenceTargetLabelMap(Object ds) {
|
||||
return (Map)infTargetLabelMap.invoke(ds);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,348 @@
|
||||
package tech.v3.dataset;
|
||||
|
||||
|
||||
import static tech.v3.Clj.*;
|
||||
import clojure.lang.IFn;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* High speed grouping aggregations based on sequences of datasets.
|
||||
*/
|
||||
public class Reductions {
|
||||
private Reductions(){}

//Reducer constructors and simple exact aggregations.
static final IFn reducerFn = requiringResolve("tech.v3.dataset.reductions", "reducer");
static final IFn sumFn = requiringResolve("tech.v3.dataset.reductions", "sum");
static final IFn meanFn = requiringResolve("tech.v3.dataset.reductions", "mean");
static final IFn rowCountFn = requiringResolve("tech.v3.dataset.reductions", "row-count");
static final IFn distinctFn = requiringResolve("tech.v3.dataset.reductions", "distinct");
static final IFn countDistinctFn = requiringResolve("tech.v3.dataset.reductions", "count-distinct");
static final IFn reservoirDsFn = requiringResolve("tech.v3.dataset.reductions", "reservoir-dataset");
static final IFn reservoirDescStatFn = requiringResolve("tech.v3.dataset.reductions", "reservoir-desc-stat");
//Probabilistic (sketch-based) aggregations backed by Apache DataSketches.
static final IFn probSetCardFn = requiringResolve("tech.v3.dataset.reductions.apache-data-sketch", "prob-set-cardinality");
static final IFn probQuantilesFn = requiringResolve("tech.v3.dataset.reductions.apache-data-sketch", "prob-quantiles");
static final IFn probQuantileFn = requiringResolve("tech.v3.dataset.reductions.apache-data-sketch", "prob-quantile");
static final IFn probMedianFn = requiringResolve("tech.v3.dataset.reductions.apache-data-sketch", "prob-median");
static final IFn probCdfsFn = requiringResolve("tech.v3.dataset.reductions.apache-data-sketch", "prob-cdfs");
static final IFn probPmfsFn = requiringResolve("tech.v3.dataset.reductions.apache-data-sketch", "prob-pmfs");
static final IFn probIQRangeFn = requiringResolve("tech.v3.dataset.reductions.apache-data-sketch", "prob-interquartile-range");
//Driver for the grouped aggregation over a sequence of datasets.
static final IFn groupByColumnAggFn = requiringResolve("tech.v3.dataset.reductions", "group-by-column-agg");
|
||||
|
||||
|
||||
/**
|
||||
* Group a sequence of datasets by column or columns an in the process perform an aggregation.
|
||||
* The resulting dataset will have one row per grouped key. Columns used as keys will always
|
||||
* be represented in the result.
|
||||
*
|
||||
* @param dsSeq Sequence of datasets such as produced by rowMapcat, dsPmap, or loading many
|
||||
* files.
|
||||
* @param colname Either a single column name or a vector of column names. These will be the
|
||||
* grouping keys.
|
||||
* @param aggMap Map of result colname to reducer. Various reducers are provided or you can
|
||||
* build your own via the `reducer` function.
|
||||
* @param options Options map. Described below. May be null.
|
||||
*
|
||||
* Options:
|
||||
*
|
||||
* * `:map-initial-capacity` - initial hashmap capacity. Resizing hash-maps is expensive
|
||||
* so we would like to set this to something reasonable. Defaults to 100000.
|
||||
* * `:index-filter` - A function that given a dataset produces a function from long index
|
||||
* to boolean. Only indexes for which the index-filter returns true will be added to the
|
||||
* aggregation. For very large datasets, this is a bit faster than using filter before
|
||||
* the aggregation.
|
||||
*
|
||||
* Example:
|
||||
*
|
||||
*```java
|
||||
* //Begin parallelized expansion
|
||||
*Iterable dsSeq = (Iterable)rowMapcat(srcds, tallyDays, hashmap(kw("result-type"), kw("as-seq")));
|
||||
*
|
||||
* //The first aggregation is to summarize by placement and simulation the year-month tallies.
|
||||
* //We are essentially replacing count with a summarized count. After this statement
|
||||
* //we can guarantee that the dataset has unique tuples of [simulation, placement, year-month]
|
||||
*Map initAgg = Reductions.groupByColumnsAgg(dsSeq, vector("simulation", "placement", "year-month"),
|
||||
* hashmap("count", Reductions.sum("count")),
|
||||
* null);
|
||||
*println(head(initAgg));
|
||||
* //["simulation" "placement" "year-month"]-aggregation [5 4]:
|
||||
*
|
||||
* //| simulation | placement | year-month | count |
|
||||
* //|-----------:|----------:|------------|------:|
|
||||
* //| 0 | 0 | 2020-12 | 622.0 |
|
||||
* //| 0 | 1 | 2020-12 | 591.0 |
|
||||
* //| 0 | 2 | 2020-12 | 500.0 |
|
||||
* //| 0 | 3 | 2020-12 | 549.0 |
|
||||
* //| 0 | 4 | 2020-12 | 595.0 |
|
||||
*
|
||||
* // The second aggregation allows us to build of statistics over each placement/year-month
|
||||
 * // pair thus finding out the distribution of a given placement, year-month across simulations
|
||||
*Map result = Reductions.groupByColumnsAgg(vector(initAgg), vector("placement", "year-month"),
|
||||
* hashmap("min-count", Reductions.probQuantile("count", 0.0),
|
||||
* "low-95-count", Reductions.probQuantile("count", 0.05),
|
||||
* "q1-count", Reductions.probQuantile("count", 0.25),
|
||||
* "median-count", Reductions.probQuantile("count", 0.5),
|
||||
* "q3-count", Reductions.probQuantile("count", 0.75),
|
||||
* "high-95-count", Reductions.probQuantile("count", 0.95),
|
||||
* "max-count", Reductions.probQuantile("count", 1.0),
|
||||
* "count", Reductions.sum("count")),
|
||||
* null);
|
||||
* //Take a million row dataset, expand it, then perform two grouping aggregations.
|
||||
*println(head(result));
|
||||
* //["placement" "year-month"]-aggregation [5 10]:
|
||||
*
|
||||
* //| q3-count | median-count | min-count | high-95-count | placement | max-count | count | low-95-count | q1-count | year-month |
|
||||
* //|---------:|-------------:|----------:|--------------:|----------:|----------:|--------:|-------------:|---------:|------------|
|
||||
* //| 646.0 | 593.0 | 366.0 | 716.0 | 36 | 809.0 | 58920.0 | 475.0 | 536.0 | 2020-12 |
|
||||
* //| 621.0 | 560.0 | 376.0 | 739.0 | 36 | 782.0 | 57107.0 | 459.0 | 512.0 | 2020-10 |
|
||||
* //| 168.0 | 139.0 | 25.0 | 211.0 | 0 | 246.0 | 13875.0 | 76.0 | 112.0 | 2021-01 |
|
||||
* //| 658.0 | 607.0 | 384.0 | 745.0 | 0 | 825.0 | 60848.0 | 486.0 | 561.0 | 2020-12 |
|
||||
* //| 628.0 | 581.0 | 422.0 | 693.0 | 0 | 802.0 | 58148.0 | 468.0 | 539.0 | 2020-11 |
|
||||
*```
|
||||
*/
|
||||
public static Map groupByColumnsAgg(Iterable dsSeq, Object colname, Map aggMap, Map options) {
|
||||
return (Map)groupByColumnAggFn.invoke(colname, aggMap, options, dsSeq);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a custom reducer. perElemFn is passed the last return value as the first argument
|
||||
* followed by a value from each column as additional arguments. It must always return the
|
||||
* current context.
|
||||
*
|
||||
* This is a easy way to instantiate tech.v3.datatype.IndexReduction so if you really need
|
||||
* the best possible performance you need to implement three methods of IndexReduction:
|
||||
*
|
||||
* * `prepareBatch` - Passed each dataset before processing. Return value becomes first
|
||||
* argument to `reduceIndex`.
|
||||
* * `reduceIndex` - Passed batchCtx, valCtx, and rowIdx. Must return an updated or
|
||||
* new valCtx.
|
||||
* * `finalize` - Passed valCtx and must return the final per-row value expected in
|
||||
* result dataset. The default is just to return valCtx.
|
||||
*
|
||||
* For `groupByColumnAgg` you do not need to worry about reduceReductions - there is no
|
||||
* merge step.
|
||||
*
|
||||
* @param colname One or more column names. If multiple column names are specified then
|
||||
* perElemFn will need to take additional arguments.
|
||||
* @param perElemFn A function that takes the previous context along with the current row's
|
||||
* column values and returns a new context.
|
||||
* @param finalizeFn Optional function that performs a final calculation taking a context
|
||||
* and returning a value.
|
||||
*/
|
||||
public static Object reducer(Object colname, IFn perElemFn, IFn finalizeFn) {
|
||||
return reducerFn.invoke(colname, perElemFn, finalizeFn);
|
||||
}
|
||||
/**
|
||||
* Create a custom reducer. `perElemFn` is passed the last return value as the first
|
||||
* argument followed by a value from each column as additional arguments. It must always
|
||||
* return the current context.
|
||||
*
|
||||
* This is a easy way to instantiate tech.v3.datatype.IndexReduction so if you really need
|
||||
* the best possible performance you need to implement three methods of IndexReduction:
|
||||
*
|
||||
* * `prepareBatch` - Passed each dataset before processing. Return value becomes first
|
||||
* argument to `reduceIndex`.
|
||||
* * `reduceIndex` - Passed batchCtx, valCtx, and rowIdx. Must return valCtx.
|
||||
* * `finalize` - Passed valCtx and must return the final per-row value expected in
|
||||
* result dataset.
|
||||
*
|
||||
* For `groupByColumnAgg` you do not need to worry about reduceReductions - there is no
|
||||
* merge step.
|
||||
*
|
||||
* @param colname One or more column names. If multiple column names are specified then
|
||||
* perElemFn will need to take additional arguments.
|
||||
* @param perElemFn A function that takes the previous context along with the current row's
|
||||
* column values and returns a new context.
|
||||
*/
|
||||
public static Object reducer(Object colname, IFn perElemFn) {
|
||||
return reducerFn.invoke(colname, perElemFn);
|
||||
}
|
||||
/**
|
||||
* Returns a summation reducer that sums an individual source column.
|
||||
*/
|
||||
public static Object sum(Object colname) {
|
||||
return sumFn.invoke(colname);
|
||||
}
|
||||
/**
|
||||
* Returns a mean reducer that produces a mean value of an individual source column.
|
||||
*/
|
||||
public static Object mean(Object colname) {
|
||||
return meanFn.invoke(colname);
|
||||
}
|
||||
/**
|
||||
* Returns a rowCount reducer that returns the number of source rows aggregated.
|
||||
*/
|
||||
public static Object rowCount(Object colname) {
|
||||
return rowCountFn.invoke(colname);
|
||||
}
|
||||
/**
|
||||
* Returns a distinct reducer produces a set of distinct values.
|
||||
*/
|
||||
public static Object distinct(Object colname) {
|
||||
return distinctFn.invoke(colname);
|
||||
}
|
||||
  /**
   * Returns a distinct reducer that produces a roaringbitmap of distinct values. This is many
   * times faster than the distinct reducer if your data fits into unsigned int32 space.
   *
   * NOTE(review): this body delegates to the same `distinctFn` with the same
   * arguments as `distinct` above, so as written it cannot produce a
   * bitmap-specialized reduction. It likely should resolve/invoke an int32-specific
   * variant (compare `setCardinalityUint32`, which passes kw("int32")) - confirm
   * against the backing Clojure reducers namespace.
   */
  public static Object distinctUInt32(Object colname) {
    return distinctFn.invoke(colname);
  }
|
||||
/**
|
||||
* Returns a distinct reducer returns the number of distinct elements.
|
||||
*/
|
||||
public static Object setCardinality(Object colname) {
|
||||
return countDistinctFn.invoke(colname);
|
||||
}
|
||||
/**
|
||||
* Returns a distinct reducer that expects unsigned integer values and returns the number
|
||||
* of distinct elements. This is many times faster than the countDistinct function.
|
||||
*/
|
||||
public static Object setCardinalityUint32(Object colname) {
|
||||
return countDistinctFn.invoke(colname, kw("int32"));
|
||||
}
|
||||
/**
|
||||
* Return a reducer that produces a probabilistically sampled dataset of at most nRows len.
|
||||
*/
|
||||
public static Object reservoirDataset(long nRows) {
|
||||
return reservoirDsFn.invoke(nRows);
|
||||
}
|
||||
/**
|
||||
* Return a reducer which will probabilistically sample the source column producing at most
|
||||
* nRows and then call descriptiveStatistics on it with statName.
|
||||
*
|
||||
* Stat names are described in tech.v3.datatype.Statistics.descriptiveStats.
|
||||
*/
|
||||
public static Object reservoirStats(Object colname, long nRows, Object statName) {
|
||||
return reservoirDescStatFn.invoke(colname, nRows, statName);
|
||||
}
|
||||
  /**
   * Calculate a probabilistic set cardinality for a given column based on one of three
   * algorithms.
   *
   * Options:
   *
   * * `:datatype` - One of `#{:float64 :string}`. Unspecified defaults to `:float64`.
   * * `:algorithm` - defaults to :hyper-log-log. Further algorithm-specific options
   *   may be included in the options map.
   *
   * Algorithm specific options:
   *
   * * [:hyper-log-log](https://datasketches.apache.org/docs/HLL/HLL.html)
   *   * `:hll-lgk` - defaults to 12; this is log-base2 of k, so k = 4096. lgK can be
   *     from 4 to 21.
   *   * `:hll-type` - One of #{4,6,8}, defaults to 8. HLL_4, HLL_6 and HLL_8
   *     represent different levels of compression of the final HLL array where the
   *     4, 6 and 8 refer to the number of bits each bucket of the HLL array is
   *     compressed down to. HLL_4 is the most compressed but generally slightly
   *     slower than the other two, especially during union operations.
   * * [:theta](https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html)
   * * [:cpc](https://datasketches.apache.org/docs/CPC/CPC.html)
   *   * `:cpc-lgk` - Defaults to 10.
   *
   * @param colname Source column name.
   * @param options Map of the options described above; algorithm keys are keywords.
   */
  public static Object probSetCardinality(Object colname, Map options) {
    return probSetCardFn.invoke(colname, options);
  }
|
||||
/**
|
||||
* Probabilistic quantile estimation - see [DoublesSketch](https://datasketches.apache.org/api/java/snapshot/apidocs/index.html).
|
||||
*
|
||||
* @param quantiles Sequence of quantiles.
|
||||
* @param k Defaults to 128. This produces a normalized rank error of about 1.7%"
|
||||
*/
|
||||
public static Object probQuantiles(Object colname, Object quantiles, long k) {
|
||||
return probQuantilesFn.invoke(colname, quantiles, k);
|
||||
}
|
||||
/**
|
||||
* Probabilistic quantile estimation using default k of 128.
|
||||
* See [DoublesSketch](https://datasketches.apache.org/api/java/snapshot/apidocs/index.html).
|
||||
*
|
||||
* @param quantiles Sequence of numbers from 0-1.
|
||||
*/
|
||||
public static Object probQuantiles(Object colname, Object quantiles) {
|
||||
return probQuantilesFn.invoke(colname, quantiles);
|
||||
}
|
||||
|
||||
/**
|
||||
* Probabilistic quantile estimation using default k of 128.
|
||||
* See [DoublesSketch](https://datasketches.apache.org/api/java/snapshot/apidocs/index.html).
|
||||
* Multiple quantile calculations on a single source column will be merged into a single quantile
|
||||
* calculation so it may be more convenient to use this function to produce multiple quantiles
|
||||
* mapped to several result columns as opposed to ending up with a single column of maps of quantile
|
||||
* to value.
|
||||
*
|
||||
* @param quantile Number from 0-1.
|
||||
* @param k Defaults to 128. This produces a normalized rank error of about 1.7%
|
||||
*/
|
||||
public static Object probQuantile(Object colname, double quantile, long k) {
|
||||
return probQuantileFn.invoke(colname, quantile);
|
||||
}
|
||||
/**
|
||||
* Probabilistic quantile estimation using default k of 128.
|
||||
* See [DoublesSketch](https://datasketches.apache.org/api/java/snapshot/apidocs/index.html).
|
||||
* Multiple quantiles will be merged into a single quantile calculation so it may be more
|
||||
* convenient to use this function to produce multiple quantiles mapped to several result
|
||||
* columns as opposed to ending up with a single column of maps of quantile to value.
|
||||
*
|
||||
* @param quantile Number from 0-1.
|
||||
*/
|
||||
public static Object probQuantile(Object colname, double quantile) {
|
||||
return probQuantileFn.invoke(colname, quantile);
|
||||
}
|
||||
/**
|
||||
* Probabilistic median. See documentation for probQuantiles.
|
||||
*/
|
||||
public static Object probMedian(Object colname, long k) {
|
||||
return probMedianFn.invoke(colname, k);
|
||||
}
|
||||
/**
|
||||
* Probabilistic median with default K of 128. See documentation for probQuantiles.
|
||||
*/
|
||||
public static Object probMedian(Object colname) {
|
||||
return probMedianFn.invoke(colname);
|
||||
}
|
||||
/**
|
||||
* Probabilistic interquartile range. See documentation for probQuantile.
|
||||
*/
|
||||
public static Object probInterquartileRange(Object colname, long k) {
|
||||
return probIQRangeFn.invoke(colname, k);
|
||||
}
|
||||
/**
|
||||
* Probabilistic interquartile range. See documentation for probQuantile.
|
||||
*/
|
||||
public static Object probInterquartileRange(Object colname) {
|
||||
return probIQRangeFn.invoke(colname);
|
||||
}
|
||||
/**
|
||||
* Probabilistic CDF calculation, one for each double cdf passed in.
|
||||
* See documentation for progQuantiles.
|
||||
*/
|
||||
public static Object probCDFS(Object colname, Object cdfs, long k) {
|
||||
return probCdfsFn.invoke(colname, cdfs, k);
|
||||
}
|
||||
/**
|
||||
* Probabilistic CDF calculation, one for each double cdf passed in.
|
||||
* See documentation for probQuantiles.
|
||||
*/
|
||||
public static Object probCDFS(Object colname, Object cdfs) {
|
||||
return probCdfsFn.invoke(colname, cdfs);
|
||||
}
|
||||
/**
|
||||
* Returns an approximation to the Probability Mass Function (PMF) of the input stream
|
||||
* given a set of splitPoints (values). See [DoublesSketch](https://datasketches.apache.org/api/java/snapshot/apidocs/index.html).
|
||||
* See documentation for probQuantiles.
|
||||
*
|
||||
*/
|
||||
public static Object probPMFS(Object colname, Object pmfs, long k) {
|
||||
return probPmfsFn.invoke(colname, pmfs, k);
|
||||
}
|
||||
/**
|
||||
* Returns an approximation to the Probability Mass Function (PMF) of the input stream
|
||||
* given a set of splitPoints (values). See [DoublesSketch](https://datasketches.apache.org/api/java/snapshot/apidocs/index.html).
|
||||
* See documentation for probQuantiles.
|
||||
*
|
||||
*/
|
||||
public static Object probPMFS(Object colname, Object pmfs) {
|
||||
return probPmfsFn.invoke(colname, pmfs);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,215 @@
|
||||
package tech.v3.dataset;
|
||||
|
||||
import static tech.v3.Clj.*;
|
||||
import clojure.lang.IFn;
|
||||
import clojure.lang.Keyword;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Fixed and variable length rolling windows. For variable rolling windows the dataset
 * must already be sorted by the target column. Datetime support is provided by passing
 * specific units in which to perform the rolling operation, such as the keyword
 * `:days`.
 */
public class Rolling {

  // Static API only - not instantiable.
  private Rolling(){}

  // Lazily-resolved vars from the tech.v3.dataset.rolling Clojure namespace.
  static final IFn meanFn = requiringResolve("tech.v3.dataset.rolling", "mean");
  static final IFn sumFn = requiringResolve("tech.v3.dataset.rolling", "sum");
  static final IFn minFn = requiringResolve("tech.v3.dataset.rolling", "min");
  static final IFn maxFn = requiringResolve("tech.v3.dataset.rolling", "max");
  static final IFn varianceFn = requiringResolve("tech.v3.dataset.rolling", "variance");
  static final IFn stddevFn = requiringResolve("tech.v3.dataset.rolling", "standard-deviation");
  // NOTE(review): `nth` lacks the `Fn` suffix the sibling fields use; kept as-is
  // because it may be referenced elsewhere in the project.
  static final IFn nth = requiringResolve("tech.v3.dataset.rolling", "nth");
  static final IFn firstFn = requiringResolve("tech.v3.dataset.rolling", "first");
  static final IFn lastFn = requiringResolve("tech.v3.dataset.rolling", "last");
  static final IFn rollingFn = requiringResolve("tech.v3.dataset.rolling", "rolling");

  /**
   * Fixed or variable rolling window reductions.
   *
   * @param ds The source dataset.
   * @param windowSpec Window specification specifying the type of window, either a
   *        window over a fixed number of rows or a window based on a fixed logical
   *        quantitative difference i.e. three months or 10 milliseconds.
   * @param reducerMap map of dest column name to reducer where reducer is a map with
   *        two keys, :column-name which is the input column to use and :reducer which
   *        is an IFn that receives each window of data as a buffer.
   * @return A new dataset with one result column per reducerMap entry.
   *
   * Example:
   *
   *```java
   * Map stocks = makeDataset("https://github.com/techascent/tech.ml.dataset/raw/master/test/data/stocks.csv");
   *
   * //Variable-sized windows require the source column to be sorted.
   * stocks = sortByColumn(stocks, "date");
   * Map variableWin = Rolling.rolling(stocks,
   *                                   Rolling.variableWindow("date", 3, kw("months")),
   *                                   hashmap("price-mean-3m", Rolling.mean("price"),
   *                                           "price-max-3m", Rolling.max("price"),
   *                                           "price-min-3m", Rolling.min("price")));
   * println(head(variableWin, 10));
   * //https://github.com/techascent/tech.ml.dataset/raw/master/test/data/stocks.csv [10 6]:
   * //| symbol |       date |  price | price-max-3m | price-mean-3m | price-min-3m |
   * //|--------|------------|-------:|-------------:|--------------:|-------------:|
   * //|   AAPL | 2000-01-01 |  25.94 |       106.11 |   58.92500000 |        25.94 |
   * //|    IBM | 2000-01-01 | 100.52 |       106.11 |   61.92363636 |        28.66 |
   * //|   MSFT | 2000-01-01 |  39.81 |       106.11 |   58.06400000 |        28.66 |
   * //|   AMZN | 2000-01-01 |  64.56 |       106.11 |   60.09222222 |        28.66 |
   * //|   AAPL | 2000-02-01 |  28.66 |       106.11 |   57.56583333 |        28.37 |
   * //|   MSFT | 2000-02-01 |  36.35 |       106.11 |   60.19363636 |        28.37 |
   * //|    IBM | 2000-02-01 |  92.11 |       106.11 |   62.57800000 |        28.37 |
   * //|   AMZN | 2000-02-01 |  68.87 |       106.11 |   59.29666667 |        28.37 |
   * //|   AMZN | 2000-03-01 |  67.00 |       106.11 |   54.65583333 |        21.00 |
   * //|   MSFT | 2000-03-01 |  43.22 |       106.11 |   53.53363636 |        21.00 |
   *
   * //Fixed window...
   *
   * Object radians = VecMath.mul(2.0*Math.PI, VecMath.div(range(33), 32.0));
   * Map sinds = makeDataset(hashmap("radians", radians, "sin", VecMath.sin(radians)));
   * Map fixedWin = Rolling.rolling(sinds,
   *                                Rolling.fixedWindow(4),
   *                                hashmap("sin-roll-mean", Rolling.mean("sin"),
   *                                        "sin-roll-max", Rolling.max("sin"),
   *                                        "sin-roll-min", Rolling.min("sin")));
   * println(head(fixedWin, 8));
   * //_unnamed [8 5]:
   * //|        sin |    radians | sin-roll-max | sin-roll-min | sin-roll-mean |
   * //|-----------:|-----------:|-------------:|-------------:|--------------:|
   * //| 0.00000000 | 0.00000000 |   0.19509032 |   0.00000000 |    0.04877258 |
   * //| 0.19509032 | 0.19634954 |   0.38268343 |   0.00000000 |    0.14444344 |
   * //| 0.38268343 | 0.39269908 |   0.55557023 |   0.00000000 |    0.28333600 |
   * //| 0.55557023 | 0.58904862 |   0.70710678 |   0.19509032 |    0.46011269 |
   * //| 0.70710678 | 0.78539816 |   0.83146961 |   0.38268343 |    0.61920751 |
   * //| 0.83146961 | 0.98174770 |   0.92387953 |   0.55557023 |    0.75450654 |
   * //| 0.92387953 | 1.17809725 |   0.98078528 |   0.70710678 |    0.86081030 |
   * //| 0.98078528 | 1.37444679 |   1.00000000 |   0.83146961 |    0.93403361 |
   *```
   */
  public static Map rolling(Object ds, Map windowSpec, Map reducerMap) {
    return (Map)rollingFn.invoke(ds, windowSpec, reducerMap);
  }
  /**
   * Create a variable window specification with a double windowSize for a particular
   * column. This specification will not work on datetime columns.
   */
  public static Map variableWindow(Object colname, double windowSize) {
    return hashmap(kw("window-type"), kw("variable"),
                   kw("column-name"), colname,
                   kw("window-size"), windowSize);
  }
  /**
   * Create a variable window specification with a double windowSize for a particular
   * column and a compFn which must take two values and return a double. The function
   * must take 2 arguments and the arguments are passed in as (later,earlier). This
   * allows the basic clojure '-' operator to work fine in many cases.
   */
  public static Map variableWindow(Object colname, double windowSize, Object compFn) {
    return hashmap(kw("window-type"), kw("variable"),
                   kw("column-name"), colname,
                   kw("window-size"), windowSize,
                   kw("comp-fn"), compFn);
  }
  /**
   * Create a datetime-specific variable window specification with a double windowSize
   * for a particular column.
   *
   * @param datetimeUnit One of `[:milliseconds, :seconds, :hours, :days, :months]`.
   */
  public static Map variableWindow(Object colname, double windowSize, Keyword datetimeUnit) {
    return hashmap(kw("window-type"), kw("variable"),
                   kw("column-name"), colname,
                   kw("window-size"), windowSize,
                   kw("units"), datetimeUnit);
  }
  /**
   * Return a fixed size rolling window specification. The window is fixed over
   * `window-size` rows.
   */
  public static Map fixedWindow(long windowSize) {
    return hashmap(kw("window-type"), kw("fixed"),
                   kw("window-size"), windowSize);
  }
  /**
   * Return a fixed size rolling window specification. The window is fixed over
   * `window-size` rows.
   *
   * @param winPos One of `[:left :center :right]`. This combined with the default
   *        edge mode of `:clamp` dictates the windows of data the reducer sees.
   */
  public static Map fixedWindow(long windowSize, Keyword winPos) {
    return hashmap(kw("window-type"), kw("fixed"),
                   kw("window-size"), windowSize,
                   kw("relative-window-position"), winPos);
  }
  /**
   * Return a fixed size rolling window specification. The window is fixed over
   * `window-size` rows.
   *
   * @param winPos One of `[:left :center :right]`. This combined with the edge mode
   *        dictates the windows of data the reducer sees.
   * @param edgeMode One of `[:zero, null, :clamp]`. Clamp means repeat the end value.
   */
  public static Map fixedWindow(long windowSize, Keyword winPos, Keyword edgeMode) {
    return hashmap(kw("window-type"), kw("fixed"),
                   kw("window-size"), windowSize,
                   kw("relative-window-position"), winPos,
                   kw("edge-mode"), edgeMode);
  }
  /**
   * Create a columnwise reducer. This reducer gets sub-windows from the column and
   * must return a scalar value. If srcColname is a vector of colnames then reduceFn
   * will be passed each column window as a separate argument.
   *
   * @param datatype Optional datatype; may be nil in which case the dataset will scan
   *        the result to infer the datatype. If provided this enforces the result
   *        column datatype. Reductions to numeric types with fixed datatypes will be
   *        slightly faster than generic reductions which require inference to find
   *        the final datatype.
   */
  public static Map reducer(Object srcColname, IFn reduceFn, Keyword datatype) {
    return hashmap(kw("column-name"), srcColname,
                   kw("reducer"), reduceFn,
                   kw("datatype"), datatype);
  }
  /**
   * Create a columnwise reducer eliding the datatype parameter. See documentation on
   * the 3-arity form of this function.
   */
  public static Map reducer(Object srcColname, IFn reduceFn) {
    return hashmap(kw("column-name"), srcColname,
                   kw("reducer"), reduceFn);
  }
  /** mean reducer */
  public static Map mean(Object colname) {
    return (Map)meanFn.invoke(colname);
  }
  /** sum reducer */
  public static Map sum(Object colname) {
    return (Map)sumFn.invoke(colname);
  }
  /** min reducer */
  public static Map min(Object colname) {
    return (Map)minFn.invoke(colname);
  }
  /** max reducer */
  public static Map max(Object colname) {
    return (Map)maxFn.invoke(colname);
  }
  /** standard-deviation reducer */
  public static Map stddev(Object colname) {
    return (Map)stddevFn.invoke(colname);
  }
  /** variance reducer */
  public static Map variance(Object colname) {
    return (Map)varianceFn.invoke(colname);
  }
  /** reducer that keeps the first value of each window */
  public static Map first(Object colname) {
    return (Map)firstFn.invoke(colname);
  }
  /** reducer that keeps the last value of each window */
  public static Map last(Object colname) {
    return (Map)lastFn.invoke(colname);
  }
}
|
||||
@@ -0,0 +1,100 @@
|
||||
package tech.v3.libs;
|
||||
|
||||
import static tech.v3.Clj.*;
|
||||
import clojure.lang.IFn;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
 * Bindings to save/load datasets in the Apache Arrow streaming format. These bindings
 * support JDK-17, memory mapping, and per-column compression.
 *
 * Required Dependencies:
 *
 *```clojure
 *[org.apache.arrow/arrow-vector "6.0.0"]
 *[org.lz4/lz4-java "1.8.0"]
 *[com.github.luben/zstd-jni "1.5.1-1"]
 *```
 */
public class Arrow {

  // Static API only - not instantiable.
  private Arrow(){}

  // Lazily-resolved vars from the tech.v3.libs.arrow Clojure namespace.
  static final IFn dsToStreamFn = requiringResolve("tech.v3.libs.arrow", "dataset->stream!");
  static final IFn streamToDsFn = requiringResolve("tech.v3.libs.arrow", "stream->dataset");
  static final IFn dsSeqToStreamFn = requiringResolve("tech.v3.libs.arrow", "dataset-seq->stream!");
  static final IFn streamToDsSeqFn = requiringResolve("tech.v3.libs.arrow", "stream->dataset-seq");

  /**
   * Save a dataset to the Apache Arrow streaming format.
   *
   * Options:
   *
   * * `:strings-as-text?` - defaults to false - Save out strings into arrow files
   *   without dictionaries. This works well if you want to load an arrow file
   *   in-place or if you know the strings in your dataset are either really large or
   *   should not be in string tables.
   *
   * * `:compression` - Either `:zstd` or `:lz4`, defaults to no compression (nil).
   *   Per-column compression of the data can result in some significant size savings
   *   (2x+) and thus some significant time savings when transferring over the network.
   *   Using compression makes loading via mmap non-in-place - If you are going to use
   *   compression mmap probably doesn't make sense on load and most likely will
   *   result on slower loading times. Zstd can also be passed in map form with an
   *   additional parameter, `:level` which defaults to 3.
   *
   *```java
   * //Slightly higher compression than the default.
   *datasetToStream(ds, "data.arrow-ipc", hashmap(kw("compression"),
   *                                              hashmap(kw("compression-type"), kw("zstd"),
   *                                                      kw("level"), 5)));
   *```
   *
   * @param ds The dataset to save.
   * @param pathOrInputStream Destination path or stream to write to.
   *        NOTE(review): the name suggests an input stream but this is a write
   *        pathway - presumably a path or output stream; confirm against
   *        tech.v3.libs.arrow/dataset-&gt;stream!.
   * @param options Options map described above; may be nil.
   */
  public static void datasetToStream(Object ds, Object pathOrInputStream, Object options) {
    dsToStreamFn.invoke(ds, pathOrInputStream, options);
  }
  /**
   * Save a sequence of datasets to a single stream file. Datasets must either have matching
   * schemas or downstream dataset column datatypes must be able to be widened to the initial
   * dataset column datatypes.
   *
   * For options see `datasetToStream`.
   */
  public static void datasetSeqToStream(Iterable dsSeq, Object pathOrInputStream, Object options) {
    dsSeqToStreamFn.invoke(dsSeq, pathOrInputStream, options);
  }
  /**
   * Load an apache arrow streaming file returning a single dataset. File must only contain a
   * single record batch.
   *
   * Options:
   *
   * * `:open-type` - Either `:mmap` or `:input-stream` defaulting to the slower but more robust
   *   `:input-stream` pathway. When using `:mmap` resources will be released when the resource
   *   system dictates - see documentation for [tech.v3.DType.stackResourceContext](https://cnuernber.github.io/dtype-next/javadoc/index.html).
   *   When using `:input-stream` the stream will be closed when the lazy sequence is either fully realized or an
   *   exception is thrown.
   *
   * * `:close-input-stream?` - When using `:input-stream` `:open-type`, close the input
   *   stream upon exception or when stream is fully realized. Defaults to true.
   *
   * * `:integer-datetime-types?` - when true arrow columns in the appropriate packed
   *   datatypes will be represented as their integer types as opposed to their respective
   *   packed types. For example columns of type `:epoch-days` will be returned to the user
   *   as datatype `:epoch-days` as opposed to `:packed-local-date`. This means reading values
   *   will return integers as opposed to `java.time.LocalDate`s.
   *
   * @param pathOrInputStream Source path or input stream.
   * @param options Options map described above; may be nil.
   */
  public static Map streamToDataset(Object pathOrInputStream, Object options) {
    return (Map)streamToDsFn.invoke(pathOrInputStream, options);
  }

  /**
   * Load an apache arrow streaming file returning a sequence of datasets, one for each record batch.
   * For options see streamToDataset.
   */
  public static Iterable streamToDatasetSeq(Object pathOrInputStream, Object options) {
    return (Iterable)streamToDsSeqFn.invoke(pathOrInputStream, options);
  }
}
|
||||
@@ -0,0 +1,59 @@
|
||||
package tech.v3.libs;
|
||||
|
||||
|
||||
import static tech.v3.Clj.*;
|
||||
import clojure.lang.IFn;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Read/write parquet files. Uses the standard hadoop parquet library. One aspect that
 * may be confusing is that when writing files the parquet system decides when to end
 * the record batch so a single dataset may end up as a single parquet file with many
 * record batches.
 *
 * Note that in the requiring dependencies I remove slf4j. tmd comes with logback-classic
 * by default which is less featureful but far less of a security disaster than slf4j. If you
 * have a setup that already uses slf4j then you should exclude logback-classic from
 * tmd's dependencies.
 *
 * You must disable debug logging else the parquet system is unreasonably slow. See logging
 * section of [parquet namespace](https://techascent.github.io/tech.ml.dataset/tech.v3.libs.parquet.html).
 *
 * Required dependencies:
 *
 *```clojure
 *org.apache.parquet/parquet-hadoop {:mvn/version "1.12.0"
 *                                   :exclusions [org.slf4j/slf4j-log4j12]}
 *org.apache.hadoop/hadoop-common {:mvn/version "3.3.0"
 *                                 :exclusions [org.slf4j/slf4j-log4j12]}
 *;; We literally need this for 1 POJO formatting object.
 *org.apache.hadoop/hadoop-mapreduce-client-core {:mvn/version "3.3.0"
 *                                                :exclusions [org.slf4j/slf4j-log4j12]}
 *```
 */
public class Parquet
{
  // Static API only - not instantiable.
  private Parquet(){}

  // Lazily-resolved vars from the tech.v3.libs.parquet Clojure namespace.
  static final IFn dsToParquetFn = requiringResolve("tech.v3.libs.parquet", "ds->parquet");
  static final IFn dsSeqToParquetFn = requiringResolve("tech.v3.libs.parquet", "ds-seq->parquet");
  static final IFn parquetToDsSeqFn = requiringResolve("tech.v3.libs.parquet", "parquet->ds-seq");
  static final IFn parquetToDsFn = requiringResolve("tech.v3.libs.parquet", "parquet->ds");
  static final IFn parquetToMetadataSeq = requiringResolve("tech.v3.libs.parquet", "parquet->metadata-seq");

  /**
   * Return a sequence of metadata, one entry per record batch, for the parquet file
   * at `path` - delegates to tech.v3.libs.parquet/parquet-&gt;metadata-seq.
   */
  public static Iterable parquetMetadata(String path) {
    return (Iterable)parquetToMetadataSeq.invoke(path);
  }
  /**
   * Load a parquet file as a single dataset.
   *
   * @param path Path to the parquet file.
   * @param options Options map forwarded to tech.v3.libs.parquet/parquet-&gt;ds;
   *        may be nil.
   */
  public static Map parquetToDataset(String path, Object options) {
    return (Map)parquetToDsFn.invoke(path, options);
  }
  /**
   * Load a parquet file as a sequence of datasets, one per record batch.
   *
   * @param path Path to the parquet file.
   * @param options Options map forwarded to tech.v3.libs.parquet/parquet-&gt;ds-seq;
   *        may be nil.
   */
  public static Iterable parquetToDatasetSeq(String path, Object options) {
    return (Iterable)parquetToDsSeqFn.invoke(path, options);
  }
  /**
   * Write a single dataset to a parquet file. Note the parquet system may split the
   * dataset into multiple record batches (see class docs).
   */
  public static void datasetToParquet(Object ds, String path, Object options) {
    dsToParquetFn.invoke(ds, path, options);
  }
  /**
   * Write a sequence of datasets to a single parquet file.
   */
  public static void datasetSeqToParquet(Iterable dsSeq, String path, Object options) {
    dsSeqToParquetFn.invoke(dsSeq, path, options);
  }
}
|
||||
Reference in New Issue
Block a user