package tech.v3; import static tech.v3.Clj.*; import clojure.lang.Keyword; import clojure.lang.IFn; import java.util.Map; import java.util.Comparator; import org.roaringbitmap.RoaringBitmap; import tech.v3.datatype.Buffer; import tech.v3.datatype.NDBuffer; import ham_fisted.IFnDef; /** * `tech.ml.dataset` is a high-performance library for processing columnar data similar to * pandas or R's data.table. Datasets are `maps` of their columns and columns derive from * various Clojure interfaces such as IIndexed and IFn to make accessing their data as easy * as possible. * * Columns have a conversion to a `tech.v3.datatype.Buffer` object accessible via * `tech.v3.DType.toBuffer()` so if you want higher performance non-boxing access that is * also available. Any bit of sequential data can be turned into a column. The best approach, * if the data is already in a primitive array or nio buffer, is to use that as a column - it * will be used in place. It is also possible to directly instantiate a Buffer object in * a read-only pathway to create a virtualized column: * *```java *println(head(assoc(colmapDs, kw("c"), new tech.v3.datatype.LongReader() { * public long lsize() { return 10; } * public long readLong(long idx) { * return 2*idx; * } * }))); * //testds [5 3]: * * //| :b | :a | :c | * //|----:|---:|---:| * //| 9.0 | 0 | 0 | * //| 8.0 | 1 | 2 | * //| 7.0 | 2 | 4 | * //| 6.0 | 3 | 6 | * //| 5.0 | 4 | 8 | *``` * * Datasets implement a subset of java.util.Map and Clojure's persistent map interfaces. * This means you can use various `java.util.Map` functions and you can also use * `clojure.core/assoc`, `clojure.core/dissoc`, and `clojure.core/merge` in order to add and * remove columns from the dataset. These are exposed in `tech.v3.Clj` as equivalently named * functions. In combination with the fact that columns implement clojure.lang.IIndexed * providing `nth` as well as the single-arity IFn invoke method, you can do a surprising * amount of dataset processing without using bespoke TMD functions at all. * * All of the functions in `tech.v3.datatype.VecMath` will work with columns, although most * of those functions require the columns to have no missing data. The recommendation is to * do your missing-value processing first and then move into the various elemwise functions. * Integer columns with missing values will upcast themselves to double columns for any math * operation so the result stays consistent with respect to NaN behavior. Again, ideally missing values * should be dealt with before doing operations in the `VecMath` namespace. * * Most of the functions of the dataset (filter, sort, groupBy) will auto-parallelize, * but there are many times where the most efficient use of machine resources is to * parallelize at the outermost level using `pmapDS`. The parallelization primitives check and * run in serial mode if the current thread is already in a parallelization pathway.
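 *
 * For example, because datasets are maps of their columns and columns are IFns of their
 * row index, basic access needs no TMD-specific calls at all. A minimal sketch using the
 * `colmapDs` dataset from the `makeDataset` example below (boxed return types depend on
 * the column datatype):
 *
 *```java
 *Object col = column(colmapDs, kw("a"));    //grab column :a
 *println(call(col, 3));                     //columns are IFns of their row index - here 3
 *println(head(dissoc(colmapDs, kw("b")))); //drop a column with plain dissoc
 *```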
* */ public class TMD { private TMD(){} static final IFn toDatasetFn = requiringResolve("tech.v3.dataset-api", "->dataset"); static final IFn isDatasetFn = requiringResolve("tech.v3.dataset-api", "dataset?"); static final IFn rowCountFn = requiringResolve("tech.v3.dataset.base", "row-count"); static final IFn columnCountFn = requiringResolve("tech.v3.dataset.base", "column-count"); static final IFn columnFn = requiringResolve("tech.v3.dataset", "column"); static final IFn missingFn = requiringResolve("tech.v3.dataset-api", "missing"); static final IFn selectFn = requiringResolve("tech.v3.dataset-api", "select"); static final IFn selectRowsFn = requiringResolve("tech.v3.dataset-api", "select-rows"); static final IFn dropRowsFn = requiringResolve("tech.v3.dataset-api", "drop-rows"); static final IFn selectColumnsFn = requiringResolve("tech.v3.dataset-api", "select-columns"); static final IFn dropColumnsFn = requiringResolve("tech.v3.dataset-api", "drop-columns"); static final IFn renameColumnsFn = requiringResolve("tech.v3.dataset-api", "rename-columns"); static final IFn replaceMissingFn = requiringResolve("tech.v3.dataset.missing", "replace-missing"); static final IFn rowsFn = requiringResolve("tech.v3.dataset", "rows"); static final IFn rowvecsFn = requiringResolve("tech.v3.dataset", "rowvecs"); static final IFn headFn = requiringResolve("tech.v3.dataset", "head"); static final IFn tailFn = requiringResolve("tech.v3.dataset", "tail"); static final IFn sampleFn = requiringResolve("tech.v3.dataset", "sample"); static final IFn shuffleFn = requiringResolve("tech.v3.dataset", "shuffle"); static final IFn reverseFn = requiringResolve("tech.v3.dataset", "reverse-rows"); static final IFn columnMapFn = requiringResolve("tech.v3.dataset", "column-map"); static final IFn rowMapFn = requiringResolve("tech.v3.dataset", "row-map"); static final IFn rowMapcatFn = requiringResolve("tech.v3.dataset", "row-mapcat"); static final IFn pmapDsFn = requiringResolve("tech.v3.dataset", "pmap-ds"); static final IFn sortByFn = requiringResolve("tech.v3.dataset", "sort-by"); static final IFn sortByColumnFn = requiringResolve("tech.v3.dataset", "sort-by-column"); static final IFn filterFn = requiringResolve("tech.v3.dataset", "filter"); static final IFn filterColumnFn = requiringResolve("tech.v3.dataset", "filter-column"); static final IFn groupByFn = requiringResolve("tech.v3.dataset", "group-by"); static final IFn groupByColumnFn = requiringResolve("tech.v3.dataset", "group-by-column"); static final IFn concatCopyingFn = requiringResolve("tech.v3.dataset", "concat-copying"); static final IFn concatInplaceFn = requiringResolve("tech.v3.dataset", "concat-inplace"); static final IFn uniqueByFn = requiringResolve("tech.v3.dataset", "unique-by"); static final IFn uniqueByColumnFn = requiringResolve("tech.v3.dataset", "unique-by-column"); static final IFn descriptiveStatsFn = requiringResolve("tech.v3.dataset", "descriptive-stats"); static final IFn pdMergeFn = requiringResolve("tech.v3.dataset.join", "pd-merge"); static final IFn joinAsof = requiringResolve("tech.v3.dataset.join", "left-join-asof"); static final Object toNeanderthalDelay = delay(new IFnDef() { public Object invoke() { //Bindings to make as-tensor work with neanderthal require("tech.v3.libs.neanderthal"); //Actual function to convert a dataset into a neanderthal double or float matrix. 
return requiringResolve("tech.v3.dataset.neanderthal", "dataset->dense"); } }); static final Object neanderthalToDatasetDelay = delay(new IFnDef() { public Object invoke() { //tensor bindings require("tech.v3.libs.neanderthal"); return requiringResolve("tech.v3.dataset.neanderthal", "dense->dataset"); } }); static final Object toTensorDelay = delay(new IFnDef() { public Object invoke() { return requiringResolve("tech.v3.dataset.tensor", "dataset->tensor"); } }); static final Object tensorToDatasetDelay = delay(new IFnDef() { public Object invoke() { return requiringResolve("tech.v3.dataset.tensor", "tensor->dataset"); } }); static final IFn writeFn = requiringResolve("tech.v3.dataset", "write!"); /** * Basic pathway to take data and get back a datasets. If dsData is a string * a built in system can parse csv, tsv, csv.gz, tsv.gz, .json, json.gz and .nippy format files. * Specific other formats such as xlsx, apache arrow and parquet formats are provided * in other classes. * * Aside from string data formats, you can explicitly provide either a sequence of maps * or a map of columns with the map of columns being by far more the most efficient. * In the map-of-columns approach arrays of primitive numeric data and native buffers will * be used in-place. * * The options for parsing a dataset are extensive and documented at * [->dataset](https://techascent.github.io/tech.ml.dataset/tech.v3.dataset.html#var--.3Edataset). * * Example: * *```java * Map ds = makeDataset("https://github.com/techascent/tech.ml.dataset/raw/master/test/data/stocks.csv"); * tech.v3.Clj.println(head(ds)); * // https://github.com/techascent/tech.ml.dataset/raw/master/test/data/stocks.csv [5 3]: * // | symbol | date | price | * // |--------|------------|------:| * // | MSFT | 2000-01-01 | 39.81 | * // | MSFT | 2000-02-01 | 36.35 | * // | MSFT | 2000-03-01 | 43.22 | * // | MSFT | 2000-04-01 | 28.37 | * // | MSFT | 2000-05-01 | 25.45 | *Map colmapDs = makeDataset(hashmap(kw("a"), range(10), * kw("b"), toDoubleArray(range(9,-1,-1))), * hashmap(kw("dataset-name"), "testds")); *println(colmapDs); * // testds [10 2]: * * // | :b | :a | * // |----:|---:| * // | 9.0 | 0 | * // | 8.0 | 1 | * // | 7.0 | 2 | * // | 6.0 | 3 | * // | 5.0 | 4 | * // | 4.0 | 5 | * // | 3.0 | 6 | * // | 2.0 | 7 | * // | 1.0 | 8 | * // | 0.0 | 9 | *``` */ public static Map makeDataset(Object dsData, Map options) { return (Map)call(toDatasetFn, dsData, options); } /** * Make a dataset. See 2-arity form of function. */ public static Map makeDataset(Object dsData) { return (Map)call(toDatasetFn, dsData, null); } /** * Returns true if this object is a dataset. */ public static boolean isDataset(Object ds) { return (boolean)call(isDatasetFn, ds); } /** Return the number of rows. */ public static long rowCount(Object ds) { return (long)call(rowCountFn, ds); } /** Return the number of columns. */ public static long columnCount(Object ds) { return (long)call(columnCountFn, ds); } /** Return the column named `cname` else throw exception. */ public static Object column(Object ds, Object cname) { return call(columnFn, ds, cname); } /** * Efficiently create a column definition explicitly specifying name and data. * Typed data will be scanned for missing values and untyped data will be read * element by element to discern datatype and missing information. * The result can be `assoc`d back into the dataset. 
*/ public static Map columnDef(Object name, Object data) { return hashmap(keyword("tech.v3.dataset", "name"), name, keyword("tech.v3.dataset", "data"), data); } /** * Efficiently create a column definition explicitly specifying name, data, and missing. * The result can be `assoc`d back into the dataset. Missing will be converted to a RoaringBitmap * but can also be provided as an integer array, a Java set, or a sequence of integers. */ public static Map columnDef(Object name, Object data, Object missing) { return hashmap(keyword("tech.v3.dataset", "name"), name, keyword("tech.v3.dataset", "data"), data, keyword("tech.v3.dataset", "missing"), missing); } /** * Efficiently create a column definition explicitly specifying name, data, missing, and * metadata. The result can be `assoc`d back into the dataset and saves the system the * time required to scan for missing elements. Missing will be converted to a RoaringBitmap * but can also be provided as an integer array, a Java set, or a sequence of integers. */ public static Map columnDef(Object name, Object data, Object missing, Object metadata) { return hashmap(keyword("tech.v3.dataset", "name"), name, keyword("tech.v3.dataset", "data"), data, keyword("tech.v3.dataset", "missing"), missing, keyword("tech.v3.dataset", "metadata"), metadata); } /** * Select a sub-rect of the dataset. `columnNames` is a sequence of column names that must * exist in the dataset. `rows` is a sequence, list, array, or bitmap of integer row * indexes to select. The returned dataset has columns in the order specified * by `columnNames`. */ public static Map select(Object ds, Object columnNames, Object rows) { return (Map)call(selectFn, ds, columnNames, rows); } /** * Select columns by name. All names must exist in the dataset. */ public static Map selectColumns(Object ds, Object columnNames ) { return (Map)call(selectColumnsFn, ds, columnNames); } /** * Drop columns by name. All names must exist in the dataset. * Another option is to use the Clojure function `dissoc`. */ public static Map dropColumns(Object ds, Object columnNames ) { return (Map)call(dropColumnsFn, ds, columnNames); } /** * Rename columns providing a map of oldname to newname. */ public static Map renameColumns(Object ds, Map renameMap) { return (Map)call(renameColumnsFn, ds, renameMap); } /** * Select rows by index. */ public static Map selectRows(Object ds, Object rowIndexes) { return (Map)call(selectRowsFn, ds, rowIndexes); } /** * Drop rows by index. */ public static Map dropRows(Object ds, Object rowIndexes) { return (Map)call(dropRowsFn, ds, rowIndexes); } /** * Return the missing set of a dataset or a column in the form of a RoaringBitmap. */ public static RoaringBitmap missing(Object dsOrColumn) { return (RoaringBitmap)call(missingFn, dsOrColumn); } /** * Replace the missing values in a column or set of columns. To replace across * all columns use the keyword :all. * * Strategy can be: * * * `:up` - take next value * * `:down` - take previous value * * `:lerp` - linearly interpolate across values. Datetime objects will have * interpolation done in millisecond space. * * `vector(:value, val)` - Provide this value explicitly to replace entries. * * `:nearest` - use the nearest value. * * `:midpoint` - use the mean of the range. * * `:abb` - impute missing values using approximate Bayesian bootstrap. * * Further documentation is located at [replace-missing](https://techascent.github.io/tech.ml.dataset/tech.v3.dataset.html#var-replace-missing).
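 *
 * Example - a minimal sketch assuming a dataset `ds` with missing values in column `:a`:
 *
 *```java
 *//fill from the previous valid value across all columns
 *Map filled = replaceMissing(ds, kw("down"));
 *//replace missing entries of column :a with an explicit value
 *Map zeroed = replaceMissing(ds, vector(kw("value"), 0), vector(kw("a")));
 *```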
*/ public static Map replaceMissing(Object ds, Object strategy, Object columns) { Keyword actualStrat; Object value; if (isVector(strategy)) { actualStrat = (Keyword)call(strategy,0); value = call(strategy,1); } else { actualStrat = (Keyword)strategy; value = null; } if (value != null ) { return (Map)call(replaceMissingFn, ds, columns, actualStrat, value); } else { return (Map)call(replaceMissingFn, ds, columns, actualStrat); } } /** * Replace missing values. See 3-arity form of function for documentation. */ public static Map replaceMissing(Object ds, Object strategy) { return replaceMissing(ds, strategy, kw("all")); } /** * Return the rows of the dataset in a flyweight map format. Maps share keys * and read their data lazily from the base dataset. */ public static Buffer rows(Object ds) { return (Buffer)call(rowsFn, ds); } /** * Return the rows of the dataset where each row is just a flat Buffer of data. * * When copying is true, data is copied upon each access from the underlying dataset. This * makes doing something like using each row as the key in a map more efficient. */ public static Buffer rowvecs(Object ds, boolean copying) { return (Buffer)call(rowvecsFn, ds, hashmap(kw("copying?"), copying)); } /** Return the rows of the dataset where each row is just a flat Buffer of data. */ public static Buffer rowvecs(Object ds) { return (Buffer)call(rowvecsFn, ds); } /** Return the first 5 rows of the dataset */ public static Map head(Object ds) { return (Map)call(headFn, ds); } /** Return the first N rows of the dataset */ public static Map head(Object ds, long nRows) { return (Map)call(headFn, ds, nRows); } /** Return the last 5 rows of the dataset */ public static Map tail(Object ds) { return (Map)call(tailFn, ds); } /** Return the last N rows of the dataset */ public static Map tail(Object ds, long nRows) { return (Map)call(tailFn, ds, nRows); } /** Return a random sampling of 5 rows of the data without replacement */ public static Map sample(Object ds) { return (Map)call(sampleFn, ds); } /** Return a random sampling of N rows of the data without replacement */ public static Map sample(Object ds, long nRows) { return (Map)call(sampleFn, ds, nRows); } /** * Return a random sampling of N rows of the data. * * Options: * * * `:replacement?` - Do sampling with replacement. Defaults to false. * * `:seed` - Either an integer or an implementation of java.util.Random. */ public static Map sample(Object ds, long nRows, Map options) { return (Map)call(sampleFn, ds, nRows, options); } /** Randomly shuffle the dataset rows. */ public static Map shuffle(Object ds) { return (Map)call(shuffleFn, ds); } /** * Randomly shuffle the dataset rows. * * Options: * * * `:seed` - Either an integer or an implementation of java.util.Random. */ public static Map shuffle(Object ds, Map options) { return (Map)call(shuffleFn, ds, options); } /** Reverse the rows of the dataset */ public static Map reverseRows(Object ds) { return (Map)call(reverseFn, ds); } /** * Map a function across 1 or more columns to produce a new column. The new column is * serially scanned to detect datatype and its missing set. */ public static Map columnMap(Object ds, Object resultCname, IFn mapFn, Object srcCnames) { return (Map)call(columnMapFn, ds, resultCname, mapFn, srcCnames); } /** * Map a function across the rows of the dataset with each row in map form. Function must * return a new map for each row.
Result is generated in parallel so, when used with a map * factory, this is a surprisingly efficient strategy to create multiple columns at once from * each row. */ public static Map rowMap(Object ds, IFn mapFn) { return (Map)call(rowMapFn, ds, mapFn); } /** * Map a function across the rows of the dataset with each row in map form. Function must * return a new map for each row. Result is generated in parallel so, when used with a map * factory, this is a surprisingly efficient strategy to create multiple columns at once from * each row. * * See options for pmapDS. Especially note `:max-batch-size` and `:result-type`. In * order to conserve memory it may be much more efficient to return a sequence of * datasets rather than one large dataset. If returning sequences of datasets, * perhaps consider a transducing pathway across them or the * tech.v3.dataset.reductions namespace. */ public static Object rowMap(Object ds, IFn mapFn, Object options) { return call(rowMapFn, ds, mapFn, options); } /** * Map a function across the rows of the dataset with each row in map form. Function must * return either null or a sequence of maps and thus can produce many new rows for * each input row. Function is called in a parallelized context. Maps returned * must be an implementation of Clojure's IPersistentMap. See [tech.v3.Clj.mapFactory](https://cnuernber.github.io/dtype-next/javadoc/tech/v3/DType.html#mapFactory-java.util.List-) * for an efficient way to create those in bulk. * * See options for pmapDS. Especially note `:max-batch-size` and `:result-type`. In * order to conserve memory it may be much more efficient to return a sequence of * datasets rather than one large dataset. If returning sequences of datasets, * perhaps consider a transducing pathway across them or the * tech.v3.dataset.reductions namespace. */ public static Object rowMapcat(Object ds, IFn mapFn, Object options) { return call(rowMapcatFn, ds, mapFn, options); } /** * Parallelize mapping a function from dataset->dataset across a dataset. Function may * return null. The original dataset is simply sliced into n-core results and * map-fn is called n-core times with the results either concatenated into a new dataset or * returned as an Iterable. * * Most of the functions of the dataset (filter, sort, groupBy) will auto-parallelize, * but there are many times where the most efficient use of machine resources is to * parallelize at the outermost level. The parallelization primitives check and run in * serial mode if the current thread is already in a parallelization pathway. * * @param mapFn a function from dataset->dataset although it may return null. * * Options: * * * `:max-batch-size` - Defaults to 64000. This controls the size of each parallelized * chunk. * * `:result-type` - Either `:as-seq` in which case the output of this function is a * sequence of datasets or `:as-ds` in which case the output is a single dataset. The * default is `:as-ds`. */ public static Object pmapDS(Object ds, IFn mapFn, Object options) { return call(pmapDsFn, ds, mapFn, options); } /** * Sort a dataset by first mapping `sortFn` over it and then sorting over the result. * `sortFn` is passed each row in map form and the return value is used to sort the * dataset. * * @param sortFn function taking a single argument which is the row-map and returns the value * to sort on. * @param compareFn Comparison operator or comparator. Some examples are the Clojure '<' or * '>' operators - tech.v3.Clj.lessThanFn, tech.v3.Clj.greaterThanFn.
The Clojure keywords * `:tech.numerics/<` and `:tech.numerics/>` can be used for somewhat higher performance * unboxed primitive comparisons or the Clojure function `compare` - tech.v3.Clj.compareFn - * which is similar to .compareTo except it works with null and the input must implement * Comparable. Finally, you can provide an instance of java.util.Comparator. * * Options: * * * `:nan-strategy` - General missing strategy. Options are `:first`, `:last`, and * `:exception`. * * `:parallel?` - Uses parallel quicksort when true and regular quicksort when false. */ public static Map sortBy(Object ds, IFn sortFn, Object compareFn, Object options) { return (Map)call(sortByFn, ds, sortFn, compareFn, options); } /** Sort a dataset. See documentation of 4-arity version.*/ public static Map sortBy(Object ds, IFn sortFn, Object compareFn) { return (Map)call(sortByFn, ds, sortFn, compareFn, null); } /** Sort a dataset. See documentation of 4-arity version.*/ public static Map sortBy(Object ds, IFn sortFn) { return (Map)call(sortByFn, ds, sortFn); } /** * Sort a dataset using the values from column `cname`. * * @param compareFn Comparison operator or comparator. Some examples are the Clojure '<' or * '>' operators - tech.v3.Clj.lessThanFn, tech.v3.Clj.greaterThanFn. The Clojure keywords * `:tech.numerics/<` and `:tech.numerics/>` can be used for somewhat higher performance * unboxed primitive comparisons or the Clojure function `compare` - tech.v3.Clj.compareFn - * which is similar to .compareTo except it works with null and the input must implement * Comparable. Finally, you can provide an instance of java.util.Comparator. * * Options: * * * `:nan-strategy` - General missing strategy. Options are `:first`, `:last`, and * `:exception`. * * `:parallel?` - Uses parallel quicksort when true and regular quicksort when false. */ public static Map sortByColumn(Object ds, Object cname, Object compareFn, Object options) { return (Map)call(sortByColumnFn, ds, cname, compareFn, options); } /** Sort a dataset by a specific column. See documentation on 4-arity version.*/ public static Map sortByColumn(Object ds, Object cname, Object compareFn) { return (Map)call(sortByColumnFn, ds, cname, compareFn, null); } /** Sort a dataset by a specific column. See documentation on 4-arity version.*/ public static Map sortByColumn(Object ds, Object cname) { return (Map)call(sortByColumnFn, ds, cname); } /** * Filter a dataset. Predicate gets passed each row in map form and must return a `truthy` value. */ public static Map filter(Object ds, IFn predicate) { return (Map)call(filterFn, ds, predicate); } /** * Filter a dataset. Predicate gets passed each value from column cname and must * return a `truthy` value. */ public static Map filterColumn(Object ds, Object cname, IFn predicate) { return (Map)call(filterColumnFn, ds, cname, predicate); } /** * Group a dataset returning a Map of keys to dataset. * * @param groupFn Gets passed each row in map format and must return the desired key. * * @return a map of key to dataset. */ public static Map groupBy(Object ds, IFn groupFn) { return (Map)call(groupByFn, ds, groupFn); } /** * Group a dataset by a specific column returning a Map of keys to dataset. * * @return a map of key to dataset. */ public static Map groupByColumn(Object ds, Object cname) { return (Map)call(groupByColumnFn, ds, cname); } /** * Concatenate an Iterable of datasets into one dataset via copying data into one * dataset.
This generally results in higher performance than an in-place concatenation * with the exception of small (< 3) numbers of datasets. Null datasets will be silently * ignored. */ public static Map concatCopying(Object datasets) { return (Map)call(applyFn, concatCopyingFn, datasets); } /** * Concatenate an Iterable of datasets into one dataset via creating virtual buffers that * index into the previous datasets. This generally results in lower performance than a * copying concatenation with the exception of small (< 3) numbers of datasets. Null * datasets will be silently ignored. */ public static Map concatInplace(Object datasets) { return (Map)call(applyFn, concatInplaceFn, datasets); } /** * Create a dataset with no duplicates by taking the first of duplicate values. * * @param uniqueFn is passed a row and must return the uniqueness criteria. A common uniqueFn is * the identity function. */ public static Map uniqueBy(Object ds, IFn uniqueFn) { return (Map)call(uniqueByFn, ds, uniqueFn); } /** * Make a dataset unique using a particular column as the uniqueness criteria and taking * the first value. */ public static Map uniqueByColumn(Object ds, Object cname) { return (Map)call(uniqueByColumnFn, ds, cname); } /** * Create a dataset of the descriptive statistics of the input dataset. This works with * date-time columns, missing values, etc., and serves as a very fast way to quickly get a feel * for a dataset. * * Options: * * * `:stat-names` - A set of desired stat names. Possible statistic operations are: * `[:col-name :datatype :n-valid :n-missing :min :quartile-1 :mean :mode :median * :quartile-3 :max :standard-deviation :skew :n-values :values :histogram :first * :last]` */ public static Map descriptiveStats(Object ds, Object options) { return (Map)call(descriptiveStatsFn, ds, options); } /** * Create a dataset of the descriptive statistics of the input dataset. This works with * date-time columns, missing values, etc., and serves as a very fast way to quickly get a feel * for a dataset. * * Options: * * * `:stat-names` - A set of desired stat names. Possible statistic operations are: * `[:col-name :datatype :n-valid :n-missing :min :quartile-1 :mean :mode :median * :quartile-3 :max :standard-deviation :skew :n-values :values :histogram :first * :last]` */ public static Map descriptiveStats(Object ds) { return (Map)call(descriptiveStatsFn, ds); } /** * Perform a join operation between two datasets. * * Options: * * * `:on` - column name or list of column names. Names must be found in both datasets. * * `:left-on` - Column name or list of column names * * `:right-on` - Column name or list of column names * * `:how` - `:left`, `:right`, `:inner`, `:outer`, `:cross`. If `:cross`, then it is * an error to provide `:on`, `:left-on`, `:right-on`. Defaults to `:inner`. * * Examples: * *```java *Map dsa = makeDataset(hashmap("a", vector("a", "b", "b", "a", "c"), * "b", range(5), * "c", range(5))); *println(dsa); * //_unnamed [5 3]: * * //| a | b | c | * //|---|--:|--:| * //| a | 0 | 0 | * //| b | 1 | 1 | * //| b | 2 | 2 | * //| a | 3 | 3 | * //| c | 4 | 4 | * * *Map dsb = makeDataset(hashmap("a", vector("a", "b", "a", "b", "d"), * "b", range(5), * "c", range(6,11))); *println(dsb); * //_unnamed [5 3]: * * //| a | b | c | * //|---|--:|---:| * //| a | 0 | 6 | * //| b | 1 | 7 | * //| a | 2 | 8 | * //| b | 3 | 9 | * //| d | 4 | 10 | * * //Join on the columns a,b.
Default join mode is inner * println(join(dsa, dsb, hashmap(kw("on"), vector("a", "b")))); * //inner-join [2 4]: * * //| a | b | c | right.c | * //|---|--:|--:|--------:| * //| a | 0 | 0 | 6 | * //| b | 1 | 1 | 7 | * * * //Outer join on same columns *println(join(dsa, dsb, hashmap(kw("on"), vector("a", "b"), * kw("how"), kw("outer")))); * //outer-join [8 4]: * * //| a | b | c | right.c | * //|---|--:|--:|--------:| * //| a | 0 | 0 | 6 | * //| b | 1 | 1 | 7 | * //| b | 2 | 2 | | * //| a | 3 | 3 | | * //| c | 4 | 4 | | * //| a | 2 | | 8 | * //| b | 3 | | 9 | * //| d | 4 | | 10 | *``` */ public static Map join(Map leftDs, Map rightDs, Map options) { return (Map)pdMergeFn.invoke(leftDs, rightDs, options); } /** * Perform a left join but join on nearest value as opposed to matching value. * Both datasets must be sorted by the join column and the join column itself * must be either a datetime column or a numeric column. When the join column * is a datetime column the join happens in millisecond space. * * Options: * * * `:asof-op` - One of the keywords `[:< :<= :nearest :>= :>]`. Defaults to `:<=`. * * Examples: * *```java *println(head(googPrices, 200)); * //GOOG [68 3]: * //| symbol | date | price | * //|--------|------------|-------:| * //| GOOG | 2004-08-01 | 102.37 | * //| GOOG | 2004-09-01 | 129.60 | * //| GOOG | 2005-03-01 | 180.51 | * //| GOOG | 2004-11-01 | 181.98 | * //| GOOG | 2005-02-01 | 187.99 | * //| GOOG | 2004-10-01 | 190.64 | * //| GOOG | 2004-12-01 | 192.79 | * //| GOOG | 2005-01-01 | 195.62 | * //| GOOG | 2005-04-01 | 220.00 | * //| GOOG | 2005-05-01 | 277.27 | * //| GOOG | 2005-08-01 | 286.00 | * //| GOOG | 2005-07-01 | 287.76 | * //| GOOG | 2008-11-01 | 292.96 | * //| GOOG | 2005-06-01 | 294.15 | * //| GOOG | 2008-12-01 | 307.65 | * //| GOOG | 2005-09-01 | 316.46 | * //| GOOG | 2009-02-01 | 337.99 | * //| GOOG | 2009-01-01 | 338.53 | * //| GOOG | 2009-03-01 | 348.06 | * //| GOOG | 2008-10-01 | 359.36 | * //| GOOG | 2006-02-01 | 362.62 | * //| GOOG | 2006-05-01 | 371.82 | * //| GOOG | 2005-10-01 | 372.14 | * //| GOOG | 2006-08-01 | 378.53 | * //| GOOG | 2006-07-01 | 386.60 | * //| GOOG | 2006-03-01 | 390.00 | * //| GOOG | 2009-04-01 | 395.97 | * //| GOOG | 2008-09-01 | 400.52 | * //| GOOG | 2006-09-01 | 401.90 | * //| GOOG | 2005-11-01 | 404.91 | * //| GOOG | 2005-12-01 | 414.86 | * //| GOOG | 2009-05-01 | 417.23 | * //| GOOG | 2006-04-01 | 417.94 | * //| GOOG | 2006-06-01 | 419.33 | * //| GOOG | 2009-06-01 | 421.59 | * //| GOOG | 2006-01-01 | 432.66 | * //| GOOG | 2008-03-01 | 440.47 | * //| GOOG | 2009-07-01 | 443.05 | * //| GOOG | 2007-02-01 | 449.45 | * //| GOOG | 2007-03-01 | 458.16 | * //| GOOG | 2006-12-01 | 460.48 | * //| GOOG | 2009-08-01 | 461.67 | * //| GOOG | 2008-08-01 | 463.29 | * //| GOOG | 2008-02-01 | 471.18 | * //| GOOG | 2007-04-01 | 471.38 | * //| GOOG | 2008-07-01 | 473.75 | * //| GOOG | 2006-10-01 | 476.39 | * //| GOOG | 2006-11-01 | 484.81 | * //| GOOG | 2009-09-01 | 495.85 | * //| GOOG | 2007-05-01 | 497.91 | * //| GOOG | 2007-01-01 | 501.50 | * //| GOOG | 2007-07-01 | 510.00 | * //| GOOG | 2007-08-01 | 515.25 | * //| GOOG | 2007-06-01 | 522.70 | * //| GOOG | 2008-06-01 | 526.42 | * //| GOOG | 2010-02-01 | 526.80 | * //| GOOG | 2010-01-01 | 529.94 | * //| GOOG | 2009-10-01 | 536.12 | * //| GOOG | 2010-03-01 | 560.19 | * //| GOOG | 2008-01-01 | 564.30 | * //| GOOG | 2007-09-01 | 567.27 | * //| GOOG | 2008-04-01 | 574.29 | * //| GOOG | 2009-11-01 | 583.00 | * //| GOOG | 2008-05-01 | 585.80 | * //| GOOG | 2009-12-01 | 619.98 | * //| GOOG | 2007-12-01 | 
691.48 | * //| GOOG | 2007-11-01 | 693.00 | * //| GOOG | 2007-10-01 | 707.00 | *Map targetPrices = makeDataset(hashmap("price", new Double[] { 200.0, 300.0, 400.0 })); *println(leftJoinAsof("price", targetPrices, googPrices, hashmap(kw("asof-op"), kw("<=")))); * //asof-<= [3 4]: * //| price | symbol | date | GOOG.price | * //|------:|--------|------------|-----------:| * //| 200.0 | GOOG | 2005-04-01 | 220.00 | * //| 300.0 | GOOG | 2008-12-01 | 307.65 | * //| 400.0 | GOOG | 2008-09-01 | 400.52 | * println(leftJoinAsof("price", targetPrices, googPrices, hashmap(kw("asof-op"), kw(">")))); * //asof-> [3 4]: * //| price | symbol | date | GOOG.price | * //|------:|--------|------------|-----------:| * //| 200.0 | GOOG | 2005-01-01 | 195.62 | * //| 300.0 | GOOG | 2005-06-01 | 294.15 | * //| 400.0 | GOOG | 2009-04-01 | 395.97 | *``` */ public static Map leftJoinAsof(Object colname, Map lhs, Map rhs, Object options) { return (Map)joinAsof.invoke(colname, lhs, rhs, options); } /** Perform a left ASOF join with default options. See documentation of the 4-arity version. */ public static Map leftJoinAsof(Object colname, Map lhs, Map rhs) { return (Map)joinAsof.invoke(colname, lhs, rhs); } /** * Convert a dataset to a neanderthal 2D matrix such that the columns of the dataset * become the columns of the matrix. This function dynamically loads the neanderthal * MKL bindings so there may be some pause when first called. If you would like to have * the pause somewhere else call `require("tech.v3.libs.neanderthal");` at an * earlier point in the program. You must have an up-to-date version of * neanderthal in your classpath such as `[uncomplicate/neanderthal "0.43.3"]`. * * See the [neanderthal documentation](https://neanderthal.uncomplicate.org/). * * @param layout One of `:column` or `:row`. * @param datatype One of `:float32` or `:float64`. * * Note that you can get a tech tensor (tech.v3.datatype.NDBuffer) from a neanderthal * matrix using `tech.v3.DType.asTensor()`. * */ public static Object toNeanderthal(Object ds, Keyword layout, Keyword datatype) { return call(deref(toNeanderthalDelay), ds, layout, datatype); } /** * Convert a dataset to a neanderthal 2D matrix such that the columns of the dataset * become the columns of the matrix. See documentation for the 3-arity version of the * function. This function creates a column-major float64 (double) matrix. */ public static Object toNeanderthal(Object ds) { return call(deref(toNeanderthalDelay), ds); } /** * Convert a neanderthal matrix to a dataset such that the columns of the matrix * become the columns of the dataset. Column names are the indexes of the columns. */ public static Map neanderthalToDataset(Object denseMat) { return (Map)call(deref(neanderthalToDatasetDelay), denseMat); } /** * Convert a dataset to a jvm-heap based 2D tensor such that the columns of the * dataset become the columns of the tensor. * * @param datatype Any numeric datatype - `:int8`, `:uint8`, `:float32`, `:float64`, etc. */ public static NDBuffer toTensor(Object ds, Keyword datatype) { return (NDBuffer)call(deref(toTensorDelay), ds, datatype); } /** * Convert a dataset to a jvm-heap based 2D double (float64) tensor. */ public static NDBuffer toTensor(Object ds) { return (NDBuffer)call(deref(toTensorDelay), ds); } /** * Convert a tensor to a dataset such that the columns of the tensor * become the columns of the dataset named after their index. */ public static Map tensorToDataset(Object tens) { return (Map)call(deref(tensorToDatasetDelay), tens); } /** * Write a dataset to disk as csv, tsv, csv.gz, tsv.gz, json, json.gz or nippy.
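 *
 * Example - a minimal sketch; `ds` is any dataset and the output path is illustrative:
 *
 *```java
 *writeDataset(ds, "output.csv.gz");
 *```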
* * Reading/writing to Parquet or Arrow is accessible via separate classes. */ public static void writeDataset(Object ds, String path, Object options) { call(writeFn, ds, path, options); } /** * Write a dataset to disk as csv, tsv, csv.gz, tsv.gz, json, json.gz or nippy. */ public static void writeDataset(Object ds, String path) { writeDataset(ds, path, null); } }