init research

This commit is contained in:
2026-02-08 11:20:43 -10:00
commit bdf064f54d
3041 changed files with 1592200 additions and 0 deletions
@@ -0,0 +1,329 @@
package tech.v3.dataset;
import static tech.v3.Clj.*;
import clojure.lang.IFn;
import clojure.lang.Keyword;
import java.util.Map;
/**
 * Functions related to training and evaluating ML models. The functions are grouped into
 * a few groups.
 *
 * For the purpose of this system, categorical data means a column of data that is not numeric.
 * It could be strings, keywords, or arbitrary objects.
 *
 * Minimal example extra dependencies for PCA:
 *
 *```console
 * [uncomplicate/neanderthal "0.43.3"]
 *```
 *
 * It is also important to note that you can serialize the fit results to nippy automatically
 * as included in dtype-next are extensions to nippy that work with tensors.
 */
public class Modelling {
private Modelling(){}
static final IFn fitCatFn = requiringResolve("tech.v3.dataset.categorical", "fit-categorical-map");
static final IFn transCatFn = requiringResolve("tech.v3.dataset.categorical", "transform-categorical-map");
static final IFn invCatFn = requiringResolve("tech.v3.dataset.categorical", "invert-categorical-map");
static final IFn fitOneHotFn = requiringResolve("tech.v3.dataset.categorical", "fit-one-hot");
static final IFn transOneHotFn = requiringResolve("tech.v3.dataset.categorical", "transform-one-hot");
static final IFn invOneHotFn = requiringResolve("tech.v3.dataset.categorical", "invert-one-hot-map");
static final IFn corrTableFn = requiringResolve("tech.v3.dataset.math", "correlation-table");
static final IFn fillRangeReplaceFn = requiringResolve("tech.v3.dataset.math", "fill-range-replace");
static final IFn fitPCAFn = requiringResolve("tech.v3.dataset.math", "fit-pca");
static final IFn fitStdScaleFn = requiringResolve("tech.v3.dataset.math", "fit-std-scale");
static final IFn fitMinMaxFn = requiringResolve("tech.v3.dataset.math", "fit-minmax");
static final IFn transPCAFn = requiringResolve("tech.v3.dataset.math", "transform-pca");
static final IFn transStdScaleFn = requiringResolve("tech.v3.dataset.math", "transform-std-scale");
static final IFn transMinMaxFn = requiringResolve("tech.v3.dataset.math", "transform-minmax");
static final IFn interpolateLOESSFn = requiringResolve("tech.v3.dataset.math", "interpolate-loess");
static final IFn kFoldFn = requiringResolve("tech.v3.dataset.modelling", "k-fold-datasets");
static final IFn trainTestFn = requiringResolve("tech.v3.dataset.modelling", "train-test-split");
static final IFn setInfTargetFn = requiringResolve("tech.v3.dataset.modelling", "set-inference-target");
static final IFn labelsFn = requiringResolve("tech.v3.dataset.modelling", "labels");
static final IFn probDistToLabel = requiringResolve("tech.v3.dataset.modelling", "probability-distributions->label-column");
static final IFn infTargetLabelMap = requiringResolve("tech.v3.dataset.modelling", "inference-target-label-map");
/**
 * Fit an object->integer transform that takes each value and assigns an integer to it. The
 * returned value can be used in transformCategorical to transform the dataset.
 *
 * Options:
 *
 * * `:table-args` - Either a sequence of vectors [col-val, idx] or a sorted sequence
 *   of column values where integers will be assigned as per the sorted sequence. Any values
 *   found outside the specified values will be auto-mapped to the next largest integer.
 * * `:res-dtype` - Datatype of result column. Defaults to `:float64`.
 */
public static Map fitCategorical(Object ds, Object cname, Object options) {
return (Map)fitCatFn.invoke(ds, cname, options);
}
/**
 * Fit an object->integer transformation. Integers will be assigned in random order. For
 * more control over the transform see the 3-arity version of the function.
 */
public static Map fitCategorical(Object ds, Object cname) {
return (Map)fitCatFn.invoke(ds, cname);
}
/**
 * Apply an object->integer transformation with data obtained from fitCategorical.
 */
public static Map transformCategorical(Object ds, Object catFitData) {
return (Map)transCatFn.invoke(ds, catFitData);
}
/**
 * Reverse a previously transformed categorical mapping.
 */
public static Map invertCategorical(Object ds, Object catFitData) {
return (Map)invCatFn.invoke(ds, catFitData);
}
/**
 * Fit a transformation from a single column of categorical values to a `one-hot` encoded
 * group of columns.
 *
 * Options:
 *
 * * `:table-args` - Either a sequence of vectors [col-val, idx] or a sorted sequence
 *   of column values where integers will be assigned as per the sorted sequence. Any values
 *   found outside the specified values will be auto-mapped to the next largest integer.
 * * `:res-dtype` - Datatype of result column. Defaults to `:float64`.
 *
 */
public static Map fitOneHot(Object ds, Object cname, Object options) {
return (Map)fitOneHotFn.invoke(ds, cname, options);
}
/**
 * Fit a mapping from a categorical column to a group of one-hot encoded columns.
 */
public static Map fitOneHot(Object ds, Object cname) {
return (Map)fitOneHotFn.invoke(ds, cname);
}
/**
 * Transform a dataset using a fitted one-hot mapping.
 */
public static Map transformOneHot(Object ds, Object fitData) {
return (Map)transOneHotFn.invoke(ds, fitData);
}
/**
 * Reverse a previously transformed one-hot mapping.
 */
public static Map invertOneHot(Object ds, Object fitData) {
return (Map)invOneHotFn.invoke(ds, fitData);
}
/**
 * Return a map of column to inversely sorted (greatest to least) sequence of tuples of
 * [column name, coefficient].
 *
 * Options:
 *
 * * `:correlation-type` One of `:pearson`, `:spearman`, or `:kendall`. Defaults to
 *   `:pearson`.
 */
public static Map correlationTable(Object ds, Object options) {
return (Map)corrTableFn.invoke(ds, options);
}
/**
 * Return a map of column to inversely sorted (greatest to least) sequence of tuples of
 * [column name, pearson correlation coefficient].
 */
public static Map correlationTable(Object ds) {
return (Map)corrTableFn.invoke(ds);
}
/**
 * Expand a dataset ensuring that the difference between two successive values is less than
 * `max-span`.
 *
 * @param maxSpan The maximum allowed span between successive values. For datetime types
 *        this is interpreted in millisecond or epoch-millisecond space.
 * @param missingStrategy Same missing strategy types from `TMD.replaceMissing`. May be a
 *        keyword or a vector of [strategy-keyword, value] for value-based strategies.
 */
public static Map fillRangeReplace(Object ds, Object cname, double maxSpan, Object missingStrategy) {
Keyword strat;
Object value = null;
if (isVector(missingStrategy)) {
strat = (Keyword)call(missingStrategy, 0);
value = call(missingStrategy, 1);
} else {
strat = (Keyword)missingStrategy;
}
return (Map) fillRangeReplaceFn.invoke(ds, cname, maxSpan, strat, value);
}
/**
 * Fit a PCA transformation on a dataset.
 *
 * @return map of `{:means, :eigenvalues, :eigenvectors}`.
 *
 * Options:
 *
 * * `:method` - either `:svd` or `:cov`. Use either SVD transformation or covariance-matrix
 *   based PCA. `:cov` method is somewhat slower but returns accurate variances and thus
 *   is the default.
 * * `:variance-amount` - Keep columns until variance is just less than variance-amount.
 *   Defaults to 0.95.
 * * `:n-components` - Return a fixed number of components. Overrides `:variance-amount`
 *   and returns a fixed number of components.
 * * `:covariance-bias` - When using `:cov` divide by `n-rows` if true and `n-rows - 1` if
 *   false. Defaults to false.
 */
public static Object fitPCA(Object ds, Object options) {
return fitPCAFn.invoke(ds, options);
}
/**
 * Fit a PCA transformation onto a dataset keeping 95% of the variance. See documentation
 * for 2-arity form.
 */
public static Object fitPCA(Object ds) {
return fitPCAFn.invoke(ds);
}
/**
 * Transform a dataset by the PCA fit data.
 */
public static Map transformPCA(Object ds, Object fitData) {
return (Map)transPCAFn.invoke(ds, fitData);
}
/**
 * Calculate per-column mean, stddev.
 *
 * Options:
 *
 * * `:mean?` - Produce per-column means. Defaults to true.
 * * `:stddev?` - Produce per-column standard deviation. Defaults to true.
 */
public static Object fitStdScale(Object ds) {
return fitStdScaleFn.invoke(ds);
}
/**
 * Transform dataset to mean of zero and a standard deviation of 1.
 */
public static Map transformStdScale(Object ds, Object fitData) {
return (Map)transStdScaleFn.invoke(ds, fitData);
}
/**
 * Fit a bias and scale to the dataset that transforms each column to a target min-max
 * value.
 *
 * Options:
 *
 * * `:min` - Target minimum value. Defaults to -0.5.
 * * `:max` - Target maximum value. Defaults to 0.5.
 */
public static Object fitMinMax(Object ds, Object options) {
return fitMinMaxFn.invoke(ds, options);
}
/**
 * Fit a minmax transformation that will transform each column to a minimum of -0.5 and
 * a maximum of 0.5.
 */
public static Object fitMinMax(Object ds) {
return fitMinMaxFn.invoke(ds);
}
/**
 * Transform a dataset using a previously fit min-max transformation.
 */
public static Map transformMinMax(Object ds, Object fitData) {
return (Map)transMinMaxFn.invoke(ds, fitData);
}
/**
 * Map a LOESS-interpolation transformation onto a dataset. This can be used
 * to, among other things, smooth out a column before graphing. For the meaning
 * of the options, see documentation on
 * org.apache.commons.math3.analysis.interpolation.LoessInterpolator.
 *
 * Option defaults have been chosen to map somewhat closely to the R defaults.
 *
 * Options:
 *
 * * `:bandwidth` - Defaults to 0.75.
 * * `:iterations` - Defaults to 4.
 * * `:accuracy` - Defaults to LoessInterpolator/DEFAULT_ACCURACY.
 * * `:result-name` - Result column name. Defaults to `yColname.toString + "-loess"`.
 */
public static Map interpolateLOESS(Object ds, Object xColname, Object yColname, Object options) {
return (Map)interpolateLOESSFn.invoke(ds, xColname, yColname, options);
}
/**
 * Perform a LOESS interpolation using the default parameters. For options see 4-arity
 * form of function.
 */
public static Map interpolateLOESS(Object ds, Object xColname, Object yColname) {
return (Map)interpolateLOESSFn.invoke(ds, xColname, yColname);
}
/**
 * Produce 2*k datasets from 1 dataset using the k-fold algorithm.
 * Returns k maps of the form `{:test-ds :train-ds}`.
 *
 * Options:
 *
 * * `:randomize-dataset?` - When true, shuffle dataset. Defaults to true.
 * * `:seed` - When randomizing dataset, seed may be either an integer or an implementation
 *   of `java.util.Random`.
 */
public static Iterable kFold(Object ds, long k, Object options) {
return (Iterable)kFoldFn.invoke(ds, k, options);
}
/**
 * Return k maps of the form `{:test-ds :train-ds}`. For options see 3-arity form.
 */
public static Iterable kFold(Object ds, long k) {
return (Iterable)kFoldFn.invoke(ds, k);
}
/**
 * Split the dataset returning a map of `{:train-ds :test-ds}`.
 *
 * Options:
 *
 * * `:randomize-dataset?` - Defaults to true.
 * * `:seed` - When provided must be an integer or an implementation of `java.util.Random`.
 * * `:train-fraction` - Fraction of dataset to use as training set. Defaults to 0.7.
 */
public static Map trainTestSplit(Object ds, Object options) {
return (Map)trainTestFn.invoke(ds, options);
}
/**
 * Randomize then split dataset using 70% of the data for training and the rest for testing.
 */
public static Map trainTestSplit(Object ds) {
return (Map)trainTestFn.invoke(ds);
}
/**
 * Set a column in the dataset as the inference target. This information is stored in the
 * column metadata. This function is short form for:
 *
 *```java
 * Object col = column(ds, cname);
 * return assoc(ds, cname, varyMeta(col, assocFn, kw("inference-target?"), true));
 *```
 */
public static Map setInferenceTarget(Object ds, Object cname) {
return (Map)setInfTargetFn.invoke(ds, cname);
}
/**
 * Find the inference column. If column was the result of a categorical mapping, reverse
 * that mapping. Return data in a form that can be efficiently converted to a Buffer.
 */
public static Object labels(Object ds) {
return labelsFn.invoke(ds);
}
/**
 * Given a dataset where the column names are labels and each row is a probability
 * distribution across the labels, produce a Buffer of labels taking the highest probability
 * for each row to choose the label.
 */
public static Object probabilityDistributionToLabels(Object ds) {
return probDistToLabel.invoke(ds);
}
/**
 * Return a map of val->idx for the inference target.
 */
public static Map inferenceTargetLabelMap(Object ds) {
return (Map)infTargetLabelMap.invoke(ds);
}
}
@@ -0,0 +1,348 @@
package tech.v3.dataset;
import static tech.v3.Clj.*;
import clojure.lang.IFn;
import java.util.Map;
/**
 * High speed grouping aggregations based on sequences of datasets.
 */
public class Reductions {
  private Reductions(){}
  static final IFn reducerFn = requiringResolve("tech.v3.dataset.reductions", "reducer");
  static final IFn sumFn = requiringResolve("tech.v3.dataset.reductions", "sum");
  static final IFn meanFn = requiringResolve("tech.v3.dataset.reductions", "mean");
  static final IFn rowCountFn = requiringResolve("tech.v3.dataset.reductions", "row-count");
  static final IFn distinctFn = requiringResolve("tech.v3.dataset.reductions", "distinct");
  // int32-specialized distinct returning a RoaringBitmap - used by distinctUInt32.
  static final IFn distinctInt32Fn = requiringResolve("tech.v3.dataset.reductions", "distinct-int32");
  static final IFn countDistinctFn = requiringResolve("tech.v3.dataset.reductions", "count-distinct");
  static final IFn reservoirDsFn = requiringResolve("tech.v3.dataset.reductions", "reservoir-dataset");
  static final IFn reservoirDescStatFn = requiringResolve("tech.v3.dataset.reductions", "reservoir-desc-stat");
  static final IFn probSetCardFn = requiringResolve("tech.v3.dataset.reductions.apache-data-sketch", "prob-set-cardinality");
  static final IFn probQuantilesFn = requiringResolve("tech.v3.dataset.reductions.apache-data-sketch", "prob-quantiles");
  static final IFn probQuantileFn = requiringResolve("tech.v3.dataset.reductions.apache-data-sketch", "prob-quantile");
  static final IFn probMedianFn = requiringResolve("tech.v3.dataset.reductions.apache-data-sketch", "prob-median");
  static final IFn probCdfsFn = requiringResolve("tech.v3.dataset.reductions.apache-data-sketch", "prob-cdfs");
  static final IFn probPmfsFn = requiringResolve("tech.v3.dataset.reductions.apache-data-sketch", "prob-pmfs");
  static final IFn probIQRangeFn = requiringResolve("tech.v3.dataset.reductions.apache-data-sketch", "prob-interquartile-range");
  static final IFn groupByColumnAggFn = requiringResolve("tech.v3.dataset.reductions", "group-by-column-agg");
  /**
   * Group a sequence of datasets by column or columns and in the process perform an aggregation.
   * The resulting dataset will have one row per grouped key. Columns used as keys will always
   * be represented in the result.
   *
   * @param dsSeq Sequence of datasets such as produced by rowMapcat, dsPmap, or loading many
   *        files.
   * @param colname Either a single column name or a vector of column names. These will be the
   *        grouping keys.
   * @param aggMap Map of result colname to reducer. Various reducers are provided or you can
   *        build your own via the `reducer` function.
   * @param options Options map. Described below. May be null.
   *
   * Options:
   *
   * * `:map-initial-capacity` - initial hashmap capacity. Resizing hash-maps is expensive
   *   so we would like to set this to something reasonable. Defaults to 100000.
   * * `:index-filter` - A function that given a dataset produces a function from long index
   *   to boolean. Only indexes for which the index-filter returns true will be added to the
   *   aggregation. For very large datasets, this is a bit faster than using filter before
   *   the aggregation.
   *
   * Example:
   *
   *```java
   * //Begin parallelized expansion
   *Iterable dsSeq = (Iterable)rowMapcat(srcds, tallyDays, hashmap(kw("result-type"), kw("as-seq")));
   *
   * //The first aggregation is to summarize by placement and simulation the year-month tallies.
   * //We are essentially replacing count with a summarized count. After this statement
   * //we can guarantee that the dataset has unique tuples of [simulation, placement, year-month]
   *Map initAgg = Reductions.groupByColumnsAgg(dsSeq, vector("simulation", "placement", "year-month"),
   *                                           hashmap("count", Reductions.sum("count")),
   *                                           null);
   *println(head(initAgg));
   * //["simulation" "placement" "year-month"]-aggregation [5 4]:
   *
   * //| simulation | placement | year-month | count |
   * //|-----------:|----------:|------------|------:|
   * //|          0 |         0 |    2020-12 | 622.0 |
   * //|          0 |         1 |    2020-12 | 591.0 |
   * //|          0 |         2 |    2020-12 | 500.0 |
   * //|          0 |         3 |    2020-12 | 549.0 |
   * //|          0 |         4 |    2020-12 | 595.0 |
   *
   * // The second aggregation allows us to build up statistics over each placement/year-month
   * // pair thus finding out the distribution of a given placement, year-month across simulations
   *Map result = Reductions.groupByColumnsAgg(vector(initAgg), vector("placement", "year-month"),
   *                                          hashmap("min-count", Reductions.probQuantile("count", 0.0),
   *                                                  "low-95-count", Reductions.probQuantile("count", 0.05),
   *                                                  "q1-count", Reductions.probQuantile("count", 0.25),
   *                                                  "median-count", Reductions.probQuantile("count", 0.5),
   *                                                  "q3-count", Reductions.probQuantile("count", 0.75),
   *                                                  "high-95-count", Reductions.probQuantile("count", 0.95),
   *                                                  "max-count", Reductions.probQuantile("count", 1.0),
   *                                                  "count", Reductions.sum("count")),
   *                                          null);
   * //Take a million row dataset, expand it, then perform two grouping aggregations.
   *println(head(result));
   * //["placement" "year-month"]-aggregation [5 10]:
   *
   * //| q3-count | median-count | min-count | high-95-count | placement | max-count |   count | low-95-count | q1-count | year-month |
   * //|---------:|-------------:|----------:|--------------:|----------:|----------:|--------:|-------------:|---------:|------------|
   * //|    646.0 |        593.0 |     366.0 |         716.0 |        36 |     809.0 | 58920.0 |        475.0 |    536.0 |    2020-12 |
   * //|    621.0 |        560.0 |     376.0 |         739.0 |        36 |     782.0 | 57107.0 |        459.0 |    512.0 |    2020-10 |
   * //|    168.0 |        139.0 |      25.0 |         211.0 |         0 |     246.0 | 13875.0 |         76.0 |    112.0 |    2021-01 |
   * //|    658.0 |        607.0 |     384.0 |         745.0 |         0 |     825.0 | 60848.0 |        486.0 |    561.0 |    2020-12 |
   * //|    628.0 |        581.0 |     422.0 |         693.0 |         0 |     802.0 | 58148.0 |        468.0 |    539.0 |    2020-11 |
   *```
   */
  public static Map groupByColumnsAgg(Iterable dsSeq, Object colname, Map aggMap, Map options) {
    // Clojure argument order is [colname agg-map options ds-seq].
    return (Map)groupByColumnAggFn.invoke(colname, aggMap, options, dsSeq);
  }
  /**
   * Create a custom reducer. perElemFn is passed the last return value as the first argument
   * followed by a value from each column as additional arguments. It must always return the
   * current context.
   *
   * This is an easy way to instantiate tech.v3.datatype.IndexReduction so if you really need
   * the best possible performance you need to implement three methods of IndexReduction:
   *
   * * `prepareBatch` - Passed each dataset before processing. Return value becomes first
   *   argument to `reduceIndex`.
   * * `reduceIndex` - Passed batchCtx, valCtx, and rowIdx. Must return an updated or
   *   new valCtx.
   * * `finalize` - Passed valCtx and must return the final per-row value expected in
   *   result dataset. The default is just to return valCtx.
   *
   * For `groupByColumnsAgg` you do not need to worry about reduceReductions - there is no
   * merge step.
   *
   * @param colname One or more column names. If multiple column names are specified then
   *        perElemFn will need to take additional arguments.
   * @param perElemFn A function that takes the previous context along with the current row's
   *        column values and returns a new context.
   * @param finalizeFn Optional function that performs a final calculation taking a context
   *        and returning a value.
   */
  public static Object reducer(Object colname, IFn perElemFn, IFn finalizeFn) {
    return reducerFn.invoke(colname, perElemFn, finalizeFn);
  }
  /**
   * Create a custom reducer. `perElemFn` is passed the last return value as the first
   * argument followed by a value from each column as additional arguments. It must always
   * return the current context.
   *
   * This is an easy way to instantiate tech.v3.datatype.IndexReduction so if you really need
   * the best possible performance you need to implement three methods of IndexReduction:
   *
   * * `prepareBatch` - Passed each dataset before processing. Return value becomes first
   *   argument to `reduceIndex`.
   * * `reduceIndex` - Passed batchCtx, valCtx, and rowIdx. Must return valCtx.
   * * `finalize` - Passed valCtx and must return the final per-row value expected in
   *   result dataset.
   *
   * For `groupByColumnsAgg` you do not need to worry about reduceReductions - there is no
   * merge step.
   *
   * @param colname One or more column names. If multiple column names are specified then
   *        perElemFn will need to take additional arguments.
   * @param perElemFn A function that takes the previous context along with the current row's
   *        column values and returns a new context.
   */
  public static Object reducer(Object colname, IFn perElemFn) {
    return reducerFn.invoke(colname, perElemFn);
  }
  /**
   * Returns a summation reducer that sums an individual source column.
   */
  public static Object sum(Object colname) {
    return sumFn.invoke(colname);
  }
  /**
   * Returns a mean reducer that produces a mean value of an individual source column.
   */
  public static Object mean(Object colname) {
    return meanFn.invoke(colname);
  }
  /**
   * Returns a rowCount reducer that returns the number of source rows aggregated.
   */
  public static Object rowCount(Object colname) {
    return rowCountFn.invoke(colname);
  }
  /**
   * Returns a distinct reducer that produces a set of distinct values.
   */
  public static Object distinct(Object colname) {
    return distinctFn.invoke(colname);
  }
  /**
   * Returns a distinct reducer that produces a roaringbitmap of distinct values. This is many
   * times faster than the distinct reducer if your data fits into unsigned int32 space.
   */
  public static Object distinctUInt32(Object colname) {
    // Bug fix: this previously delegated to the generic `distinct` reducer which ignores
    // the int32 specialization; use the dedicated distinct-int32 reducer instead.
    return distinctInt32Fn.invoke(colname);
  }
  /**
   * Returns a distinct reducer that returns the number of distinct elements.
   */
  public static Object setCardinality(Object colname) {
    return countDistinctFn.invoke(colname);
  }
  /**
   * Returns a distinct reducer that expects unsigned integer values and returns the number
   * of distinct elements. This is many times faster than the countDistinct function.
   */
  public static Object setCardinalityUint32(Object colname) {
    return countDistinctFn.invoke(colname, kw("int32"));
  }
  /**
   * Return a reducer that produces a probabilistically sampled dataset of at most nRows length.
   */
  public static Object reservoirDataset(long nRows) {
    return reservoirDsFn.invoke(nRows);
  }
  /**
   * Return a reducer which will probabilistically sample the source column producing at most
   * nRows and then call descriptiveStatistics on it with statName.
   *
   * Stat names are described in tech.v3.datatype.Statistics.descriptiveStats.
   */
  public static Object reservoirStats(Object colname, long nRows, Object statName) {
    return reservoirDescStatFn.invoke(colname, nRows, statName);
  }
  /**
   * Calculate a probabilistic set cardinality for a given column based on one of three
   * algorithms.
   *
   * Options:
   *
   * * `:datatype` - One of `#{:float64 :string}`. Unspecified defaults to `:float64`.
   * * `:algorithm` - defaults to :hyper-log-log. Further algorithm-specific options
   *   may be included in the options map.
   *
   * Algorithm specific options:
   *
   * * [:hyper-log-log](https://datasketches.apache.org/docs/HLL/HLL.html)
   *     * `:hll-lgk` - defaults to 12, this is log-base2 of k, so k = 4096. lgK can be
   *       from 4 to 21.
   *     * `:hll-type` - One of #{4,6,8}, defaults to 8. The HLL_4, HLL_6 and HLL_8
   *       represent different levels of compression of the final HLL array where the
   *       4, 6 and 8 refer to the number of bits each bucket of the HLL array is
   *       compressed down to. The HLL_4 is the most compressed but generally slightly
   *       slower than the other two, especially during union operations.
   * * [:theta](https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html)
   * * [:cpc](https://datasketches.apache.org/docs/CPC/CPC.html)
   *     * `:cpc-lgk` - Defaults to 10.
   */
  public static Object probSetCardinality(Object colname, Map options) {
    return probSetCardFn.invoke(colname, options);
  }
  /**
   * Probabilistic quantile estimation - see [DoublesSketch](https://datasketches.apache.org/api/java/snapshot/apidocs/index.html).
   *
   * @param quantiles Sequence of quantiles.
   * @param k Defaults to 128. This produces a normalized rank error of about 1.7%.
   */
  public static Object probQuantiles(Object colname, Object quantiles, long k) {
    return probQuantilesFn.invoke(colname, quantiles, k);
  }
  /**
   * Probabilistic quantile estimation using default k of 128.
   * See [DoublesSketch](https://datasketches.apache.org/api/java/snapshot/apidocs/index.html).
   *
   * @param quantiles Sequence of numbers from 0-1.
   */
  public static Object probQuantiles(Object colname, Object quantiles) {
    return probQuantilesFn.invoke(colname, quantiles);
  }
  /**
   * Probabilistic quantile estimation with an explicit sketch size k.
   * See [DoublesSketch](https://datasketches.apache.org/api/java/snapshot/apidocs/index.html).
   * Multiple quantile calculations on a single source column will be merged into a single quantile
   * calculation so it may be more convenient to use this function to produce multiple quantiles
   * mapped to several result columns as opposed to ending up with a single column of maps of quantile
   * to value.
   *
   * @param quantile Number from 0-1.
   * @param k Defaults to 128. This produces a normalized rank error of about 1.7%
   */
  public static Object probQuantile(Object colname, double quantile, long k) {
    // Bug fix: `k` was previously dropped, silently invoking the 2-arity default-k form.
    return probQuantileFn.invoke(colname, quantile, k);
  }
  /**
   * Probabilistic quantile estimation using default k of 128.
   * See [DoublesSketch](https://datasketches.apache.org/api/java/snapshot/apidocs/index.html).
   * Multiple quantiles will be merged into a single quantile calculation so it may be more
   * convenient to use this function to produce multiple quantiles mapped to several result
   * columns as opposed to ending up with a single column of maps of quantile to value.
   *
   * @param quantile Number from 0-1.
   */
  public static Object probQuantile(Object colname, double quantile) {
    return probQuantileFn.invoke(colname, quantile);
  }
  /**
   * Probabilistic median. See documentation for probQuantiles.
   */
  public static Object probMedian(Object colname, long k) {
    return probMedianFn.invoke(colname, k);
  }
  /**
   * Probabilistic median with default K of 128. See documentation for probQuantiles.
   */
  public static Object probMedian(Object colname) {
    return probMedianFn.invoke(colname);
  }
  /**
   * Probabilistic interquartile range. See documentation for probQuantile.
   */
  public static Object probInterquartileRange(Object colname, long k) {
    return probIQRangeFn.invoke(colname, k);
  }
  /**
   * Probabilistic interquartile range. See documentation for probQuantile.
   */
  public static Object probInterquartileRange(Object colname) {
    return probIQRangeFn.invoke(colname);
  }
  /**
   * Probabilistic CDF calculation, one for each double cdf passed in.
   * See documentation for probQuantiles.
   */
  public static Object probCDFS(Object colname, Object cdfs, long k) {
    return probCdfsFn.invoke(colname, cdfs, k);
  }
  /**
   * Probabilistic CDF calculation, one for each double cdf passed in.
   * See documentation for probQuantiles.
   */
  public static Object probCDFS(Object colname, Object cdfs) {
    return probCdfsFn.invoke(colname, cdfs);
  }
  /**
   * Returns an approximation to the Probability Mass Function (PMF) of the input stream
   * given a set of splitPoints (values). See [DoublesSketch](https://datasketches.apache.org/api/java/snapshot/apidocs/index.html).
   * See documentation for probQuantiles.
   *
   */
  public static Object probPMFS(Object colname, Object pmfs, long k) {
    return probPmfsFn.invoke(colname, pmfs, k);
  }
  /**
   * Returns an approximation to the Probability Mass Function (PMF) of the input stream
   * given a set of splitPoints (values). See [DoublesSketch](https://datasketches.apache.org/api/java/snapshot/apidocs/index.html).
   * See documentation for probQuantiles.
   *
   */
  public static Object probPMFS(Object colname, Object pmfs) {
    return probPmfsFn.invoke(colname, pmfs);
  }
}
@@ -0,0 +1,215 @@
package tech.v3.dataset;
import static tech.v3.Clj.*;
import clojure.lang.IFn;
import clojure.lang.Keyword;
import java.util.Map;
/**
* Fixed and variable length rolling windows. For variable rolling windows the dataset
* must already be sorted by the target column. Datetime support is provided in terms of
* provide specific units in which to perform the rolling operation such as the keyword
* `:days`.
*
*/
public class Rolling {
private Rolling(){}
static final IFn meanFn = requiringResolve("tech.v3.dataset.rolling", "mean");
static final IFn sumFn = requiringResolve("tech.v3.dataset.rolling", "sum");
static final IFn minFn = requiringResolve("tech.v3.dataset.rolling", "min");
static final IFn maxFn = requiringResolve("tech.v3.dataset.rolling", "max");
static final IFn varianceFn = requiringResolve("tech.v3.dataset.rolling", "variance");
static final IFn stddevFn = requiringResolve("tech.v3.dataset.rolling", "standard-deviation");
static final IFn nth = requiringResolve("tech.v3.dataset.rolling", "nth");
static final IFn firstFn = requiringResolve("tech.v3.dataset.rolling", "first");
static final IFn lastFn = requiringResolve("tech.v3.dataset.rolling", "last");
static final IFn rollingFn = requiringResolve("tech.v3.dataset.rolling", "rolling");
/**
* Fixed or variable rolling window reductions.
*
* @param windowSpec Window specification specifying the type of window, either a
* window over a fixed number of rows or a window based on a fixed logical
* quantitative difference i.e. three months or 10 milliseconds.
* @param reducerMap map of dest column name to reducer where reducer is a map with
* two keys, :column-name which is the input column to use and :reducer which is
* an IFn that receives each window of data as a buffer.
*
* Example:
*
*```java
* Map stocks = makeDataset("https://github.com/techascent/tech.ml.dataset/raw/master/test/data/stocks.csv");
*
* //Variable-sized windows require the source column to be sorted.
* stocks = sortByColumn(stocks, "date");
* Map variableWin = Rolling.rolling(stocks,
* Rolling.variableWindow("date", 3, kw("months")),
* hashmap("price-mean-3m", Rolling.mean("price"),
* "price-max-3m", Rolling.max("price"),
* "price-min-3m", Rolling.min("price")));
*println(head(variableWin, 10));
*https://github.com/techascent/tech.ml.dataset/raw/master/test/data/stocks.csv [10 6]:
* //| symbol | date | price | price-max-3m | price-mean-3m | price-min-3m |
* //|--------|------------|-------:|-------------:|--------------:|-------------:|
* //| AAPL | 2000-01-01 | 25.94 | 106.11 | 58.92500000 | 25.94 |
* //| IBM | 2000-01-01 | 100.52 | 106.11 | 61.92363636 | 28.66 |
* //| MSFT | 2000-01-01 | 39.81 | 106.11 | 58.06400000 | 28.66 |
* //| AMZN | 2000-01-01 | 64.56 | 106.11 | 60.09222222 | 28.66 |
* //| AAPL | 2000-02-01 | 28.66 | 106.11 | 57.56583333 | 28.37 |
* //| MSFT | 2000-02-01 | 36.35 | 106.11 | 60.19363636 | 28.37 |
* //| IBM | 2000-02-01 | 92.11 | 106.11 | 62.57800000 | 28.37 |
* //| AMZN | 2000-02-01 | 68.87 | 106.11 | 59.29666667 | 28.37 |
* //| AMZN | 2000-03-01 | 67.00 | 106.11 | 54.65583333 | 21.00 |
* //| MSFT | 2000-03-01 | 43.22 | 106.11 | 53.53363636 | 21.00 |
*
* //Fixed window...
*
* Object radians = VecMath.mul(2.0*Math.PI, VecMath.div(range(33), 32.0));
* Map sinds = makeDataset(hashmap("radians", radians, "sin", VecMath.sin(radians)));
* Map fixedWin = Rolling.rolling(sinds,
* Rolling.fixedWindow(4),
* hashmap("sin-roll-mean", Rolling.mean("sin"),
* "sin-roll-max", Rolling.max("sin"),
* "sin-roll-min", Rolling.min("sin")));
*println(head(fixedWin, 8));
* //_unnamed [8 5]:
* //| sin | radians | sin-roll-max | sin-roll-min | sin-roll-mean |
* //|-----------:|-----------:|-------------:|-------------:|--------------:|
* //| 0.00000000 | 0.00000000 | 0.19509032 | 0.00000000 | 0.04877258 |
* //| 0.19509032 | 0.19634954 | 0.38268343 | 0.00000000 | 0.14444344 |
* //| 0.38268343 | 0.39269908 | 0.55557023 | 0.00000000 | 0.28333600 |
* //| 0.55557023 | 0.58904862 | 0.70710678 | 0.19509032 | 0.46011269 |
* //| 0.70710678 | 0.78539816 | 0.83146961 | 0.38268343 | 0.61920751 |
* //| 0.83146961 | 0.98174770 | 0.92387953 | 0.55557023 | 0.75450654 |
* //| 0.92387953 | 1.17809725 | 0.98078528 | 0.70710678 | 0.86081030 |
* //| 0.98078528 | 1.37444679 | 1.00000000 | 0.83146961 | 0.93403361 |
*```
*/
public static Map rolling(Object ds, Map windowSpec, Map reducerMap) {
  // Delegate to the Clojure-side rolling implementation; the worked example
  // in the Javadoc above shows both variable and fixed window usage.
  Object result = rollingFn.invoke(ds, windowSpec, reducerMap);
  return (Map)result;
}
/**
 * Build a variable-window specification keyed off a particular column, with a
 * double window size. This specification will not work on datetime columns —
 * use the {@code datetimeUnit} overload for those.
 */
public static Map variableWindow(Object colname, double windowSize) {
  Object typeKey = kw("window-type");
  Object colKey  = kw("column-name");
  Object sizeKey = kw("window-size");
  return hashmap(typeKey, kw("variable"),
                 colKey, colname,
                 sizeKey, windowSize);
}
/**
 * Build a variable-window specification with a double window size for a
 * particular column plus a comparison function. The compFn must accept 2
 * arguments, passed as (later, earlier), and return a double — so the basic
 * clojure '-' operator works fine in many cases.
 *
 * NOTE(review): if the third argument is statically typed as
 * {@code clojure.lang.Keyword}, Java overload resolution picks the more
 * specific {@code datetimeUnit} overload instead of this one.
 */
public static Map variableWindow(Object colname, double windowSize, Object compFn) {
  Map spec = hashmap(kw("window-type"), kw("variable"),
                     kw("column-name"), colname,
                     kw("window-size"), windowSize,
                     kw("comp-fn"), compFn);
  return spec;
}
/**
 * Build a datetime-specific variable-window specification with a double
 * window size for a particular column.
 *
 * @param datetimeUnit One of `[:milliseconds, :seconds, :hours, :days, :months]`.
 */
public static Map variableWindow(Object colname, double windowSize, Keyword datetimeUnit) {
  Object typeKey = kw("window-type");
  Object colKey  = kw("column-name");
  Object sizeKey = kw("window-size");
  Object unitKey = kw("units");
  return hashmap(typeKey, kw("variable"),
                 colKey, colname,
                 sizeKey, windowSize,
                 unitKey, datetimeUnit);
}
/**
 * Build a fixed-size rolling-window specification; the window spans exactly
 * `window-size` rows.
 */
public static Map fixedWindow(long windowSize) {
  Map spec = hashmap(kw("window-type"), kw("fixed"),
                     kw("window-size"), windowSize);
  return spec;
}
/**
 * Build a fixed-size rolling-window specification; the window spans exactly
 * `window-size` rows.
 *
 * @param winPos One of `[:left :center :right]`. This combined with the default
 * edge mode of `:clamp` dictates the windows of data the reducer sees.
 */
public static Map fixedWindow(long windowSize, Keyword winPos) {
  Object typeKey = kw("window-type");
  Object sizeKey = kw("window-size");
  Object posKey  = kw("relative-window-position");
  return hashmap(typeKey, kw("fixed"),
                 sizeKey, windowSize,
                 posKey, winPos);
}
/**
 * Build a fixed-size rolling-window specification; the window spans exactly
 * `window-size` rows.
 *
 * @param winPos One of `[:left :center :right]`. This combined with the edge
 * mode dictates the windows of data the reducer sees.
 *
 * @param edgeMode One of `[:zero, null, :clamp]`. Clamp means repeat the end value.
 */
public static Map fixedWindow(long windowSize, Keyword winPos, Keyword edgeMode) {
  Map spec = hashmap(kw("window-type"), kw("fixed"),
                     kw("window-size"), windowSize,
                     kw("relative-window-position"), winPos,
                     kw("edge-mode"), edgeMode);
  return spec;
}
/**
 * Build a columnwise reducer specification. The reducer receives sub-windows
 * from the column and must return a scalar value. When srcColname is a vector
 * of column names, reduceFn is passed each column window as a separate
 * argument.
 *
 * @param datatype Optional datatype; may be nil, in which case the dataset
 * scans the result to infer the datatype. When provided it is enforced as the
 * result column datatype. Reductions to fixed numeric datatypes are slightly
 * faster than generic reductions that must infer the final datatype.
 */
public static Map reducer(Object srcColname, IFn reduceFn, Keyword datatype) {
  Object colKey = kw("column-name");
  Object redKey = kw("reducer");
  Object dtKey  = kw("datatype");
  return hashmap(colKey, srcColname,
                 redKey, reduceFn,
                 dtKey, datatype);
}
/**
 * Build a columnwise reducer specification without a datatype (the result
 * datatype is inferred). See the 3-arity form for full documentation.
 */
public static Map reducer(Object srcColname, IFn reduceFn) {
  Map spec = hashmap(kw("column-name"), srcColname,
                     kw("reducer"), reduceFn);
  return spec;
}
/** Reducer specification computing the mean of each window of {@code colname}. */
public static Map mean(Object colname) {
  Object spec = meanFn.invoke(colname);
  return (Map)spec;
}
/** Reducer specification computing the sum of each window of {@code colname}. */
public static Map sum(Object colname) {
  Object spec = sumFn.invoke(colname);
  return (Map)spec;
}
/** Reducer specification computing the minimum of each window of {@code colname}. */
public static Map min(Object colname) {
  Object spec = minFn.invoke(colname);
  return (Map)spec;
}
/** Reducer specification computing the maximum of each window of {@code colname}. */
public static Map max(Object colname) {
  Object spec = maxFn.invoke(colname);
  return (Map)spec;
}
/** Reducer specification computing the standard deviation of each window of {@code colname}. */
public static Map stddev(Object colname) {
  Object spec = stddevFn.invoke(colname);
  return (Map)spec;
}
/** Reducer specification computing the variance of each window of {@code colname}. */
public static Map variance(Object colname) {
  Object spec = varianceFn.invoke(colname);
  return (Map)spec;
}
/** Reducer specification keeping the first value of each window of {@code colname}. */
public static Map first(Object colname) {
  Object spec = firstFn.invoke(colname);
  return (Map)spec;
}
/** Reducer specification keeping the last value of each window of {@code colname}. */
public static Map last(Object colname) {
  Object spec = lastFn.invoke(colname);
  return (Map)spec;
}
}