df-research/tech.ml.dataset/java_public_api/tech/v3/dataset/Modelling.java

package tech.v3.dataset;


import static tech.v3.Clj.*;
import clojure.lang.IFn;
import clojure.lang.Keyword;
import java.util.Map;


/**
 * Functions related to training and evaluating ML models.  The functions fall into a few
 * groups: categorical and one-hot encodings, math transformations (correlation, PCA,
 * scaling, LOESS), and train/test dataset helpers.
 *
 * For the purpose of this system, categorical data means a column of data that is not numeric.
 * It could be strings, keywords, or arbitrary objects.
 *
 * Minimal example extra dependencies for PCA:
 *
 *```console
 * [uncomplicate/neanderthal "0.43.3"]
 *```
 *
 * It is also important to note that you can serialize the fit results to nippy automatically
 * as included in dtype-next are extensions to nippy that work with tensors.
 */
public class Modelling {
  private Modelling(){}

  static final IFn fitCatFn = requiringResolve("tech.v3.dataset.categorical", "fit-categorical-map");
  static final IFn transCatFn = requiringResolve("tech.v3.dataset.categorical", "transform-categorical-map");
  static final IFn invCatFn = requiringResolve("tech.v3.dataset.categorical", "invert-categorical-map");
  static final IFn fitOneHotFn = requiringResolve("tech.v3.dataset.categorical", "fit-one-hot");
  static final IFn transOneHotFn = requiringResolve("tech.v3.dataset.categorical", "transform-one-hot");
  static final IFn invOneHotFn = requiringResolve("tech.v3.dataset.categorical", "invert-one-hot-map");

  static final IFn corrTableFn = requiringResolve("tech.v3.dataset.math", "correlation-table");
  static final IFn fillRangeReplaceFn = requiringResolve("tech.v3.dataset.math", "fill-range-replace");
  static final IFn fitPCAFn = requiringResolve("tech.v3.dataset.math", "fit-pca");
  static final IFn fitStdScaleFn = requiringResolve("tech.v3.dataset.math", "fit-std-scale");
  static final IFn fitMinMaxFn = requiringResolve("tech.v3.dataset.math", "fit-minmax");
  static final IFn transPCAFn = requiringResolve("tech.v3.dataset.math", "transform-pca");
  static final IFn transStdScaleFn = requiringResolve("tech.v3.dataset.math", "transform-std-scale");
  static final IFn transMinMaxFn = requiringResolve("tech.v3.dataset.math", "transform-minmax");
  static final IFn interpolateLOESSFn = requiringResolve("tech.v3.dataset.math", "interpolate-loess");

  static final IFn kFoldFn = requiringResolve("tech.v3.dataset.modelling", "k-fold-datasets");
  static final IFn trainTestFn = requiringResolve("tech.v3.dataset.modelling", "train-test-split");
  static final IFn setInfTargetFn = requiringResolve("tech.v3.dataset.modelling", "set-inference-target");
  static final IFn labelsFn = requiringResolve("tech.v3.dataset.modelling", "labels");
  static final IFn probDistToLabel = requiringResolve("tech.v3.dataset.modelling", "probability-distributions->label-column");
  static final IFn infTargetLabelMap = requiringResolve("tech.v3.dataset.modelling", "inference-target-label-map");


  /**
   * Fit an object->integer transform that takes each value and assigns an integer to it.  The
   * returned value can be used in transformCategorical to transform the dataset.
   *
   * Options:
   *
   * * `:table-args` - Either a sequence of vectors [col-val, idx] or a sorted sequence
   *   of column values where integers will be assigned as per the sorted sequence.  Any values
   *   found outside the specified values will be auto-mapped to the next largest integer.
   * * `:res-dtype` - Datatype of result column.  Defaults to `:float64`.
   */
  public static Map fitCategorical(Object ds, Object cname, Object options) {
    return (Map)fitCatFn.invoke(ds, cname, options);
  }
  /**
   * Fit an object->integer transformation.  Integers will be assigned in random order.  For
   * more control over the transform see the 3-arity version of the function.
   */
  public static Map fitCategorical(Object ds, Object cname) {
    return (Map)fitCatFn.invoke(ds, cname);
  }
  /**
   * Apply an object->integer transformation with data obtained from fitCategorical.
   */
  public static Map transformCategorical(Object ds, Object catFitData) {
    return (Map)transCatFn.invoke(ds, catFitData);
  }
  /**
   * Reverse a previously transformed categorical mapping.
   */
  public static Map invertCategorical(Object ds, Object catFitData) {
    return (Map)invCatFn.invoke(ds, catFitData);
  }
  /**
   * Fit a transformation from a single column of categorical values to a `one-hot` encoded
   * group of columns.
   *
   * Options:
   *
   * * `:table-args` - Either a sequence of vectors [col-val, idx] or a sorted sequence
   *   of column values where integers will be assigned as per the sorted sequence.  Any values
   *   found outside the specified values will be auto-mapped to the next largest integer.
   * * `:res-dtype` - Datatype of result column.  Defaults to `:float64`.
   */
  public static Map fitOneHot(Object ds, Object cname, Object options) {
    return (Map)fitOneHotFn.invoke(ds, cname, options);
  }
  /**
   * Fit a mapping from a categorical column to a group of one-hot encoded columns.
   */
  public static Map fitOneHot(Object ds, Object cname) {
    return (Map)fitOneHotFn.invoke(ds, cname);
  }
  /**
   * Transform a dataset using a fitted one-hot mapping.
   */
  public static Map transformOneHot(Object ds, Object fitData) {
    return (Map)transOneHotFn.invoke(ds, fitData);
  }
  /**
   * Reverse a previously transformed one-hot mapping.
   */
  public static Map invertOneHot(Object ds, Object fitData) {
    return (Map)invOneHotFn.invoke(ds, fitData);
  }
  /**
   * Return a map of column to inversely sorted from greatest to least sequence of tuples of
   * column name, coefficient.
   *
   * Options:
   *
   * * `:correlation-type` One of `:pearson`, `:spearman`, or `:kendall`.  Defaults to
   *   `:pearson`.
   */
  public static Map correlationTable(Object ds, Object options) {
    return (Map)corrTableFn.invoke(ds, options);
  }
  /**
   * Return a map of column to inversely sorted from greatest to least sequence of tuples of
   * column name, pearson correlation coefficient.
   */
  public static Map correlationTable(Object ds) {
    return (Map)corrTableFn.invoke(ds);
  }
  /**
   * Expand a dataset ensuring that the difference between two successive values is less than
   * `max-span`.
   *
   * @param ds The dataset to expand.
   * @param cname Name of the column whose successive values are compared.
   * @param maxSpan The maximum allowed span between successive values.  For datetime types
   * this is interpreted in millisecond or epoch-millisecond space.
   * @param missingStrategy Same missing strategy types from `TMD.replaceMissing`.  Either a
   * strategy keyword or a vector of `[strategy value]`; in the vector form the first element
   * must be a keyword and the second element is passed along as the replacement value.
   */
  public static Map fillRangeReplace(Object ds, Object cname, double maxSpan, Object missingStrategy) {
    Keyword strat;
    Object value = null;
    if (isVector(missingStrategy)) {
      // Vector form is [strategy value]: split it into the two trailing arguments.
      strat = (Keyword)call(missingStrategy, 0);
      value = call(missingStrategy, 1);
    } else {
      // Bare keyword form: no explicit replacement value.
      strat = (Keyword)missingStrategy;
    }
    return (Map)fillRangeReplaceFn.invoke(ds, cname, maxSpan, strat, value);
  }
  /**
   * Fit a PCA transformation on a dataset.
   *
   * Options:
   *
   * * `:method` - either `:svd` or `:cov`.  Use either SVD transformation or covariance-matrix
   *   based PCA.  `:cov` method is somewhat slower but returns accurate variances and thus
   *   is the default.
   * * `:variance-amount` - Keep columns until variance is just less than variance-amount.
   *   Defaults to 0.95.
   * * `:n-components` - Return a fixed number of components.  Overrides `:variance-amount`
   *   and returns a fixed number of components.
   * * `:covariance-bias` - When using `:cov` divide by `n-rows` if true and `n-rows - 1` if
   *   false.  Defaults to false.
   *
   * @return map of `{:means, :eigenvalues, :eigenvectors}`.
   */
  public static Object fitPCA(Object ds, Object options) {
    return fitPCAFn.invoke(ds, options);
  }
  /**
   * Fit a PCA transformation onto a dataset keeping 95% of the variance.  See documentation
   * for 2-arity form.
   */
  public static Object fitPCA(Object ds) {
    return fitPCAFn.invoke(ds);
  }
  /**
   * Transform a dataset by the PCA fit data.
   */
  public static Map transformPCA(Object ds, Object fitData) {
    return (Map)transPCAFn.invoke(ds, fitData);
  }
  /**
   * Calculate per-column mean, stddev.
   *
   * Options:
   *
   * * `:mean?` - Produce per-column means.  Defaults to true.
   * * `:stddev?` - Produce per-column standard deviation.  Defaults to true.
   */
  public static Object fitStdScale(Object ds, Object options) {
    return fitStdScaleFn.invoke(ds, options);
  }
  /**
   * Calculate per-column mean, stddev using the default options.  For options see the
   * 2-arity form of the function.
   */
  public static Object fitStdScale(Object ds) {
    return fitStdScaleFn.invoke(ds);
  }
  /**
   * Transform dataset to mean of zero and a standard deviation of 1.
   */
  public static Map transformStdScale(Object ds, Object fitData) {
    return (Map)transStdScaleFn.invoke(ds, fitData);
  }
  /**
   * Fit a bias and scale the dataset that transforms each column to a target min-max
   * value.
   *
   * Options:
   *
   * * `:min` - Target minimum value.  Defaults to -0.5.
   * * `:max` - Target maximum value.  Defaults to 0.5.
   */
  public static Object fitMinMax(Object ds, Object options) {
    return fitMinMaxFn.invoke(ds, options);
  }
  /**
   * Fit a minmax transformation that will transform each column to a minimum of -0.5 and
   * a maximum of 0.5.
   */
  public static Object fitMinMax(Object ds) {
    return fitMinMaxFn.invoke(ds);
  }
  /**
   * Transform a dataset using a previously fit minmax transformation.
   */
  public static Map transformMinMax(Object ds, Object fitData) {
    return (Map)transMinMaxFn.invoke(ds, fitData);
  }
  /**
   * Map a LOESS-interpolation transformation onto a dataset.  This can be used
   * to, among other things, smooth out a column before graphing.  For the meaning
   * of the options, see documentation on the
   * org.apache.commons.math3.analysis.interpolation.LoessInterpolator.
   *
   * Option defaults have been chosen to map somewhat closely to the R defaults.
   *
   * Options:
   *
   * * `:bandwidth` - Defaults to 0.75.
   * * `:iterations` - Defaults to 4.
   * * `:accuracy` - Defaults to LoessInterpolator/DEFAULT_ACCURACY.
   * * `:result-name` - Result column name.  Defaults to `yColname.toString + "-loess"`.
   */
  public static Map interpolateLOESS(Object ds, Object xColname, Object yColname, Object options) {
    return (Map)interpolateLOESSFn.invoke(ds, xColname, yColname, options);
  }
  /**
   * Perform a LOESS interpolation using the default parameters.  For options see 4-arity
   * form of function.
   */
  public static Map interpolateLOESS(Object ds, Object xColname, Object yColname) {
    return (Map)interpolateLOESSFn.invoke(ds, xColname, yColname);
  }
  /**
   * Produce 2*k datasets from 1 dataset using k-fold algorithm.
   * Returns k maps of the form `{:test-ds :train-ds}`.
   *
   * Options:
   *
   * * `:randomize-dataset?` - When true, shuffle dataset.  Defaults to true.
   * * `:seed` - When randomizing dataset, seed may be either an integer or an implementation
   *   of `java.util.Random`.
   */
  public static Iterable kFold(Object ds, long k, Object options) {
    return (Iterable)kFoldFn.invoke(ds, k, options);
  }
  /**
   * Return k maps of the form `{:test-ds :train-ds}`.  For options see 3-arity form.
   */
  public static Iterable kFold(Object ds, long k) {
    return (Iterable)kFoldFn.invoke(ds, k);
  }
  /**
   * Split the dataset returning a map of `{:train-ds :test-ds}`.
   *
   * Options:
   *
   * * `:randomize-dataset?` - Defaults to true.
   * * `:seed` - When provided must be an integer or an implementation of `java.util.Random`.
   * * `:train-fraction` - Fraction of dataset to use as training set.  Defaults to 0.7.
   */
  public static Map trainTestSplit(Object ds, Object options) {
    return (Map)trainTestFn.invoke(ds, options);
  }
  /**
   * Randomize then split dataset using 70% of the data for training and the rest for testing.
   */
  public static Map trainTestSplit(Object ds) {
    return (Map)trainTestFn.invoke(ds);
  }
  /**
   * Set a column in the dataset as the inference target.  This information is stored in the
   * column metadata.  This function is short form for:
   *
   *```java
   *  Object col = column(ds, cname);
   *  return assoc(ds, cname, varyMeta(col, assocFn, kw("inference-target?"), true));
   *```
   */
  public static Map setInferenceTarget(Object ds, Object cname) {
    return (Map)setInfTargetFn.invoke(ds, cname);
  }
  /**
   * Find the inference column.  If column was the result of a categorical mapping, reverse
   * that mapping.  Return data in a form that can be efficiently converted to a Buffer.
   */
  public static Object labels(Object ds) {
    return labelsFn.invoke(ds);
  }
  /**
   * Given a dataset where the column names are labels and each row is a probability
   * distribution across the labels, produce a Buffer of labels taking the highest probability
   * for each row to choose the label.
   */
  public static Object probabilityDistributionToLabels(Object ds) {
    return probDistToLabel.invoke(ds);
  }
  /**
   * Return a map of val->idx for the inference target.
   */
  public static Map inferenceTargetLabelMap(Object ds) {
    return (Map)infTargetLabelMap.invoke(ds);
  }
}