330 lines
13 KiB
Java
Vendored
330 lines
13 KiB
Java
Vendored
package tech.v3.dataset;
|
|
|
|
|
|
import static tech.v3.Clj.*;
|
|
import clojure.lang.IFn;
|
|
import clojure.lang.Keyword;
|
|
import java.util.Map;
|
|
|
|
|
|
/**
|
|
* Functions related to training and evaluating ML models. The functions are grouped into
|
|
* a few groups.
|
|
*
|
|
* For the purpose of this system, categorical data means a column of data that is not numeric.
|
|
* it could be strings, keywords, or arbitrary objects.
|
|
*
|
|
* Minimal example extra dependencies for PCA:
|
|
*
|
|
*```console
|
|
* [uncomplicate/neanderthal "0.43.3"]
|
|
*```
|
|
*
|
|
* It is also important to note that you can serialize the fit results to nippy automatically
|
|
* as included in dtype-next are extensions to nippy that work with tensors.
|
|
*/
|
|
public class Modelling {
|
|
private Modelling(){}
|
|
|
|
static final IFn fitCatFn = requiringResolve("tech.v3.dataset.categorical", "fit-categorical-map");
|
|
static final IFn transCatFn = requiringResolve("tech.v3.dataset.categorical", "transform-categorical-map");
|
|
static final IFn invCatFn = requiringResolve("tech.v3.dataset.categorical", "invert-categorical-map");
|
|
static final IFn fitOneHotFn = requiringResolve("tech.v3.dataset.categorical", "fit-one-hot");
|
|
static final IFn transOneHotFn = requiringResolve("tech.v3.dataset.categorical", "transform-one-hot");
|
|
static final IFn invOneHotFn = requiringResolve("tech.v3.dataset.categorical", "invert-one-hot-map");
|
|
|
|
static final IFn corrTableFn = requiringResolve("tech.v3.dataset.math", "correlation-table");
|
|
static final IFn fillRangeReplaceFn = requiringResolve("tech.v3.dataset.math", "fill-range-replace");
|
|
static final IFn fitPCAFn = requiringResolve("tech.v3.dataset.math", "fit-pca");
|
|
static final IFn fitStdScaleFn = requiringResolve("tech.v3.dataset.math", "fit-std-scale");
|
|
static final IFn fitMinMaxFn = requiringResolve("tech.v3.dataset.math", "fit-minmax");
|
|
static final IFn transPCAFn = requiringResolve("tech.v3.dataset.math", "transform-pca");
|
|
static final IFn transStdScaleFn = requiringResolve("tech.v3.dataset.math", "transform-std-scale");
|
|
static final IFn transMinMaxFn = requiringResolve("tech.v3.dataset.math", "transform-minmax");
|
|
static final IFn interpolateLOESSFn = requiringResolve("tech.v3.dataset.math", "interpolate-loess");
|
|
|
|
static final IFn kFoldFn = requiringResolve("tech.v3.dataset.modelling", "k-fold-datasets");
|
|
static final IFn trainTestFn = requiringResolve("tech.v3.dataset.modelling", "train-test-split");
|
|
static final IFn setInfTargetFn = requiringResolve("tech.v3.dataset.modelling", "set-inference-target");
|
|
static final IFn labelsFn = requiringResolve("tech.v3.dataset.modelling", "labels");
|
|
static final IFn probDistToLabel = requiringResolve("tech.v3.dataset.modelling", "probability-distributions->label-column");
|
|
static final IFn infTargetLabelMap = requiringResolve("tech.v3.dataset.modelling", "inference-target-label-map");
|
|
|
|
|
|
/**
|
|
* Fit an object->integer transform that takes each value and assigned an integer to it. The
|
|
* returned value can be used in transformCategorical to transform the dataset.
|
|
*
|
|
* Options:
|
|
*
|
|
* * `:table-args` - Either a sequence of vectors [col-val, idx] or a sorted sequence
|
|
* of column values where integers will be assigned as per the sorted sequence. Any values
|
|
* found outside the the specified values will be auto-mapped to the next largest integer.
|
|
* * `:res-dtype` - Datatype of result column. Defaults to `:float64`.
|
|
*/
|
|
public static Map fitCategorical(Object ds, Object cname, Object options) {
|
|
return (Map)fitCatFn.invoke(ds, cname, options);
|
|
}
|
|
/**
|
|
* Fit an object->integer transformation. Integers will be assigned in random order. For
|
|
* more control over the transform see the 3-arity version of the function.
|
|
*/
|
|
public static Map fitCategorical(Object ds, Object cname) {
|
|
return (Map)fitCatFn.invoke(ds, cname);
|
|
}
|
|
/**
|
|
* Apply an object->integer transformation with data obtained from fitCategorical.
|
|
*/
|
|
public static Map transformCategorical(Object ds, Object catFitData) {
|
|
return (Map)transCatFn.invoke(ds, catFitData);
|
|
}
|
|
/**
|
|
* Reverse a previously transformed categorical mapping.
|
|
*/
|
|
public static Map invertCategorical(Object ds, Object catFitData) {
|
|
return (Map)invCatFn.invoke(ds, catFitData);
|
|
}
|
|
/**
|
|
* Fit a transformation from a single column of categorical values to a `one-hot` encoded
|
|
* group of columns.
|
|
* .
|
|
*
|
|
* Options:
|
|
*
|
|
* * `:table-args` - Either a sequence of vectors [col-val, idx] or a sorted sequence
|
|
* of column values where integers will be assigned as per the sorted sequence. Any values
|
|
* found outside the the specified values will be auto-mapped to the next largest integer.
|
|
* * `:res-dtype` - Datatype of result column. Defaults to `:float64`.
|
|
*
|
|
*/
|
|
public static Map fitOneHot(Object ds, Object cname, Object options) {
|
|
return (Map)fitOneHotFn.invoke(ds, cname, options);
|
|
}
|
|
/**
|
|
* Fit a mapping from a categorical column to a group of one-hot encoded columns.
|
|
*/
|
|
public static Map fitOneHot(Object ds, Object cname) {
|
|
return (Map)fitOneHotFn.invoke(ds, cname);
|
|
}
|
|
/**
|
|
* Transform a dataset using a fitted one-hot mapping.
|
|
*/
|
|
public static Map transformOneHot(Object ds, Object fitData) {
|
|
return (Map)transOneHotFn.invoke(ds, fitData);
|
|
}
|
|
/**
|
|
* Reverse a previously transformed one-hot mapping.
|
|
*/
|
|
public static Map invertOneHot(Object ds, Object fitData) {
|
|
return (Map)invOneHotFn.invoke(ds, fitData);
|
|
}
|
|
/**
|
|
* Return a map of column to inversely sorted from greatest to least sequence of tuples of
|
|
* column name, coefficient.
|
|
*
|
|
* Options:
|
|
*
|
|
* * `:correlation-type` One of `:pearson`, `:spearman`, or `:kendall`. Defaults to
|
|
* `:pearson`.
|
|
*/
|
|
public static Map correlationTable(Object ds, Object options) {
|
|
return (Map)corrTableFn.invoke(ds, options);
|
|
}
|
|
/**
|
|
* Return a map of column to inversely sorted from greatest to least sequence of tuples of
|
|
* column name, pearson correlation coefficient.
|
|
*/
|
|
public static Map correlationTable(Object ds) {
|
|
return (Map)corrTableFn.invoke(ds);
|
|
}
|
|
/**
|
|
* Expand a dataset ensuring that the difference between two successive values is less than
|
|
* `max-span`.
|
|
*
|
|
* @param maxSpan The minimal span value. For datetime types this is interpreted in
|
|
* millisecond or epoch-millisecond space.
|
|
* @param missingStrategy Same missing strategy types from `TMD.replaceMissing`.
|
|
*/
|
|
public static Map fillRangeReplace(Object ds, Object cname, double maxSpan, Object missingStrategy) {
|
|
Keyword strat;
|
|
Object value = null;
|
|
if (isVector(missingStrategy)) {
|
|
strat = (Keyword)call(missingStrategy, 0);
|
|
value = call(missingStrategy, 1);
|
|
} else {
|
|
strat = (Keyword)missingStrategy;
|
|
}
|
|
return (Map) fillRangeReplaceFn.invoke(ds, cname, maxSpan, strat, value);
|
|
}
|
|
/**
|
|
* Fit a PCA transformation on a dataset.
|
|
*
|
|
* @return map of `{:means, :eigenvalues, :eigenvectors}`.
|
|
*
|
|
* Options:
|
|
*
|
|
* * `:method` - either `:svd` or `:cov`. Use either SVD transformation or covariance-matrix
|
|
* base PCA. `:cov` method is somewhat slower but returns accurate variances and thus
|
|
* is the default.
|
|
* * `:variance-amount` - Keep columns until variance is just less than variance-amount.
|
|
* Defaults to 0.95.
|
|
* * `:n-components` - Return a fixed number of components. Overrides `:variance-amount`
|
|
* an returns a fixed number of components.
|
|
* * `:covariance-bias` - When using `:cov` divide by `n-rows` if true and `n-rows - 1` if
|
|
* false. Defaults to false.
|
|
*/
|
|
public static Object fitPCA(Object ds, Object options) {
|
|
return fitPCAFn.invoke(ds, options);
|
|
}
|
|
/**
|
|
* Fit a PCA transformation onto a dataset keeping 95% of the variance. See documentation
|
|
* for 2-arity form.
|
|
*/
|
|
public static Object fitPCA(Object ds) {
|
|
return fitPCAFn.invoke(ds);
|
|
}
|
|
/**
|
|
* Transform a dataset by the PCA fit data.
|
|
*/
|
|
public static Map transformPCA(Object ds, Object fitData) {
|
|
return (Map)transPCAFn.invoke(ds, fitData);
|
|
}
|
|
/**
|
|
* Calculate per-column mean, stddev.
|
|
*
|
|
* Options:
|
|
*
|
|
* * `:mean?` - Produce per-column means. Defaults to true.
|
|
* * `:stddev?` - Produce per-column standard deviation. Defaults to true.
|
|
*/
|
|
public static Object fitStdScale(Object ds) {
|
|
return fitStdScaleFn.invoke(ds);
|
|
}
|
|
/**
|
|
* Transform dataset to mean of zero and a standard deviation of 1.
|
|
*/
|
|
public static Map transformStdScale(Object ds, Object fitData) {
|
|
return (Map)transStdScaleFn.invoke(ds, fitData);
|
|
}
|
|
/**
|
|
* Fit a bias and scale the dataset that transforms each colum to a target min-max
|
|
* value.
|
|
*
|
|
* Options:
|
|
*
|
|
* * `:min` - Target minimum value. Defaults it -0.5.
|
|
* * `:max` - Target maximum value. Defaults to 0.5.
|
|
*/
|
|
public static Object fitMinMax(Object ds, Object options) {
|
|
return fitMinMaxFn.invoke(ds, options);
|
|
}
|
|
/**
|
|
* Fit a minmax transformation that will transform each column to a minimum of -0.5 and
|
|
* a maximum of 0.5.
|
|
*/
|
|
public static Object fitMinMax(Object ds) {
|
|
return fitMinMaxFn.invoke(ds);
|
|
}
|
|
/**
|
|
* Transform a dataset using a previously fit minimax transformation.
|
|
*/
|
|
public static Map transformMinMax(Object ds, Object fitData) {
|
|
return (Map)transMinMaxFn.invoke(ds, fitData);
|
|
}
|
|
/**
|
|
* Map a LOESS-interpolation transformation onto a dataset. This can be used
|
|
* to, among other things, smooth out a column before graphing. For the meaning
|
|
* of the options, see documentation on the
|
|
* org.apache.commons.math3.analysis.interpolationLoessInterpolator.
|
|
*
|
|
* Option defaults have been chosen to map somewhat closely to the R defaults.
|
|
*
|
|
* Options:
|
|
*
|
|
* * `:bandwidth` - Defaults to 0.75.
|
|
* * `:iterations` - Defaults to 4.
|
|
* * `:accuracy` - Defaults to LoessInterpolator/DEFAULT_ACCURACY.
|
|
* * `:result-name` - Result column name. Defaults to `yColname.toString + "-loess"`.
|
|
*/
|
|
public static Map interpolateLOESS(Object ds, Object xColname, Object yColname, Object options) {
|
|
return (Map)interpolateLOESSFn.invoke(ds, xColname, yColname, options);
|
|
}
|
|
/**
|
|
* Perform a LOESS interpolation using the default parameters. For options see 4-arity
|
|
* form of function.
|
|
*/
|
|
public static Map interpolateLOESS(Object ds, Object xColname, Object yColname) {
|
|
return (Map)interpolateLOESSFn.invoke(ds, xColname, yColname);
|
|
}
|
|
/**
|
|
* Produce 2*k datasets from 1 dataset using k-fold algorithm.
|
|
* Returns a k maps of the form `{:test-ds :train-ds}.
|
|
*
|
|
* Options:
|
|
*
|
|
* * `:randomize-dataset?` - When true, shuffle dataset. Defaults to true.
|
|
* * `:seed` - When randomizing dataset, seed may be either an integer or an implementation
|
|
* of `java.util.Random`.
|
|
*/
|
|
public static Iterable kFold(Object ds, long k, Object options) {
|
|
return (Iterable)kFoldFn.invoke(ds, k, options);
|
|
}
|
|
/**
|
|
* Return k maps of the form `{:test-ds :train-ds}`. For options see 3-arity form.
|
|
*/
|
|
public static Iterable kFold(Object ds, long k) {
|
|
return (Iterable)kFoldFn.invoke(ds, k);
|
|
}
|
|
/**
|
|
* Split the dataset returning a map of `{:train-ds :test-ds}`.
|
|
*
|
|
* Options:
|
|
*
|
|
* * `:randomize-dataset?` - Defaults to true.
|
|
* * `:seed` - When provided must be an integer or an implementation `java.util.Random`.
|
|
* * `:train-fraction` - Fraction of dataset to use as training set. Defaults to 0.7.
|
|
*/
|
|
public static Map trainTestSplit(Object ds, Object options) {
|
|
return (Map)trainTestFn.invoke(ds, options);
|
|
}
|
|
/**
|
|
* Randomize then split dataset using 70% of the data for training and the rest for testing.
|
|
*/
|
|
public static Map trainTestSplit(Object ds) {
|
|
return (Map)trainTestFn.invoke(ds);
|
|
}
|
|
/**
|
|
* Set a column in the dataset as the inference target. This information is stored in the
|
|
* column metadata. This function is short form for:
|
|
*
|
|
*```java
|
|
* Object col = column(ds, cname);
|
|
* return assoc(ds, cname, varyMeta(col, assocFn, kw("inference-target?"), true));
|
|
*```
|
|
*/
|
|
public static Map setInferenceTarget(Object ds, Object cname) {
|
|
return (Map)setInfTargetFn.invoke(ds, cname);
|
|
}
|
|
/**
|
|
* Find the inference column. If column was the result of a categorical mapping, reverse
|
|
* that mapping. Return data in a form that can be efficiently converted to a Buffer.
|
|
*/
|
|
public static Object labels(Object ds) {
|
|
return labelsFn.invoke(ds);
|
|
}
|
|
/**
|
|
* Given a dataset where the column names are labels and the each row is a probabilitly
|
|
* distribution across the labels, produce a Buffer of labels taking the highest probability
|
|
* for each row to choose the label.
|
|
*/
|
|
public static Object probabilityDistributionToLabels(Object ds) {
|
|
return probDistToLabel.invoke(ds);
|
|
}
|
|
/**
|
|
* Return a map of val->idx for the inference target.
|
|
*/
|
|
public static Map inferenceTargetLabelMap(Object ds) {
|
|
return (Map)infTargetLabelMap.invoke(ds);
|
|
}
|
|
}
|