216 lines
9.5 KiB
Java
Vendored
216 lines
9.5 KiB
Java
Vendored
package tech.v3.dataset;
|
|
|
|
import static tech.v3.Clj.*;
|
|
import clojure.lang.IFn;
|
|
import clojure.lang.Keyword;
|
|
import java.util.Map;
|
|
|
|
/**
|
|
* Fixed and variable length rolling windows. For variable rolling windows the dataset
|
|
* must already be sorted by the target column. Datetime support is provided in terms of
|
|
* provide specific units in which to perform the rolling operation such as the keyword
|
|
* `:days`.
|
|
*
|
|
*/
|
|
public class Rolling {
|
|
|
|
private Rolling(){}
|
|
|
|
static final IFn meanFn = requiringResolve("tech.v3.dataset.rolling", "mean");
|
|
static final IFn sumFn = requiringResolve("tech.v3.dataset.rolling", "sum");
|
|
static final IFn minFn = requiringResolve("tech.v3.dataset.rolling", "min");
|
|
static final IFn maxFn = requiringResolve("tech.v3.dataset.rolling", "max");
|
|
static final IFn varianceFn = requiringResolve("tech.v3.dataset.rolling", "variance");
|
|
static final IFn stddevFn = requiringResolve("tech.v3.dataset.rolling", "standard-deviation");
|
|
static final IFn nth = requiringResolve("tech.v3.dataset.rolling", "nth");
|
|
static final IFn firstFn = requiringResolve("tech.v3.dataset.rolling", "first");
|
|
static final IFn lastFn = requiringResolve("tech.v3.dataset.rolling", "last");
|
|
static final IFn rollingFn = requiringResolve("tech.v3.dataset.rolling", "rolling");
|
|
|
|
/**
|
|
* Fixed or variable rolling window reductions.
|
|
*
|
|
* @param windowSpec Window specification specifying the type of window, either a
|
|
* window over a fixed number of rows or a window based on a fixed logical
|
|
* quantitative difference i.e. three months or 10 milliseconds.
|
|
* @param reducerMap map of dest column name to reducer where reducer is a map with
|
|
* two keys, :column-name which is the input column to use and :reducer which is
|
|
* an IFn that receives each window of data as a buffer.
|
|
*
|
|
* Example:
|
|
*
|
|
*```java
|
|
* Map stocks = makeDataset("https://github.com/techascent/tech.ml.dataset/raw/master/test/data/stocks.csv");
|
|
*
|
|
* //Variable-sized windows require the source column to be sorted.
|
|
* stocks = sortByColumn(stocks, "date");
|
|
* Map variableWin = Rolling.rolling(stocks,
|
|
* Rolling.variableWindow("date", 3, kw("months")),
|
|
* hashmap("price-mean-3m", Rolling.mean("price"),
|
|
* "price-max-3m", Rolling.max("price"),
|
|
* "price-min-3m", Rolling.min("price")));
|
|
*println(head(variableWin, 10));
|
|
*https://github.com/techascent/tech.ml.dataset/raw/master/test/data/stocks.csv [10 6]:
|
|
* //| symbol | date | price | price-max-3m | price-mean-3m | price-min-3m |
|
|
* //|--------|------------|-------:|-------------:|--------------:|-------------:|
|
|
* //| AAPL | 2000-01-01 | 25.94 | 106.11 | 58.92500000 | 25.94 |
|
|
* //| IBM | 2000-01-01 | 100.52 | 106.11 | 61.92363636 | 28.66 |
|
|
* //| MSFT | 2000-01-01 | 39.81 | 106.11 | 58.06400000 | 28.66 |
|
|
* //| AMZN | 2000-01-01 | 64.56 | 106.11 | 60.09222222 | 28.66 |
|
|
* //| AAPL | 2000-02-01 | 28.66 | 106.11 | 57.56583333 | 28.37 |
|
|
* //| MSFT | 2000-02-01 | 36.35 | 106.11 | 60.19363636 | 28.37 |
|
|
* //| IBM | 2000-02-01 | 92.11 | 106.11 | 62.57800000 | 28.37 |
|
|
* //| AMZN | 2000-02-01 | 68.87 | 106.11 | 59.29666667 | 28.37 |
|
|
* //| AMZN | 2000-03-01 | 67.00 | 106.11 | 54.65583333 | 21.00 |
|
|
* //| MSFT | 2000-03-01 | 43.22 | 106.11 | 53.53363636 | 21.00 |
|
|
*
|
|
* //Fixed window...
|
|
*
|
|
* Object radians = VecMath.mul(2.0*Math.PI, VecMath.div(range(33), 32.0));
|
|
* Map sinds = makeDataset(hashmap("radians", radians, "sin", VecMath.sin(radians)));
|
|
* Map fixedWin = Rolling.rolling(sinds,
|
|
* Rolling.fixedWindow(4),
|
|
* hashmap("sin-roll-mean", Rolling.mean("sin"),
|
|
* "sin-roll-max", Rolling.max("sin"),
|
|
* "sin-roll-min", Rolling.min("sin")));
|
|
*println(head(fixedWin, 8));
|
|
* //_unnamed [8 5]:
|
|
|
|
* //| sin | radians | sin-roll-max | sin-roll-min | sin-roll-mean |
|
|
* //|-----------:|-----------:|-------------:|-------------:|--------------:|
|
|
* //| 0.00000000 | 0.00000000 | 0.19509032 | 0.00000000 | 0.04877258 |
|
|
* //| 0.19509032 | 0.19634954 | 0.38268343 | 0.00000000 | 0.14444344 |
|
|
* //| 0.38268343 | 0.39269908 | 0.55557023 | 0.00000000 | 0.28333600 |
|
|
* //| 0.55557023 | 0.58904862 | 0.70710678 | 0.19509032 | 0.46011269 |
|
|
* //| 0.70710678 | 0.78539816 | 0.83146961 | 0.38268343 | 0.61920751 |
|
|
* //| 0.83146961 | 0.98174770 | 0.92387953 | 0.55557023 | 0.75450654 |
|
|
* //| 0.92387953 | 1.17809725 | 0.98078528 | 0.70710678 | 0.86081030 |
|
|
* //| 0.98078528 | 1.37444679 | 1.00000000 | 0.83146961 | 0.93403361 |
|
|
*```
|
|
*/
|
|
public static Map rolling(Object ds, Map windowSpec, Map reducerMap) {
|
|
return (Map)rollingFn.invoke(ds, windowSpec, reducerMap);
|
|
}
|
|
/**
|
|
* Create a variable window specification with a double windowsize for a particular column.
|
|
* This specification will not work on datetime columns.
|
|
*/
|
|
public static Map variableWindow(Object colname, double windowSize) {
|
|
return hashmap(kw("window-type"), kw("variable"),
|
|
kw("column-name"), colname,
|
|
kw("window-size"), windowSize);
|
|
}
|
|
/**
|
|
* Create a variable window specification with a double windowsize for a particular column
|
|
* and a compFn which must take two values and return a double. The function must take 2
|
|
* arguments and the arguments are passed in as (later,earlier). This allows the basic
|
|
* clojure '-' operator to work fine in many cases.
|
|
*
|
|
*/
|
|
public static Map variableWindow(Object colname, double windowSize, Object compFn) {
|
|
return hashmap(kw("window-type"), kw("variable"),
|
|
kw("column-name"), colname,
|
|
kw("window-size"), windowSize,
|
|
kw("comp-fn"), compFn);
|
|
}
|
|
/**
|
|
* Create a datetime-specific variable window specification with a double windowsize for
|
|
* a particular column.
|
|
*
|
|
* @param datetimeUnit One of `[:milliseconds, :seconds, :hours, :days, :months]`.
|
|
*/
|
|
public static Map variableWindow(Object colname, double windowSize, Keyword datetimeUnit) {
|
|
return hashmap(kw("window-type"), kw("variable"),
|
|
kw("column-name"), colname,
|
|
kw("window-size"), windowSize,
|
|
kw("units"), datetimeUnit);
|
|
}
|
|
/**
|
|
* Return fixed size rolling window. Window will be fixed over `window-size` rows.
|
|
*/
|
|
public static Map fixedWindow(long windowSize) {
|
|
return hashmap(kw("window-type"), kw("fixed"),
|
|
kw("window-size"), windowSize);
|
|
}
|
|
/**
|
|
* Return fixed size rolling window. Window will be fixed over `window-size` rows.
|
|
*
|
|
* @param winPos One of `[:left :center :right]`. This combined with the default
|
|
* edge mode of `:clamp` dictates the windows of data the reducer sees.
|
|
*/
|
|
public static Map fixedWindow(long windowSize, Keyword winPos) {
|
|
return hashmap(kw("window-type"), kw("fixed"),
|
|
kw("window-size"), windowSize,
|
|
kw("relative-window-position"), winPos);
|
|
}
|
|
/**
|
|
* Return fixed size rolling window. Window will be fixed over `window-size` rows.
|
|
*
|
|
* @param winPos One of `[:left :center :right]`. This combined with the default
|
|
* edge mode dictates windows of data the reducer sees.
|
|
*
|
|
* @param edgeMode One of `[:zero, null, :clamp]`. Clamp means repeat the end value.
|
|
*/
|
|
public static Map fixedWindow(long windowSize, Keyword winPos, Keyword edgeMode) {
|
|
return hashmap(kw("window-type"), kw("fixed"),
|
|
kw("window-size"), windowSize,
|
|
kw("relative-window-position"), winPos,
|
|
kw("edge-mode"), edgeMode);
|
|
}
|
|
/**
|
|
* Create a columnwise reducer. This reducer gets sub-windows from the column and
|
|
* must return a scalar value. If srcColname is a vector of colnames then reduceFn
|
|
* will be passed each column window as a separate argument.
|
|
*
|
|
* @param datatype Option datatype, may be nil in which case the dataset will scan the
|
|
* result to infer datatype. If provided this will enforce the result column datatype.
|
|
* Reductions to numeric types with fixed datatypes will be slightly faster than
|
|
* generic reductions which require inference to find the final datatype.
|
|
*/
|
|
public static Map reducer(Object srcColname, IFn reduceFn, Keyword datatype) {
|
|
return hashmap(kw("column-name"), srcColname,
|
|
kw("reducer"), reduceFn,
|
|
kw("datatype"), datatype);
|
|
}
|
|
/**
|
|
* Create a columnwise reducer eliding datatype parameter. See documentation
|
|
* on 3-arity form of function.
|
|
*/
|
|
public static Map reducer(Object srcColname, IFn reduceFn) {
|
|
return hashmap(kw("column-name"), srcColname,
|
|
kw("reducer"), reduceFn);
|
|
}
|
|
/** mean reducer*/
|
|
public static Map mean(Object colname) {
|
|
return (Map)meanFn.invoke(colname);
|
|
}
|
|
/** sum reducer*/
|
|
public static Map sum(Object colname) {
|
|
return (Map)sumFn.invoke(colname);
|
|
}
|
|
/** min reducer*/
|
|
public static Map min(Object colname) {
|
|
return (Map)minFn.invoke(colname);
|
|
}
|
|
/** max reducer*/
|
|
public static Map max(Object colname) {
|
|
return (Map)maxFn.invoke(colname);
|
|
}
|
|
/** stddev reducer*/
|
|
public static Map stddev(Object colname) {
|
|
return (Map)stddevFn.invoke(colname);
|
|
}
|
|
/** variance reducer*/
|
|
public static Map variance(Object colname) {
|
|
return (Map)varianceFn.invoke(colname);
|
|
}
|
|
/** reducer that keeps the first value*/
|
|
public static Map first(Object colname) {
|
|
return (Map)firstFn.invoke(colname);
|
|
}
|
|
/** reducer that keeps the last value*/
|
|
public static Map last(Object colname) {
|
|
return (Map)lastFn.invoke(colname);
|
|
}
|
|
}
|