init research
+35
@@ -0,0 +1,35 @@
{:paths ["classes"
         ;;Parquet logging is just over the top. We eliminate a lot of noise here.
         "logback"]
 :deps {techascent/tech.ml.dataset {:mvn/version "6.069"

                                    ;;smile has bindings to slf4j that error out with an
                                    ;;exception if there is no logging backend so I
                                    ;;included logback-classic with tmd. That has
                                    ;;caused issues with log4j based backends so if you
                                    ;;use log4j or log4j2 then you need to exclude
                                    ;;logback-classic from the tmd dependency

                                    ;; :exclusions [ch.qos.logback/logback-classic]

                                    }
        uncomplicate/neanderthal {:mvn/version "0.43.3"}
        ;;We rebuilt the arrow bindings below the schema level to support
        ;;mmap, compression, and JDK-17. Due to this the version of arrow
        ;;included isn't that important.
        org.apache.arrow/arrow-vector {:mvn/version "6.0.0"}
        ;;Compression packages for compressed arrow.
        org.lz4/lz4-java {:mvn/version "1.8.0"}
        com.github.luben/zstd-jni {:mvn/version "1.5.1-1"}

        ;; Parquet Support
        org.apache.parquet/parquet-hadoop {:mvn/version "1.12.0"
                                           :exclusions [org.slf4j/slf4j-log4j12]}
        org.apache.hadoop/hadoop-common {:mvn/version "3.3.0"
                                         :exclusions [org.slf4j/slf4j-log4j12]}
        ;; We literally need this for 1 POJO formatting object.
        org.apache.hadoop/hadoop-mapreduce-client-core {:mvn/version "3.3.0"
                                                        :exclusions [org.slf4j/slf4j-log4j12]}}
 :jdk-17
 {:jvm-opts ["--add-modules" "jdk.incubator.foreign" "--enable-native-access=ALL-UNNAMED"]}}
+644
@@ -0,0 +1,644 @@
package jtest;


import static tech.v3.Clj.*;
import static tech.v3.TMD.*;
import tech.v3.dataset.Rolling;
import tech.v3.dataset.Modelling;
import tech.v3.dataset.Reductions;
import tech.v3.libs.Arrow;
import tech.v3.libs.Parquet;
import tech.v3.DType; //access to clone method
import static tech.v3.DType.*;
import tech.v3.datatype.Pred;
import tech.v3.datatype.VecMath;
import tech.v3.datatype.Stats;
import tech.v3.datatype.Buffer;
import tech.v3.libs.Nippy;
import tech.v3.datatype.IFnDef;
//Fast map creation when you know you will have to create many maps.
import tech.v3.dataset.FastStruct;
import clojure.lang.RT;
import clojure.lang.IFn;
import java.util.Map;
import java.util.function.Function;

//Imports for the advanced reduction example at the end.
import java.util.HashMap;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.StreamSupport;
import java.util.function.BiFunction;
import java.util.function.BiConsumer;
import java.time.LocalDate;
import java.time.YearMonth;
import java.util.Random;


public class TMDDemo {
  public static void main(String[] args) {
    println("Loading/compiling library code. Time here can be mitigated with a precompilation step.");
    //Front-loading the requires so that everything is compiled by the time the rest of the code runs.
    //For precompilation see tech.v3.Clj.compile.
    require("tech.v3.dataset");
    require("tech.v3.dataset.neanderthal");

    println("Compilation finished.");
    //makeDataset can take a string, an InputStream, a sequence of maps, or a map of columns,
    //with the map of columns being the most efficient.
    //Default file formats:
    //csv, tsv, csv.gz, tsv.gz, (compressed, general, and surprisingly fast) .nippy
    Map ds = makeDataset("https://github.com/techascent/tech.ml.dataset/raw/master/test/data/stocks.csv");
    println(head(ds));
    // https://github.com/techascent/tech.ml.dataset/raw/master/test/data/stocks.csv [5 3]:
    // | symbol | date | price |
    // |--------|------------|------:|
    // | MSFT | 2000-01-01 | 39.81 |
    // | MSFT | 2000-02-01 | 36.35 |
    // | MSFT | 2000-03-01 | 43.22 |
    // | MSFT | 2000-04-01 | 28.37 |
    // | MSFT | 2000-05-01 | 25.45 |
    println(head(sortByColumn(ds, "date")));
    // https://github.com/techascent/tech.ml.dataset/raw/master/test/data/stocks.csv [5 3]:

    // | symbol | date | price |
    // |--------|------------|-------:|
    // | AAPL | 2000-01-01 | 25.94 |
    // | IBM | 2000-01-01 | 100.52 |
    // | MSFT | 2000-01-01 | 39.81 |
    // | AMZN | 2000-01-01 | 64.56 |
    // | AAPL | 2000-02-01 | 28.66 |
    println(ds.get("date"));
    // #tech.v3.dataset.column<packed-local-date>[560]
    // date
    // [2000-01-01, 2000-02-01, 2000-03-01, 2000-04-01, 2000-05-01, 2000-06-01, 2000-07-01, 2000-08-01, 2000-09-01, 2000-10-01, 2000-11-01, 2000-12-01, 2001-01-01, 2001-02-01, 2001-03-01, 2001-04-01, 2001-05-01, 2001-06-01, 2001-07-01, 2001-08-01...]

    Object priceCol = ds.get("price");
    println("first value:", call(priceCol, 0), ", last value:", call(priceCol, -1));
    //first value: 39.81 , last value: 223.02

    Map colmapDs = makeDataset(hashmap("a", range(10),
                                       "b", toDoubleArray(range(9,-1,-1))),
                               hashmap(kw("dataset-name"), "testds"));
    println(colmapDs);
    // testds [10 2]:

    // | b | a |
    // |----:|---:|
    // | 9.0 | 0 |
    // | 8.0 | 1 |
    // | 7.0 | 2 |
    // | 6.0 | 3 |
    // | 5.0 | 4 |
    // | 4.0 | 5 |
    // | 3.0 | 6 |
    // | 2.0 | 7 |
    // | 1.0 | 8 |
    // | 0.0 | 9 |

    println(meta(colmapDs));
    // {:name testds}
    //It is also trivial to add a virtual column by instantiating a Buffer object.
    //One thing to note is that colmapDs itself wasn't changed. assoc creates a new
    //dataset that shares the unchanged portions with the original dataset.
    println(assoc(colmapDs, "c", new tech.v3.datatype.LongReader() {
        public long lsize() { return 10; }
        public long readLong(long idx) {
          return 2*idx;
        }
      }));
    //testds [5 3]:

    //| b | a | c |
    //|----:|---:|---:|
    //| 9.0 | 0 | 0 |
    //| 8.0 | 1 | 2 |
    //| 7.0 | 2 | 4 |
    //| 6.0 | 3 | 6 |
    //| 5.0 | 4 | 8 |


    // The metadata on columns has quite a bit of useful information in it.
    println(meta(call(colmapDs, "a")), meta(call(colmapDs, "b")));
    // {:name a, :datatype :int64, :n-elems 10} {:name b, :datatype :float64, :n-elems 10}

    Buffer rows = rows(colmapDs);
    println("First row:", call(rows,0), ", last row:", call(rows,-1));
    // First row: {b 9.0, a 0} , last row: {b 0.0, a 9}

    Buffer rowvecs = rowvecs(colmapDs);
    println("First rowvec:", call(rowvecs,0), ", last rowvec:", call(rowvecs,-1));
    // First rowvec: [9.0 0] , last rowvec: [0.0 9]
    println("Tensor format:", toTensor(colmapDs));
    // Tensor format: #tech.v3.tensor<float64>[10 2]
    // [[9.000 0.000]
    //  [8.000 1.000]
    //  [7.000 2.000]
    //  [6.000 3.000]
    //  [5.000 4.000]
    //  [4.000 5.000]
    //  [3.000 6.000]
    //  [2.000 7.000]
    //  [1.000 8.000]
    //  [0.000 9.000]]

    println("Neanderthal format:", toNeanderthal(colmapDs));
    //Neanderthal format: #RealGEMatrix[double, mxn:10x2, layout:column, offset:0]
    // ▥ ↓ ↓ ┓
    // → 9.00 0.00
    // → 8.00 1.00
    // → ⁙ ⁙
    // → 1.00 8.00
    // → 0.00 9.00
    // ┗ ┛

    Map stocks = makeDataset("https://github.com/techascent/tech.ml.dataset/raw/master/test/data/stocks.csv");


    //Filtering by a column is faster than the generalized row-by-row filter
    //and it lets the engine optimize for the case where the predicate is a constant comparison.
    println(head(filterColumn(stocks, "symbol", Pred.eq("MSFT"))));
    //https://github.com/techascent/tech.ml.dataset/raw/master/test/data/stocks.csv [5 3]:

    //| symbol | date | price |
    //|--------|------------|------:|
    //| MSFT | 2000-01-01 | 39.81 |
    //| MSFT | 2000-02-01 | 36.35 |
    //| MSFT | 2000-03-01 | 43.22 |
    //| MSFT | 2000-04-01 | 28.37 |
    //| MSFT | 2000-05-01 | 25.45 |

    //Grouping returns a map of key to dataset. This can serve as a pre-aggregation
    //step or as a simple index.
    Map bySymbol = groupByColumn(stocks, "symbol");
    println(keys(bySymbol));
    //(MSFT AMZN IBM GOOG AAPL)
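    //For example (an illustrative addition, not part of the original demo): each group is
    //itself a dataset, so the usual dataset functions work on a single group directly.
    println(head(bySymbol.get("AAPL")));
    //Prints the first five AAPL rows with the same symbol/date/price columns as above.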
    //Construct a new dataset by scanning a sequence of maps. This performs the aggregation
    //step after grouping by symbol. There is a higher performance way of doing this
    //described later but this method is most likely sufficient for many use
    //cases.
    println(makeDataset(map(new IFnDef() {
        public Object invoke(Object kv) {
          Map.Entry item = (Map.Entry)kv;
          return hashmap("symbol", item.getKey(),
                         "meanPrice", Stats.mean(column(item.getValue(), "price")));
        }}, bySymbol)));
    // _unnamed [5 2]:
    //| symbol | meanPrice |
    //|--------|-------------:|
    //| MSFT | 24.73674797 |
    //| AMZN | 47.98707317 |
    //| IBM | 91.26121951 |
    //| GOOG | 415.87044118 |
    //| AAPL | 64.73048780 |


    //Variable rolling window reductions require the target column to be monotonically
    //increasing - for each value x(n), x(n+1) is greater than or equal to x(n). So for
    //financial data this usually means ordered by date.
    Map goog = sortByColumn(bySymbol.get("GOOG"), "date");
    println(head(goog));
    //GOOG [5 3]:

    //| symbol | date | price |
    //|--------|------------|-------:|
    //| GOOG | 2004-08-01 | 102.37 |
    //| GOOG | 2004-09-01 | 129.60 |
    //| GOOG | 2004-10-01 | 190.64 |
    //| GOOG | 2004-11-01 | 181.98 |
    //| GOOG | 2004-12-01 | 192.79 |

    //If we want our column of dates to be in epoch-days, which is a lot more friendly to
    //machine learning, we can easily do so:
    Buffer dateBuf = toBuffer(column(goog, "date"));
    //There are many ways to do this but here is a low-level way.
    println(head(assoc(goog, "date",
      //all integer types funnel through LongBuffer/LongReader pathways.
      new tech.v3.datatype.LongReader() {
        //Aside from :int32, kw("epoch-days") is another valid datatype for
        //precisely this data.
        public Object elemwiseDatatype() { return int32; }
        public long lsize() { return dateBuf.lsize(); }
        public long readLong(long idx) {
          LocalDate ld = (LocalDate)dateBuf.readObject(idx);
          //Missing values will be null when using the readObject pathway.
          //The stocks dataset has no missing values. We strongly encourage
          //you to deal with missing values before getting into your
          //pipeline processing pathways.
          return ld.toEpochDay();
        }
      })));
    //GOOG [5 3]:

    //| symbol | date | price |
    //|--------|------:|-------:|
    //| GOOG | 12631 | 102.37 |
    //| GOOG | 12662 | 129.60 |
    //| GOOG | 12692 | 190.64 |
    //| GOOG | 12723 | 181.98 |
    //| GOOG | 12753 | 192.79 |


    Map variableWin = Rolling.rolling(goog,
                                      Rolling.variableWindow("date", 3, kw("months")),
                                      hashmap("price-mean-3m", Rolling.mean("price"),
                                              "price-max-3m", Rolling.max("price"),
                                              "price-min-3m", Rolling.min("price")));
    println(head(variableWin, 10));
    //GOOG [10 6]:

    //| symbol | date | price | price-max-3m | price-mean-3m | price-min-3m |
    //|--------|------------|-------:|-------------:|--------------:|-------------:|
    //| GOOG | 2004-08-01 | 102.37 | 190.64 | 140.87000000 | 102.37 |
    //| GOOG | 2004-09-01 | 129.60 | 190.64 | 167.40666667 | 129.60 |
    //| GOOG | 2004-10-01 | 190.64 | 192.79 | 188.47000000 | 181.98 |
    //| GOOG | 2004-11-01 | 181.98 | 195.62 | 190.13000000 | 181.98 |
    //| GOOG | 2004-12-01 | 192.79 | 195.62 | 192.13333333 | 187.99 |
    //| GOOG | 2005-01-01 | 195.62 | 195.62 | 188.04000000 | 180.51 |

    //Create a vector from 0->2*PI in 32 increments (33 points).
    Object radians = VecMath.mul(2.0*Math.PI, VecMath.div(range(33), 32.0));
    Map sinds = makeDataset(hashmap("radians", radians, "sin", VecMath.sin(radians)));
    Map fixedWin = Rolling.rolling(sinds,
                                   Rolling.fixedWindow(4),
                                   hashmap("sin-roll-mean", Rolling.mean("sin"),
                                           "sin-roll-max", Rolling.max("sin"),
                                           "sin-roll-min", Rolling.min("sin")));
    println(head(fixedWin, 8));
    //_unnamed [8 5]:

    //| sin | radians | sin-roll-max | sin-roll-min | sin-roll-mean |
    //|-----------:|-----------:|-------------:|-------------:|--------------:|
    //| 0.00000000 | 0.00000000 | 0.19509032 | 0.00000000 | 0.04877258 |
    //| 0.19509032 | 0.19634954 | 0.38268343 | 0.00000000 | 0.14444344 |
    //| 0.38268343 | 0.39269908 | 0.55557023 | 0.00000000 | 0.28333600 |
    //| 0.55557023 | 0.58904862 | 0.70710678 | 0.19509032 | 0.46011269 |
    //| 0.70710678 | 0.78539816 | 0.83146961 | 0.38268343 | 0.61920751 |
    //| 0.83146961 | 0.98174770 | 0.92387953 | 0.55557023 | 0.75450654 |
    //| 0.92387953 | 1.17809725 | 0.98078528 | 0.70710678 | 0.86081030 |
    //| 0.98078528 | 1.37444679 | 1.00000000 | 0.83146961 | 0.93403361 |
    //The join algorithm is a fast in-memory hash-based join.
    Map dsa = makeDataset(hashmap("a", vector("a", "b", "b", "a", "c"),
                                  "b", range(5),
                                  "c", range(5)));
    println(dsa);
    //_unnamed [5 3]:

    //| a | b | c |
    //|---|--:|--:|
    //| a | 0 | 0 |
    //| b | 1 | 1 |
    //| b | 2 | 2 |
    //| a | 3 | 3 |
    //| c | 4 | 4 |


    Map dsb = makeDataset(hashmap("a", vector("a", "b", "a", "b", "d"),
                                  "b", range(5),
                                  "c", range(6,11)));
    println(dsb);
    //_unnamed [5 3]:

    //| a | b | c |
    //|---|--:|---:|
    //| a | 0 | 6 |
    //| b | 1 | 7 |
    //| a | 2 | 8 |
    //| b | 3 | 9 |
    //| d | 4 | 10 |


    //Join on the columns a and b. The default join mode is inner.
    println(join(dsa, dsb, hashmap(kw("on"), vector("a", "b"))));
    //inner-join [2 4]:

    //| a | b | c | right.c |
    //|---|--:|--:|--------:|
    //| a | 0 | 0 | 6 |
    //| b | 1 | 1 | 7 |


    //A single-column join doesn't require the column name to be wrapped in a vector.
    println(join(dsa, dsb, hashmap(kw("on"), "a")));
    //inner-join [8 5]:

    //| a | b | c | right.b | right.c |
    //|---|--:|--:|--------:|--------:|
    //| a | 0 | 0 | 0 | 6 |
    //| a | 3 | 3 | 0 | 6 |
    //| b | 1 | 1 | 1 | 7 |
    //| b | 2 | 2 | 1 | 7 |
    //| a | 0 | 0 | 2 | 8 |
    //| a | 3 | 3 | 2 | 8 |
    //| b | 1 | 1 | 3 | 9 |
    //| b | 2 | 2 | 3 | 9 |


    //Outer join on the same columns.
    println(join(dsa, dsb, hashmap(kw("on"), vector("a", "b"),
                                   kw("how"), kw("outer"))));
    //outer-join [8 4]:

    //| a | b | c | right.c |
    //|---|--:|--:|--------:|
    //| a | 0 | 0 | 6 |
    //| b | 1 | 1 | 7 |
    //| b | 2 | 2 | |
    //| a | 3 | 3 | |
    //| c | 4 | 4 | |
    //| a | 2 | | 8 |
    //| b | 3 | | 9 |
    //| d | 4 | | 10 |
    //Specific to timeseries-type information, there is a special join operator
    //named leftJoinAsof where every row of the left dataset is kept and the join column is
    //matched to the 'nearest' value in the corresponding column of the right dataset.

    Map targetPrices = makeDataset(hashmap("price", new Double[] { 200.0, 300.0, 400.0 }));

    println(leftJoinAsof("price", targetPrices, goog, hashmap(kw("asof-op"), kw("<="))));
    //asof-<= [3 4]:
    //| price | symbol | date | GOOG.price |
    //|------:|--------|------------|-----------:|
    //| 200.0 | GOOG | 2005-04-01 | 220.00 |
    //| 300.0 | GOOG | 2008-12-01 | 307.65 |
    //| 400.0 | GOOG | 2008-09-01 | 400.52 |
    println(leftJoinAsof("price", targetPrices, goog, hashmap(kw("asof-op"), kw(">"))));
    //asof-> [3 4]:
    //| price | symbol | date | GOOG.price |
    //|------:|--------|------------|-----------:|
    //| 200.0 | GOOG | 2005-01-01 | 195.62 |
    //| 300.0 | GOOG | 2005-06-01 | 294.15 |
    //| 400.0 | GOOG | 2009-04-01 | 395.97 |


    //tech.v3.dataset.Modelling moves us more into machine learning pathways.
    //We can do things like PCA transformations or train/test pathways.
    Object categoricalFit = Modelling.fitCategorical(stocks, "symbol");
    println(head(Modelling.transformCategorical(stocks, categoricalFit)));
    //https://github.com/techascent/tech.ml.dataset/raw/master/test/data/stocks.csv [5 3]:

    //| symbol | date | price |
    //|-------:|------------|-------:|
    //| 1.0 | 2000-01-01 | 25.94 |
    //| 4.0 | 2000-01-01 | 100.52 |
    //| 3.0 | 2000-01-01 | 39.81 |
    //| 2.0 | 2000-01-01 | 64.56 |
    //| 1.0 | 2000-02-01 | 28.66 |


    //Remember the rolling sinewave dataset from before?
    //Let's run PCA on it.
    //This pathway uses the slightly slower covariance-based method, which has the distinct
    //advantage of producing accurate variances in the eigenvalues member.

    Object pcaFit = Modelling.fitPCA(fixedWin, hashmap(kw("n-components"), 2));
    println(head(Modelling.transformPCA(fixedWin, pcaFit)));
    //_unnamed [5 2]:

    //| 0 | 1 |
    //|------------:|------------:|
    //| -2.68909118 | -1.63147765 |
    //| -2.65664577 | -1.31993055 |
    //| -2.63001624 | -0.99954776 |
    //| -2.65746329 | -0.60134499 |
    //| -2.66466548 | -0.23414574 |


    //We can save our pipeline data altogether into a byte array using the Nippy namespace.
    byte[] data = Nippy.freeze(hashmap("catFit", categoricalFit, "pcaFit", pcaFit));
    println("pipeline data byte length:", data.length);
    //pipeline data byte length: 864
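    //A hedged sketch (not part of the original demo): Nippy data is round-trippable, so the
    //frozen pipeline map can be read back from the byte array. This assumes tech.v3.libs.Nippy
    //exposes thaw as the counterpart to freeze.
    Map pipeline = (Map)Nippy.thaw(data);
    println(keys(pipeline));
    //Expected to print the two keys, catFit and pcaFit.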
    //We can serialize *just* datasets to Arrow, which gives us an interesting possibility.
    Arrow.datasetToStream(stocks, "test.arrow", null);

    //We can mmap them back. This step will fail if you are on an M1 Mac unless you add
    //the foreign memory module. See the :jdk-17 alias in deps.edn for the required JVM options.
    try(AutoCloseable resCtx = stackResourceContext()) {
      //This dataset is loaded in-place. This means that aside from string tables
      //the columns are just loaded from the mmap pointers.
      Map mmapds = Arrow.streamToDataset("test.arrow", hashmap(kw("open-type"), kw("mmap")));
      println(head(mmapds));
      //test.arrow [5 3]:

      //| symbol | date | price |
      //|--------|------------|-------:|
      //| AAPL | 2000-01-01 | 25.94 |
      //| IBM | 2000-01-01 | 100.52 |
      //| MSFT | 2000-01-01 | 39.81 |
      //| AMZN | 2000-01-01 | 64.56 |
      //| AAPL | 2000-02-01 | 28.66 |

      //Cloning a dataset serves to both realize any lazy columns
      //and copy the dataset into JVM-heap memory, thus allowing you to return
      //something from the stack resource context.
      println(head(tech.v3.DType.clone(mmapds)));
    }
    catch(Exception e){
      println(e);
      e.printStackTrace(System.out);
    }
    //Finally we can load/save to Parquet if that is your thing.
    Parquet.datasetToParquet(stocks, "test.parquet", null);
    //Specifying a subset of columns to load makes this *much* faster.
    //To do this use :column-whitelist - see the dataset api docs for `->dataset` and the
    //sketch after the printed output below.
    //NOTE - If you don't disable debug logging then serializing to/from parquet is
    //unreasonably slow. See the logging section of https://techascent.github.io/tech.ml.dataset/tech.v3.libs.parquet.html.
    println(head(Parquet.parquetToDataset("test.parquet", null)));
    //_unnamed [5 3]:

    //| symbol | date | price |
    //|--------|------------|-------:|
    //| AAPL | 2000-01-01 | 25.94 |
    //| IBM | 2000-01-01 | 100.52 |
    //| MSFT | 2000-01-01 | 39.81 |
    //| AMZN | 2000-01-01 | 64.56 |
    //| AAPL | 2000-02-01 | 28.66 |
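    //A hedged sketch (not part of the original demo): the :column-whitelist option mentioned
    //above can be passed through the parquet options map to load only selected columns. The
    //exact option key is an assumption taken from the comment above and the ->dataset docs.
    println(head(Parquet.parquetToDataset("test.parquet",
                                          hashmap(kw("column-whitelist"), vector("symbol", "price")))));
    //Should print just the symbol and price columns of the first five rows.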


    //Here is a somewhat advanced example. We have a dataset composed of events where each
    //row has a start and end date. We want to tally information based on the days per given month
    //that the event spans, which means we need to expand the dataset into days then reduce
    //it to tally over months. Finally we do another crosswise summation to pull out statistics
    //based on row information in the dataset.
    int nSims = 100;
    int nPlacements = 50;
    int nExpansion = 20;
    long nRows = 1000000;
    LocalDate today = LocalDate.now();
    Random rand = new Random();
    Object startDates = vec(repeatedly(nRows, new IFnDef() { public Object invoke() { return today.minusDays(400 + rand.nextInt(100)); } }));
    //Dataset with 1 million rows
    Map srcds = makeDataset(hashmap("simulation", repeatedly(nRows, new IFnDef() { public Object invoke() { return rand.nextInt(nSims); }}),
                                    "placement", repeatedly(nRows, new IFnDef() { public Object invoke() { return rand.nextInt(nPlacements); }}),
                                    "start", startDates,
                                    "end", map(new IFnDef() { public Object invoke(Object sd) { return ((LocalDate)sd).plusDays(rand.nextInt(nExpansion)); }},
                                               startDates)));
    println(head(srcds));
    //_unnamed [5 4]:

    //| placement | start | simulation | end |
    //|-----------:|------------|------------:|------------|
    //| 14 | 2020-09-28 | 86 | 2020-09-29 |
    //| 32 | 2020-12-17 | 20 | 2021-01-03 |
    //| 23 | 2020-10-15 | 37 | 2020-10-24 |
    //| 49 | 2020-10-07 | 18 | 2020-10-22 |
    //| 6 | 2020-12-08 | 48 | 2020-12-08 |


    //We are going to be creating a lot of these.
    IFn mapFact = mapFactory(vector("year-month", "count"));
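    //For illustration (not part of the original demo): the factory takes values positionally
    //and returns a map keyed by the names given above, i.e. {"year-month" ..., "count" ...}.
    println(mapFact.invoke(YearMonth.parse("2021-01"), 3L));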
    //We want to produce a map of year-month to day counts.
    BiFunction<YearMonth,Long,Long> incrementor = new BiFunction<YearMonth,Long,Long>() {
        public Long apply(YearMonth k, Long v) {
          if (v != null) {
            return ((long)v) + 1;
          } else {
            return 1L;
          }
        }
      };
    //Tally the days between start/end, recording them in a map of YearMonth to day tally.
    //Returns a list of maps of "year-month", "count".
    IFn tallyDays = new IFnDef() {
        public Object invoke(Object row) {
          Map rowMap = (Map) row;
          LocalDate sd = (LocalDate)rowMap.get("start");
          LocalDate ed = (LocalDate)rowMap.get("end");
          long ndays = sd.until(ed, java.time.temporal.ChronoUnit.DAYS);
          HashMap<YearMonth,Long> tally = new HashMap<YearMonth,Long>();
          for (long idx = 0; idx < ndays; ++idx) {
            LocalDate cur = sd.plusDays(idx);
            YearMonth rm = YearMonth.from(cur);
            tally.compute(rm, incrementor);
          }
          ArrayList<Map> retval = new ArrayList<Map>(tally.size());
          tally.forEach(new BiConsumer<YearMonth,Long>() {
              public void accept(YearMonth k, Long v) {
                retval.add((Map)mapFact.invoke(k, v));
              }
            });
          return retval;
        }
      };

    println(vec(tallyDays.invoke(hashmap("start", LocalDate.parse("2020-12-17"),
                                         "end", LocalDate.parse("2021-01-03")))));
    //[{year-month #object[java.time.YearMonth 0x5eafef3a 2020-12], count 15} {year-month #object[java.time.YearMonth 0x3bcfebf6 2021-01], count 2}]
    //Next we expand our original dataset to be year-month tallies in addition to
    //the start/end dates.
    println(rowMapcat(head(srcds), tallyDays, null));
    //_unnamed [7 6]:

    //| placement | start | simulation | end | count | year-month |
    //|----------:|------------|-----------:|------------|------:|------------|
    //| 11 | 2020-10-29 | 41 | 2020-11-02 | 1 | 2020-11 |
    //| 11 | 2020-10-29 | 41 | 2020-11-02 | 3 | 2020-10 |
    //| 13 | 2020-10-11 | 5 | 2020-10-19 | 8 | 2020-10 |
    //| 16 | 2020-12-08 | 10 | 2020-12-11 | 3 | 2020-12 |
    //| 1 | 2020-10-15 | 52 | 2020-10-19 | 4 | 2020-10 |

    //Begin parallelized expansion
    Iterable dsSeq = (Iterable)rowMapcat(srcds, tallyDays, hashmap(kw("result-type"), kw("as-seq")));

    //The first aggregation is to summarize the year-month tallies by placement and simulation.
    //We are essentially replacing count with a summarized count. After this statement
    //we can guarantee that the dataset has unique tuples of [simulation, placement, year-month].
    Map initAgg = Reductions.groupByColumnsAgg(dsSeq, vector("simulation", "placement", "year-month"),
                                               hashmap("count", Reductions.sum("count")),
                                               null);
    println(head(initAgg));
    //["simulation" "placement" "year-month"]-aggregation [5 4]:

    //| simulation | placement | year-month | count |
    //|-----------:|----------:|------------|------:|
    //| 0 | 0 | 2020-12 | 622.0 |
    //| 0 | 1 | 2020-12 | 591.0 |
    //| 0 | 2 | 2020-12 | 500.0 |
    //| 0 | 3 | 2020-12 | 549.0 |
    //| 0 | 4 | 2020-12 | 595.0 |

    // The second aggregation allows us to build up statistics over each placement/year-month
    // pair, thus finding the distribution of a given placement/year-month across simulations.
    Map result = Reductions.groupByColumnsAgg(vector(initAgg), vector("placement", "year-month"),
                                              hashmap("min-count", Reductions.probQuantile("count", 0.0),
                                                      "low-95-count", Reductions.probQuantile("count", 0.05),
                                                      "q1-count", Reductions.probQuantile("count", 0.25),
                                                      "median-count", Reductions.probQuantile("count", 0.5),
                                                      "q3-count", Reductions.probQuantile("count", 0.75),
                                                      "high-95-count", Reductions.probQuantile("count", 0.95),
                                                      "max-count", Reductions.probQuantile("count", 1.0),
                                                      "count", Reductions.sum("count")),
                                              null);
    //Take a million row dataset, expand it, then perform two grouping aggregations.
    println(head(result));
    //["placement" "year-month"]-aggregation [5 10]:

    //| q3-count | median-count | min-count | high-95-count | placement | max-count | count | low-95-count | q1-count | year-month |
    //|---------:|-------------:|----------:|--------------:|----------:|----------:|--------:|-------------:|---------:|------------|
    //| 646.0 | 593.0 | 366.0 | 716.0 | 36 | 809.0 | 58920.0 | 475.0 | 536.0 | 2020-12 |
    //| 621.0 | 560.0 | 376.0 | 739.0 | 36 | 782.0 | 57107.0 | 459.0 | 512.0 | 2020-10 |
    //| 168.0 | 139.0 | 25.0 | 211.0 | 0 | 246.0 | 13875.0 | 76.0 | 112.0 | 2021-01 |
    //| 658.0 | 607.0 | 384.0 | 745.0 | 0 | 825.0 | 60848.0 | 486.0 | 561.0 | 2020-12 |
    //| 628.0 | 581.0 | 422.0 | 693.0 | 0 | 802.0 | 58148.0 | 468.0 | 539.0 | 2020-11 |


    //Let's do a quick file size comparison of the original simulation dataset.
    //We have four columns: placement, simulation, start date, end date. We know, however,
    //that placement and simulation will fit into byte data as they are integers 0-49 and 0-99,
    //respectively. So let's start there.
    Map simds = (Map)assoc(srcds,
                           //These are checked casts.
                           "simulation", makeContainer(kw("uint8"), srcds.get("simulation")),
                           "placement", makeContainer(kw("uint8"), srcds.get("placement")));
    writeDataset(simds, "simulation.csv.gz");
    writeDataset(simds, "simulation.nippy");
    Arrow.datasetToStream(simds, "simulation.arrow", null);
    Arrow.datasetToStream(simds, "simulation-compressed.arrow", hashmap(kw("compression"),
                                                                        hashmap(kw("compression-type"), kw("zstd"),
                                                                                kw("level"), 8)));
    Parquet.datasetToParquet(simds, "simulation.parquet", null);


    IFn fileLen = new IFnDef() {
        public Object invoke(Object fname) {
          return new java.io.File(str(fname)).length();
        }
      };
    println(makeDataset(vector(hashmap("file-type", "gzipped csv",
                                       "length", fileLen.invoke("simulation.csv.gz")),
                               hashmap("file-type", "nippy",
                                       "length", fileLen.invoke("simulation.nippy")),
                               hashmap("file-type", "arrow file",
                                       "length", fileLen.invoke("simulation.arrow")),
                               hashmap("file-type", "arrow file compressed",
                                       "length", fileLen.invoke("simulation-compressed.arrow")),
                               hashmap("file-type", "parquet",
                                       "length", fileLen.invoke("simulation.parquet")))));
    // _unnamed [5 2]:

    //| file-type | length |
    //|-----------------------|---------:|
    //| gzipped csv | 5903963 |
    //| nippy | 5688556 |
    //| arrow file | 10501378 |
    //| arrow file compressed | 3869554 |
    //| parquet | 3396383 |


    // If we load clojure.core.async - which neanderthal does - or we use
    // clojure.core/pmap then we have to shut down agents or else we get a 1 minute hang
    // on shutdown.
    shutdownAgents();
  }
}
+13
@@ -0,0 +1,13 @@
<configuration debug="false">
  <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
    <!-- encoders are assigned the type
         ch.qos.logback.classic.encoder.PatternLayoutEncoder by default -->
    <encoder>
      <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
    </encoder>
  </appender>

  <root level="info">
    <appender-ref ref="STDOUT" />
  </root>
</configuration>
+7
@@ -0,0 +1,7 @@
#!/bin/bash

mkdir -p classes

rm -rf classes/*

javac -classpath "$(clj -Spath)" -d classes -Xlint:unchecked -sourcepath java $(find java -name '*.java')
+3
@@ -0,0 +1,3 @@
#!/bin/bash

cd .. && lein install && cd java_test && scripts/run
+4
@@ -0,0 +1,4 @@
#!/bin/bash

scripts/compile
java -cp "$(clj -Spath)" jtest.TMDDemo