init research

This commit is contained in:
2026-02-08 11:20:43 -10:00
commit bdf064f54d
3041 changed files with 1592200 additions and 0 deletions
+3
View File
@@ -0,0 +1,3 @@
#!/bin/bash
# Point GRAALVM_HOME at the locally unpacked GraalVM (see the graalvm
# download script, which symlinks the distribution to ./graalvm).
# NOTE(review): the export only affects this script's own process unless the
# file is sourced (`. scripts/...`) — presumably meant to be sourced; confirm.
export GRAALVM_HOME="$(pwd)/graalvm"
+22
View File
@@ -0,0 +1,22 @@
# Scratch script for inspecting / regenerating the "alldtypes" Arrow test
# fixtures. The commented-out section was used to derive the feather
# variants (plain, zstd-compressed, v1) from the IPC stream fixture; it is
# kept for reference so the fixtures can be regenerated the same way.
import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather
# with pa.ipc.open_stream("test/data/alldtypes.arrow-ipc") as reader:
# df = reader.read_pandas()
# print(df)
# feather.write_feather(df, "test/data/alldtypes.arrow-feather")
# feather.write_feather(df, "test/data/alldtypes.arrow-feather-compressed", compression='zstd')
# df = df.drop(columns=["local_times"])
# feather.write_feather(df, "test/data/alldtypes.arrow-feather-v1", version=1)
# Sanity check: read the zstd-compressed Arrow file fixture back as a
# DataFrame and dump it.
with pa.ipc.open_file("test/data/alldtypes.arrow-file-zstd") as reader:
    df = reader.read_pandas()
    print(df)
+8
View File
@@ -0,0 +1,8 @@
"""Generate test/data/arrow_bytes.arrow: a fixture with a binary column."""
import pandas as pd
import pyarrow.feather as ft

# Three identical ELF-magic byte strings alongside an integer index column.
ELF_HEADER = b'\x7f\x45\x4c\x46\x01\x01\x01\x00'
frame = pd.DataFrame({
    'idx': [0, 1, 2],
    'bytedata': [ELF_HEADER, ELF_HEADER, ELF_HEADER],
})
frame.to_feather("test/data/arrow_bytes.arrow")
+17
View File
@@ -0,0 +1,17 @@
"""Generate test/data/bigdec.arrow: a single decimal128(5, 2) column.

Writes three values through the Arrow IPC file format, then memory-maps the
file back and prints the column as a round-trip sanity check.
"""
import pyarrow as pa
import uuid as uuid

schema = pa.schema([pa.field('id', pa.decimal128(5, 2))])
data = [1, 0, 2]  # plain ints; pyarrow promotes them to decimal128 values

# Fix: the original also built an unused `table = pa.Table.from_arrays(...)`
# before writing — dead code, removed.
with pa.OSFile('test/data/bigdec.arrow', 'wb') as sink:
    with pa.ipc.new_file(sink, schema=schema) as writer:
        batch = pa.record_batch([data], schema=schema)
        writer.write(batch)

# Round-trip: re-open what we just wrote and print the decimal column.
with pa.memory_map('test/data/bigdec.arrow', 'r') as source:
    loaded_arrays = pa.ipc.open_file(source).read_all()
    print(loaded_arrays[0])
+9
View File
@@ -0,0 +1,9 @@
"""Generate test/data/arrow_list.arrow: a fixture with list-typed columns."""
import pandas as pd
import pyarrow.feather as ft

# Each row pairs a list of class labels with a parallel list of confidences.
columns = {
    'idx': [0, 1, 2],
    'class-name': [['dog', 'car'], ['dog', 'flower'], ['car', 'flower']],
    'confidence': [[0.8, 0.3], [0.75, 0.85], [0.46, 0.84]],
}
pd.DataFrame(columns).to_feather("test/data/arrow_list.arrow")
+12
View File
@@ -0,0 +1,12 @@
#!/usr/bin/python
"""Generate test/data/withnullcol.arrow: a zstd-compressed feather v2 file
whose middle column has the Arrow null type."""
import pyarrow as pa
import pyarrow.feather as feather

fields = [
    pa.field('year', pa.int64()),
    pa.field('nullcol', pa.null()),
    pa.field('day', pa.int64()),
]
my_schema = pa.schema(fields)

# One row; the null-typed column can only ever hold None.
rows = [{'year': 2020, 'nullcol': None, 'day': 24}]
table = pa.Table.from_pylist(rows, schema=my_schema)
feather.write_feather(table, "test/data/withnullcol.arrow",
                      compression="zstd", version=2)
+12
View File
@@ -0,0 +1,12 @@
"""Generate test/data/uuid_ext.arrow: one column of the Arrow uuid
extension type, written via the IPC file format."""
import pyarrow as pa
import uuid as uuid

schema = pa.schema([pa.field('id', pa.uuid())])

# The uuid extension type stores the raw 16-byte representation.
uuid_strings = [
    "8be643d6-0df7-4e5e-837c-f94170c87914",
    "24bc9cf4-e2e8-444f-bb2d-82394f33ff76",
    "e8149e1b-aef6-4671-b1b4-3b7a21eed92a",
]
data = [uuid.UUID(s).bytes for s in uuid_strings]

with pa.OSFile('test/data/uuid_ext.arrow', 'wb') as sink:
    with pa.ipc.new_file(sink, schema=schema) as writer:
        writer.write(pa.record_batch([data], schema=schema))
+8
View File
@@ -0,0 +1,8 @@
#!/usr/bin/Rscript
# Generate 10m.arrow: an IPC stream whose single column repeats one long
# string ten million times (a large-file stress fixture).
library(arrow)
library(dplyr)
# Fix: data_frame() is deprecated in dplyr; tibble() (re-exported by dplyr)
# is the drop-in replacement with identical behavior here.
df=tibble(texts=rep("A character vector containing abbreviations for the character strings in its first argument. Duplicates in the original names.arg will be given identical abbreviations. If any non-duplicated elements have the same minlength abbreviations then, if method = both.sides the basic internal abbreviate() algorithm is applied to the characterwise reversed strings; if there are still duplicated abbreviations and if strict = FALSE as by default, minlength is incremented by one and new abbreviations are found for those elements only. This process is repeated until all unique elements of names.arg have unique abbreviations.",10000000)
)
arrow::write_ipc_stream(df,"10m.arrow")
+5
View File
@@ -0,0 +1,5 @@
#!/bin/bash
# Build the API documentation with codox (the :dev alias supplies the
# sources and dependencies codox needs).
set -e
clojure -A:dev -X:codox
+3
View File
@@ -0,0 +1,3 @@
#!/bin/bash
# Static analysis: lint the Clojure sources under src with clj-kondo.
clj-kondo --lint src
+8
View File
@@ -0,0 +1,8 @@
#!/usr/bin/env bash
set -eu
# AOT-compile the project from a clean target/classes. The optional first
# argument selects the JDK build alias (defaults to jdk-8),
# e.g. `scripts/compile jdk-17`.
jdk_profile="${1:-jdk-8}"
rm -rf target/classes
clojure -T:"${jdk_profile}":build compile
+10
View File
@@ -0,0 +1,10 @@
#!/bin/bash
# Release pipeline: run the test suite, rebuild the jar and pom, regenerate
# the docs, then deploy to the remote repository. Aborts on first failure.
set -e
scripts/run-tests-m1
rm -rf pom.xml
clojure -T:build jar
# The build writes the canonical pom under target/; copy it to the repo root
# where the deploy step expects it.
cp target/classes/META-INF/maven/techascent/tech.ml.dataset/pom.xml .
scripts/build-docs
clojure -X:deploy
+3
View File
@@ -0,0 +1,3 @@
"""Generate test/data/empty.arrow: a zero-row feather fixture."""
import pandas as pd

# reset_index() materializes the RangeIndex as an 'index' column —
# presumably so the written file has at least one column; confirm against
# the consuming tests.
empty_frame = pd.DataFrame().reset_index()
empty_frame.to_feather("test/data/empty.arrow")
+14
View File
@@ -0,0 +1,14 @@
#!/bin/bash
# Fetch a local OpenJDK 21.0.2 (macOS aarch64) next to the repo and put it
# on PATH / JAVA_HOME.
# NOTE(review): the exports only take effect if this script is sourced, not
# executed — confirm callers `source` it.
VERSION="21.0.2"
if [ ! -e jdk-$VERSION ]; then
    echo "Downloading JDK $VERSION"
    wget https://download.java.net/java/GA/jdk21.0.2/f2283984656d49d69e91c558476027ac/13/GPL/openjdk-21.0.2_macos-aarch64_bin.tar.gz
    # NOTE(review): the macOS archive typically unpacks to jdk-21.0.2.jdk/,
    # which would not satisfy the `-e jdk-21.0.2` check above — verify the
    # extracted directory name.
    tar -xvzf openjdk-21.0.2_macos-aarch64_bin.tar.gz
    rm openjdk-21.0.2_macos-aarch64_bin.tar.gz
fi
export PATH=$(pwd)/jdk-$VERSION/bin:$PATH
export JAVA_HOME=$(pwd)/jdk-$VERSION/
+15
View File
@@ -0,0 +1,15 @@
#!/usr/bin/env bash
# Download and unpack the Ames house-prices dataset into test/data.
# Fix: abort on download/unzip failure (consistent with the repo's other
# scripts) instead of continuing and deleting the zip regardless.
set -e

DATA_DIR=test/data/ames-house-prices
mkdir -p "$DATA_DIR"
wget https://s3.us-east-2.amazonaws.com/tech.public.data/house-prices-advanced-regression-techniques.zip
unzip -o house-prices-advanced-regression-techniques.zip -d "$DATA_DIR"
# Of course the files have incorrect permissions...
# Fix: `chmod 644 $(find ...)` word-splits paths and can overflow the arg
# list; let find invoke chmod itself.
find "$DATA_DIR" -type f -exec chmod 644 {} +
rm house-prices-advanced-regression-techniques.zip
+7
View File
@@ -0,0 +1,7 @@
#!/bin/bash
# Fetch GraalVM CE (Java 8, linux-amd64), symlink the unpacked distribution
# to ./graalvm, and install the native-image component via gu.
wget https://github.com/graalvm/graalvm-ce-builds/releases/download/vm-20.2.0/graalvm-ce-java8-linux-amd64-20.2.0.tar.gz
tar -xvzf graalvm-ce-java8-linux-amd64-20.2.0.tar.gz
ln -s "$(pwd)/graalvm-ce-java8-20.2.0" "$(pwd)/graalvm"
rm graalvm-ce-java8-linux-amd64-20.2.0.tar.gz
graalvm/bin/gu install native-image
+15
View File
@@ -0,0 +1,15 @@
#!/bin/bash
# Download the ursa-labs NYC taxi parquet files for 2009-01 through 2019-12
# into test/data/nyc-taxi/, one file per year-month.
mkdir -p test/data/nyc-taxi
for YEAR in $(seq 2009 2019); do
    for MONTH in 01 02 03 04 05 06 07 08 09 10 11 12; do
        wget "https://ursa-labs-taxi-data.s3.us-east-2.amazonaws.com/$YEAR/$MONTH/data.parquet" -O "test/data/nyc-taxi/$YEAR-$MONTH-data.parquet"
    done
done
+9
View File
@@ -0,0 +1,9 @@
#!/bin/bash
# Build the jar and install it into the local Maven repository.
set -e
# scripts/run-tests
clojure -T:build jar
# The build writes the canonical pom under target/; copy it to the repo root
# where the install step expects it.
cp target/classes/META-INF/maven/techascent/tech.ml.dataset/pom.xml .
clojure -X:install
+16
View File
@@ -0,0 +1,16 @@
#!/bin/bash
# Generate javadoc for the public Java API using the markdown doclet
# (downloaded on first run), injecting a Google Analytics snippet into
# every page header.
if [ ! -e markdown-doclet/markdown-doclet-1.4-all.jar ]; then
    mkdir -p markdown-doclet
    cd markdown-doclet
    wget https://repo1.maven.org/maven2/ch/raffael/markdown-doclet/markdown-doclet/1.4/markdown-doclet-1.4-all.jar
    cd ../
fi
# Fix: the analytics header emitted broken HTML — the async <script> tag was
# missing its closing '>', and the inline gtag <script> was never terminated
# with </script>. Both are corrected in the -header value below.
javadoc -sourcepath java_public_api -d docs/javadoc -Xdoclint:none -classpath "$(lein classpath):$(pwd)/target/classes" \
    -doclet ch.raffael.mddoclet.MarkdownDoclet -docletpath markdown-doclet/markdown-doclet-1.4-all.jar \
    -doctitle "tech.ml.dataset Documentation" \
    -windowtitle "tech.ml.dataset Documentation" \
    --allow-script-in-comments \
    -header "<script async src=\"https://www.googletagmanager.com/gtag/js?id=G-RGTB4J7LGP\"></script><script>window.dataLayer = window.dataLayer || [];\nfunction gtag(){dataLayer.push(arguments);}\ngtag('js', new Date());\ngtag('config', 'G-RGTB4J7LGP');</script>" \
    tech.v3 tech.v3.dataset tech.v3.libs
+27
View File
@@ -0,0 +1,27 @@
#!/usr/bin/python3
"""Generate test/data/nested.parquet: two map<string, struct<weight, temp>>
columns, with some rows missing one of the maps."""
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa

# Arrow map columns round-trip through pandas as lists of (key, value) pairs.
rows = [{"id": 1,
         "val":[("a",{"weight":41.5, "temp":36.1}),
                ("b",{"weight":31.6,"temp":34.5})],
         "val2":[("va", {"weight":2, "temp":3}),
                 ("vb", {"weight":3, "temp":4})]},
        {"id": 2,
         "val":[("a",{"weight":11.5, "temp":22.1}),
                ("b",{"weight":31.6,"temp":34.5})]},
        {"id": 3,
         "val":[("a",{"weight":22.5,"temp":33.1}),
                ("b",{"weight":33.6, "temp":44.5}),
                ("c",{"weight":44.6, "temp":55.5})],
         "val2":[("vb", {"weight":5, "temp":10})]
        }]

# Both map columns share the same struct-valued entry type.
entry_type = pa.struct([pa.field("weight", pa.float32()),
                        pa.field("temp", pa.float32())])
map_type = pa.map_(pa.string(), entry_type)
schema = pa.schema([pa.field('id', pa.int32()),
                    pa.field('val', map_type),
                    pa.field("val2", map_type)])
print(schema)

frame = pd.DataFrame(rows)
arrow_table = pa.Table.from_pandas(frame, schema)
pq.write_table(arrow_table, 'test/data/nested.parquet')
+6
View File
@@ -0,0 +1,6 @@
"""Generate test/data/decimaltable.parquet with a decimal128(12, 9) column."""
import pyarrow.parquet as pq
import pyarrow as pa

table = pa.Table.from_pydict({'name': ['sample1', 'sample2'],
                              'decimals': [3.4199, 1.2455]})
# Recast the float column to fixed-precision decimal before writing.
decimal_col = table.column('decimals').cast(pa.decimal128(12, 9))
table = table.set_column(1, 'decimals', decimal_col)
pq.write_table(table, 'test/data/decimaltable.parquet')
+24
View File
@@ -0,0 +1,24 @@
#!/usr/bin/env bash
# Detect the installed JDK major version, compile with the matching alias,
# regenerate generated code, and run the test suites.
set -eu
# Third field of the `java -version` banner, quotes included — e.g. "1.8.0_292"
# or "17.0.9".
java_version=$(java -version 2>&1 | awk '/version/ {print $3}')
jdk_profile=""
case "${java_version}" in
    \"1.8*)
        jdk_profile="jdk-8"
        ;;
    \"11*)
        jdk_profile="jdk-11"
        ;;
    \"17*)
        jdk_profile="jdk-17"
        ;;
esac
# NOTE(review): any other version (e.g. 21) leaves jdk_profile empty, which
# yields aliases like `-M:dev::test` below — confirm that degenerate alias
# chain is intended as a fallback.
echo "java version detected ${java_version} -- jdk profile detected: ${jdk_profile}"
scripts/compile "${jdk_profile}"
clojure -X:dev:codegen
clojure -M:dev:"${jdk_profile}":test --dir test --dir neanderthal
+5
View File
@@ -0,0 +1,5 @@
#!/bin/bash
# Apple-silicon variant of run-tests: compile, regenerate generated code,
# and run the tests using the dev-mac-m1 alias.
scripts/compile
clojure -X:dev-mac-m1:codegen
clojure -M:dev-mac-m1:test
+9
View File
@@ -0,0 +1,9 @@
#!/usr/bin/Rscript
# Generate test/data/uuid.arrow: an IPC stream whose single column contains
# a UUID string repeated ten times.
library(dplyr)
library(arrow)
library(uuid)
# Fix: data_frame() is deprecated in dplyr; tibble() is the drop-in
# replacement with identical behavior here.
# NOTE(review): rep(UUIDgenerate(), 10) repeats ONE generated uuid ten
# times; UUIDgenerate(n = 10) would give ten distinct ones — confirm intent.
df=tibble(uuids=rep(UUIDgenerate(), 10))
arrow::write_ipc_stream(df, "test/data/uuid.arrow")
+9
View File
@@ -0,0 +1,9 @@
#!/usr/bin/Rscript
# Generate uuid.parquet: the same repeated-uuid column as the Arrow-stream
# variant, written as parquet (note: into the current directory, not
# test/data — matching the original script).
library(dplyr)
library(arrow)
library(uuid)
# Fix: data_frame() is deprecated in dplyr; tibble() is the drop-in
# replacement with identical behavior here.
df=tibble(uuids=rep(UUIDgenerate(), 10))
write_parquet(df, "uuid.parquet")