init research

This commit is contained in:
2026-02-08 11:20:43 -10:00
commit bdf064f54d
3041 changed files with 1592200 additions and 0 deletions
+93
View File
@@ -0,0 +1,93 @@
package tech.v3.dataset;
import clojure.lang.RT;
import clojure.lang.IDeref;
import java.util.function.LongConsumer;
import ham_fisted.ArrayLists;
import ham_fisted.IMutList;
import org.roaringbitmap.RoaringBitmap;
public class ByteValidity {
/**
 * Drop trailing entries of {@code indexes} that fall at or beyond {@code maxIdx}.
 * Index values are interpreted as unsigned 32-bit integers.
 *
 * @param indexes  sorted index buffer (only the first nIndexes entries are valid)
 * @param nIndexes count of valid entries in the buffer
 * @param maxIdx   exclusive upper bound on index values to keep
 * @return the trimmed count of valid entries
 */
public static int trimIndexes(int[] indexes, int nIndexes, long maxIdx) {
  int n = nIndexes;
  //empty loop intentional
  for (; n > 0 && Integer.toUnsignedLong(indexes[n - 1]) >= maxIdx; --n);
  return n;
}
/**
 * Shared state for validity-bitmask reducers: a fixed-capacity scratch array of
 * indexes plus running counters.  Subclasses implement accept/deref.
 */
public static abstract class ValidityBase implements LongConsumer, IDeref {
  long nElems;    // total number of elements the validity data covers
  int idx;        // index of the next incoming byte/value
  int nIndexes;   // number of entries currently filled in 'indexes'
  int[] indexes;  // scratch buffer of recorded indexes (capacity maxIndexes)

  /**
   * @param nElems     total element count covered by the validity data
   * @param maxIndexes capacity to allocate for the index scratch buffer
   */
  public ValidityBase(long nElems, long maxIndexes) {
    this.nElems = nElems;
    this.idx = 0;
    this.nIndexes = 0;
    this.indexes = new int[(int) maxIndexes];
  }
}
/**
 * Consumes validity bytes (one bit per element, LSB-first) and records the
 * index of every SET bit into an IMutList.  deref() returns the recorded
 * indexes trimmed to maxIdx.
 */
public static class ValidityIndexReducer implements LongConsumer, IDeref {
  IMutList indexes;
  public final long maxIdx;
  int idx;

  public ValidityIndexReducer(IMutList indexes, long maxIdx) {
    this.indexes = indexes;
    this.maxIdx = maxIdx;
    this.idx = 0;
  }

  /** Count of recorded indexes after dropping trailing entries >= maxIdx. */
  public int trimIndexes() {
    int n = indexes.size();
    while (n > 0 && indexes.getLong(n - 1) >= maxIdx)
      --n;
    return n;
  }

  /** Record the element index of each set bit in the low 8 bits of value. */
  public void accept(long value) {
    if (value != 0) {
      final int bits = (int) value;
      final int base = idx * 8;
      for (int bit = 0; bit < 8; ++bit) {
        if (((bits >> bit) & 1) != 0)
          indexes.addLong(base + bit);
      }
    }
    ++idx;
  }

  /** @return the recorded index list, trimmed to the valid range. */
  public Object deref() {
    return indexes.subList(0, trimIndexes());
  }
}
/**
 * Consumes validity bytes (one bit per element, LSB-first) and records the
 * index of every CLEAR bit -- i.e. each missing element.  deref() returns the
 * missing set as a RoaringBitmap, trimmed to nElems.
 */
public static class MissingIndexReducer extends ValidityBase {
  public MissingIndexReducer(long nElems, long maxIndexes) {
    super(nElems, maxIndexes);
  }

  /** Record the element index of each clear bit in the low 8 bits of value. */
  public void accept(long value) {
    // -1 means all 8 bits set -- nothing missing in this byte.
    if (value != -1) {
      final int bits = (int) value;
      final int base = idx * 8;
      for (int bit = 0; bit < 8; ++bit) {
        if (((bits >> bit) & 1) == 0)
          indexes[nIndexes++] = base + bit;
      }
    }
    ++idx;
  }

  /** @return RoaringBitmap of missing indexes, trimmed to nElems. */
  public Object deref() {
    RoaringBitmap rb = new RoaringBitmap();
    rb.addN(indexes, 0, trimIndexes(indexes, nIndexes, nElems));
    return rb;
  }
}
}
+65
View File
@@ -0,0 +1,65 @@
package tech.v3.dataset;
import tech.v3.datatype.IndexConsumer;
import tech.v3.datatype.ECount;
import ham_fisted.Reducible;
import ham_fisted.IMutList;
import org.roaringbitmap.RoaringBitmap;
import clojure.lang.IDeref;
import clojure.lang.IFn;
import clojure.lang.Keyword;
import clojure.lang.PersistentArrayMap;
/**
 * Column parser for integer-typed values.  Accumulates parsed values into an
 * IndexConsumer, tracks missing rows in a RoaringBitmap, and derefs to a
 * {data, missing, name} map.
 */
public class IntColParser
  implements IDeref, PParser, ECount {
  public final IndexConsumer data;     // accumulates accepted integer values
  public final RoaringBitmap missing;  // row indexes with no value
  public final Object colname;
  long lastidx;                        // one past the last row index seen

  public IntColParser(IFn rangeFn, IMutList dlist, Object colname) {
    data = new IndexConsumer(rangeFn, dlist);
    missing = new RoaringBitmap();
    this.colname = colname;
  }

  /** Number of rows observed so far. */
  public long lsize() { return lastidx; }

  /** Mark any rows skipped since the previous row as missing, then advance. */
  public void addMissing(long idx) {
    if (lastidx < idx) {
      missing.add(lastidx, idx);  // range add, end-exclusive
    }
    lastidx = idx + 1;
  }

  /**
   * Record the value for row idx.  null marks the row missing; non-integer
   * values throw.
   */
  public void addValue(long idx, Object val) {
    addMissing(idx);
    if (val == null) {
      missing.add((int) idx);
      return;
    }
    if (val instanceof Long) {
      data.accept((Long) val);
    } else if (val instanceof Integer) {
      data.accept((Integer) val);
    } else if (val instanceof Short) {
      data.accept((Short) val);
    } else if (val instanceof Byte) {
      data.accept((Byte) val);
    } else {
      throw new RuntimeException("Value " + String.valueOf(val) + " is not an integer value");
    }
  }

  /** @return a map of data, missing and name keyed in the tech.v3.dataset namespace. */
  public Object deref() {
    return new PersistentArrayMap(new Object[] {
        Keyword.intern("tech.v3.dataset", "data"), data.deref(),
        Keyword.intern("tech.v3.dataset", "missing"), missing,
        Keyword.intern("tech.v3.dataset", "name"), colname});
  }

  /** Pad missing out to rowcount and return the deref'd column map. */
  public Object finalize(long rowcount) {
    addMissing(rowcount);
    return deref();
  }
}
+193
View File
@@ -0,0 +1,193 @@
/*
* Copyright (c) 2010-2020 Haifeng Li. All rights reserved.
*
* Smile is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* Smile is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Smile. If not, see <https://www.gnu.org/licenses/>.
*/
package tech.v3.dataset;
import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.file.Path;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.io.SeekableInputStream;
/**
* Parquet InputFile with a local java.nio.Path.
* Adapted from https://github.com/tideworks/arvo2parquet
*
* @author Haifeng Li
* @hacks Chris Nuernberger
*/
public class LocalInputFile implements InputFile {
  /** Local file object.  Shared by every stream returned from newStream(). */
  private final RandomAccessFile input;

  /**
   * Constructor.
   * @param path the input file path.
   * @throws FileNotFoundException when file cannot be found.
   */
  public LocalInputFile(Path path) throws FileNotFoundException {
    input = new RandomAccessFile(path.toFile(), "r");
  }

  /** @return total length of the underlying file in bytes. */
  @Override
  public long getLength() throws IOException {
    return input.length();
  }

  /**
   * Open a seekable stream over the file.
   *
   * NOTE(review): every stream returned here delegates to the one shared
   * RandomAccessFile, so concurrent streams share a single file position and
   * closing any stream closes the file for all of them.  Presumably callers
   * use one stream at a time -- confirm before reusing this class elsewhere.
   */
  @Override
  public SeekableInputStream newStream() throws IOException {
    return new SeekableInputStream() {
      // Scratch buffer used to copy bytes into (possibly direct) ByteBuffers.
      private final byte[] page = new byte[8192];
      // File position recorded by mark(); reset() seeks back here.
      private long markPos = 0;
      @Override
      public int read() throws IOException {
        return input.read();
      }
      @Override
      public int read(byte[] b) throws IOException {
        return input.read(b);
      }
      @Override
      public int read(byte[] b, int off, int len) throws IOException {
        return input.read(b, off, len);
      }
      /** Skip forward by up to n bytes, clamped at end-of-file; returns bytes skipped. */
      @Override
      public long skip(long n) throws IOException {
        final long savPos = input.getFilePointer();
        final long amtLeft = input.length() - savPos;
        n = Math.min(n, amtLeft);
        final long newPos = savPos + n;
        input.seek(newPos);
        final long curPos = input.getFilePointer();
        return curPos - savPos;
      }
      /** Always reports 0 -- no non-blocking availability estimate is provided. */
      @Override
      public int available() {
        return 0;
      }
      /** Closes the shared RandomAccessFile, invalidating this InputFile. */
      @Override
      public void close() throws IOException {
        input.close();
      }
      // Sneaky-throw: rethrows a checked Throwable as if unchecked via generic
      // erasure.  Used so mark(int), whose signature cannot declare
      // IOException, can still propagate one from getFilePointer().
      @SuppressWarnings({"unchecked", "unused", "UnusedReturnValue"})
      private <T extends Throwable, R> R uncheckedExceptionThrow(Throwable t) throws T {
        throw (T) t;
      }
      @Override
      public synchronized void mark(int readlimit) {
        try {
          markPos = input.getFilePointer();
        } catch (IOException e) {
          uncheckedExceptionThrow(e);
        }
      }
      /** Seek back to the position recorded by the last mark() (0 if never marked). */
      @Override
      public synchronized void reset() throws IOException {
        input.seek(markPos);
      }
      @Override
      public boolean markSupported() {
        return true;
      }
      @Override
      public long getPos() throws IOException {
        return input.getFilePointer();
      }
      @Override
      public void seek(long l) throws IOException {
        input.seek(l);
      }
      @Override
      public void readFully(byte[] bytes) throws IOException {
        input.readFully(bytes);
      }
      @Override
      public void readFully(byte[] bytes, int i, int i1) throws IOException {
        input.readFully(bytes, i, i1);
      }
      /** Read as many bytes as possible into byteBuffer; -1 at end-of-file. */
      @Override
      public int read(ByteBuffer byteBuffer) throws IOException {
        return readDirectBuffer(byteBuffer, page, input::read);
      }
      /** Fill byteBuffer completely or throw EOFException. */
      @Override
      public void readFully(ByteBuffer byteBuffer) throws IOException {
        readFullyDirectBuffer(byteBuffer, page, input::read);
      }
    };
  }

  /** Adapter for the 3-arg read method used by the buffer-copy helpers below. */
  private interface ByteBufferReader {
    int read(byte[] b, int off, int len) throws IOException;
  }

  /**
   * Copy bytes from reader into byteBuffer via the page scratch array.
   * Returns total bytes read, or -1 when the reader is at end-of-file and
   * nothing was read.
   */
  private int readDirectBuffer(ByteBuffer byteBuffer, byte[] page, ByteBufferReader reader) throws IOException {
    // copy all the bytes that return immediately, stopping at the first
    // read that doesn't return a full buffer.
    int nextReadLength = Math.min(byteBuffer.remaining(), page.length);
    int totalBytesRead = 0;
    int bytesRead;
    while ((bytesRead = reader.read(page, 0, nextReadLength)) == page.length) {
      byteBuffer.put(page);
      totalBytesRead += bytesRead;
      nextReadLength = Math.min(byteBuffer.remaining(), page.length);
    }
    if (bytesRead < 0) {
      // return -1 if nothing was read
      return totalBytesRead == 0 ? -1 : totalBytesRead;
    } else {
      // copy the last partial buffer
      byteBuffer.put(page, 0, bytesRead);
      totalBytesRead += bytesRead;
      return totalBytesRead;
    }
  }

  /**
   * Fill byteBuffer completely from reader via the page scratch array.
   * @throws EOFException if the reader ends before the buffer is full.
   */
  private static void readFullyDirectBuffer(ByteBuffer byteBuffer, byte[] page, ByteBufferReader reader) throws IOException {
    int nextReadLength = Math.min(byteBuffer.remaining(), page.length);
    int bytesRead = 0;
    while (nextReadLength > 0 && (bytesRead = reader.read(page, 0, nextReadLength)) >= 0) {
      byteBuffer.put(page, 0, bytesRead);
      nextReadLength = Math.min(byteBuffer.remaining(), page.length);
    }
    if (bytesRead < 0 && byteBuffer.remaining() > 0) {
      throw new EOFException("Reached the end of stream with " + byteBuffer.remaining() + " bytes left to read");
    }
  }
}
@@ -0,0 +1,22 @@
package tech.v3.dataset;
import java.io.InputStream;
import java.io.IOException;
/**
 * InputStream wrapper that forwards every operation to the wrapped stream
 * EXCEPT close(), which is a deliberate no-op.  Useful when handing a stream
 * to an API that closes it but the caller wants to keep it open.
 */
public class NoCloseInputStream extends InputStream {
  /** The wrapped stream; all reads are forwarded here. */
  public final InputStream stream;

  public NoCloseInputStream(InputStream wrapped) {
    stream = wrapped;
  }

  /** Intentionally does NOT close the wrapped stream. */
  @Override
  public void close() {}

  @Override
  public int read() throws IOException {
    return stream.read();
  }

  @Override
  public int read(byte[] data) throws IOException {
    return stream.read(data);
  }

  @Override
  public int read(byte[] data, int off, int len) throws IOException {
    return stream.read(data, off, len);
  }

  @Override
  public int available() throws IOException {
    return stream.available();
  }

  @Override
  public void mark(int maxBytes) {
    stream.mark(maxBytes);
  }

  @Override
  public boolean markSupported() {
    return stream.markSupported();
  }

  @Override
  public void reset() throws IOException {
    stream.reset();
  }

  @Override
  public long skip(long n) throws IOException {
    return stream.skip(n);
  }
}
@@ -0,0 +1,20 @@
package tech.v3.dataset;
import java.io.OutputStream;
import java.io.IOException;
/**
 * OutputStream wrapper that forwards every operation to the wrapped stream
 * EXCEPT close(), which is a deliberate no-op.  Useful when handing a stream
 * to an API that closes it but the caller wants to keep it open.
 */
public class NoCloseOutputStream extends OutputStream {
  /** The wrapped stream; all writes are forwarded here. */
  public final OutputStream stream;

  public NoCloseOutputStream(OutputStream os) {
    stream = os;
  }

  /** Intentionally does NOT close (or flush) the wrapped stream. */
  @Override
  public void close() throws IOException {}

  /**
   * Forward flush to the wrapped stream.
   * BUGFIX: this method was previously misspelled "fluse", so flush() fell
   * through to OutputStream's no-op default and flushes were silently dropped.
   */
  @Override
  public void flush() throws IOException { stream.flush(); }

  /** @deprecated misspelling kept for backward compatibility; use {@link #flush()}. */
  @Deprecated
  public void fluse() throws IOException { flush(); }

  @Override
  public void write(byte[] b) throws IOException { stream.write(b); }

  @Override
  public void write(byte[] b, int off, int len) throws IOException {
    stream.write(b, off, len);
  }

  @Override
  public void write(int b) throws IOException { stream.write(b); }
}
+7
View File
@@ -0,0 +1,7 @@
package tech.v3.dataset;
/**
 * Incremental column parser protocol.  Values arrive row by row via addValue;
 * finalize produces the parsed column once the row count is known.
 * See IntColParser in this package for a concrete implementation.
 */
public interface PParser {
  /**
   * Record the value parsed for row idx.  Implementations in this file treat
   * null as a missing value and mark rows skipped since the previous call as
   * missing.
   * @param idx   row index.
   * @param value parsed value; null indicates missing.
   */
  public void addValue(long idx, Object value);
  /**
   * Finish parsing: pad missing data out to rowcount and return the parsed
   * column representation (IntColParser returns a {data, missing, name} map).
   * @param rowcount total number of rows in the source.
   */
  public Object finalize(long rowcount);
}
@@ -0,0 +1,55 @@
package tech.v3.dataset;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.io.api.RecordConsumer;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.api.WriteSupport;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.io.OutputFile;
import clojure.lang.IFn;
import java.util.Map;
/**
 * Parquet WriteSupport that delegates per-row serialization to a Clojure IFn.
 * Records are row indexes (Long); the IFn receives (dataset, row-index,
 * RecordConsumer) and performs the actual field writes.
 */
public class ParquetRowWriter extends WriteSupport<Long>
{
  public final IFn rowWriter;                 // fn of (dataset, row-index, consumer)
  public final MessageType schema;
  public final Map<String,String> metadata;   // extra key/value file metadata
  public RecordConsumer consumer;             // supplied by prepareForWrite
  public Object dataset;                      // must be assigned externally before writing

  public ParquetRowWriter(IFn _writer, MessageType _schema, Map<String,String> _meta) {
    rowWriter = _writer;
    schema = _schema;
    metadata = _meta;
    consumer = null;
    //Outside forces must set dataset
    dataset = null;
  }

  @Override
  public WriteContext init(Configuration configuration) {
    return new WriteContext(schema, metadata);
  }

  @Override
  public void prepareForWrite(RecordConsumer recordConsumer) {
    consumer = recordConsumer;
  }

  /** Write one row by invoking the row-writer fn with the row index. */
  @Override
  public void write(Long record) {
    rowWriter.invoke(dataset, record, consumer);
  }

  /** ParquetWriter.Builder that returns a pre-constructed ParquetRowWriter. */
  public static class WriterBuilder extends ParquetWriter.Builder<Long,WriterBuilder>
  {
    public final ParquetRowWriter writer;

    public WriterBuilder(OutputFile outf, ParquetRowWriter _writer) {
      super(outf);
      writer = _writer;
    }

    @Override
    public WriterBuilder self() { return this; }

    @Override
    protected WriteSupport<Long> getWriteSupport(Configuration conf) {
      return writer;
    }
  }
}
+32
View File
@@ -0,0 +1,32 @@
package tech.v3.dataset;
import clojure.lang.Keyword;
import java.lang.Iterable;
import java.lang.AutoCloseable;
/**
 * Abstraction over spreadsheet-like data sources: a Workbook iterates Sheets,
 * a Sheet iterates Rows, a Row iterates Cells.
 */
public class Spreadsheet
{
  /** An iterable of sheets; must be closed after use. */
  public interface Workbook extends Iterable, AutoCloseable
  {
  }
  /** An iterable of rows. */
  public interface Sheet extends Iterable
  {
    /** Human-readable sheet name. */
    public String name();
    /** Sheet identifier; exact semantics are implementation-defined -- TODO confirm. */
    public String id();
    /** Presumably an identifier stable across edits/reloads -- confirm with implementations. */
    public String stableId();
  }
  /** An iterable of cells. */
  public interface Row extends Iterable
  {
    /** Row number within the sheet (0- vs 1-based is implementation-defined -- confirm). */
    public int getRowNum();
  }
  /** A single cell's value accessors. */
  public interface Cell
  {
    /** True when the cell holds no value. */
    boolean missing();
    /** Column index of this cell within its row. */
    int getColumnNum();
    /** Boxed cell value. */
    Object value();
    /** Cell value coerced to a double. */
    double doubleValue();
    /** Cell value coerced to a boolean. */
    boolean boolValue();
  }
}
+32
View File
@@ -0,0 +1,32 @@
package tech.v3.dataset;
import java.util.Objects;
/**
 * Immutable wrapper around a non-null String, used to mark text-typed column
 * data.  Deliberately compares equal to plain Strings with the same content
 * (note this makes equals asymmetric: "a".equals(new Text("a")) is false).
 * hashCode matches String.hashCode so Text and String hash alike.
 */
public class Text implements Comparable
{
  public final String text;

  public Text(String data) {
    Objects.requireNonNull(data);
    text = data;
  }

  /** Coerce a String or Text to its String content; anything else maps to null. */
  private static String asString(Object other) {
    if (other instanceof String)
      return (String) other;
    if (other instanceof Text)
      return ((Text) other).text;
    return null;
  }

  public String toString() { return text; }

  public int hashCode() { return text.hashCode(); }

  /** Equal to any String or Text with identical content; false otherwise. */
  public boolean equals(Object other) {
    return text.equals(asString(other));
  }

  /** Compares by String content; non-String/Text arguments throw NPE (as before). */
  public int compareTo(Object other) {
    return text.compareTo(asString(other));
  }
}