init research
This commit is contained in:
@@ -0,0 +1,93 @@
|
||||
package tech.v3.dataset;
|
||||
|
||||
import clojure.lang.RT;
|
||||
import clojure.lang.IDeref;
|
||||
import java.util.function.LongConsumer;
|
||||
import ham_fisted.ArrayLists;
|
||||
import ham_fisted.IMutList;
|
||||
import org.roaringbitmap.RoaringBitmap;
|
||||
|
||||
|
||||
public class ByteValidity {
|
||||
public static int trimIndexes(int[] indexes, int nIndexes, long maxIdx) {
|
||||
while(nIndexes > 0) {
|
||||
if(Integer.toUnsignedLong(indexes[nIndexes-1]) >= maxIdx)
|
||||
--nIndexes;
|
||||
else
|
||||
break;
|
||||
}
|
||||
return nIndexes;
|
||||
}
|
||||
public static abstract class ValidityBase implements LongConsumer, IDeref {
|
||||
long nElems;
|
||||
int idx;
|
||||
int nIndexes;
|
||||
int[] indexes;
|
||||
public ValidityBase(long nElems, long maxIndexes) {
|
||||
this.nElems = nElems;
|
||||
indexes = new int[(int)maxIndexes];
|
||||
nIndexes = 0;
|
||||
idx = 0;
|
||||
}
|
||||
}
|
||||
public static class ValidityIndexReducer implements LongConsumer, IDeref {
|
||||
IMutList indexes;
|
||||
public final long maxIdx;
|
||||
int idx;
|
||||
public ValidityIndexReducer(IMutList indexes, long maxIdx) {
|
||||
this.indexes = indexes;
|
||||
this.maxIdx = maxIdx;
|
||||
this.idx = 0;
|
||||
}
|
||||
public int trimIndexes() {
|
||||
int nIndexes = indexes.size();
|
||||
//empty loop intentional
|
||||
for(;nIndexes > 0 && indexes.getLong(nIndexes-1) >= maxIdx; --nIndexes);
|
||||
|
||||
return nIndexes;
|
||||
}
|
||||
public void accept(long value) {
|
||||
if(value != 0) {
|
||||
int intVal = (int)value;
|
||||
int offset = idx * 8;
|
||||
if( (intVal & 1) == 1) indexes.addLong(offset);
|
||||
if( (intVal & 2) == 2) indexes.addLong(offset+1);
|
||||
if( (intVal & 4) == 4) indexes.addLong(offset+2);
|
||||
if( (intVal & 8) == 8) indexes.addLong(offset+3);
|
||||
if( (intVal & 16) == 16) indexes.addLong(offset+4);
|
||||
if( (intVal & 32) == 32) indexes.addLong(offset+5);
|
||||
if( (intVal & 64) == 64) indexes.addLong(offset+6);
|
||||
if( (intVal & 128) == 128) indexes.addLong(offset+7);
|
||||
}
|
||||
++idx;
|
||||
}
|
||||
public Object deref() {
|
||||
return indexes.subList(0, trimIndexes());
|
||||
}
|
||||
}
|
||||
public static class MissingIndexReducer extends ValidityBase {
|
||||
public MissingIndexReducer(long nElems, long maxIndexes) {
|
||||
super(nElems, maxIndexes);
|
||||
}
|
||||
public void accept(long value) {
|
||||
if(value != -1) {
|
||||
int intVal = (int)value;
|
||||
int offset = idx * 8;
|
||||
if( (intVal & 1) != 1) indexes[nIndexes++] = offset;
|
||||
if( (intVal & 2) != 2) indexes[nIndexes++] = offset+1;
|
||||
if( (intVal & 4) != 4) indexes[nIndexes++] = offset+2;
|
||||
if( (intVal & 8) != 8) indexes[nIndexes++] = offset+3;
|
||||
if( (intVal & 16) != 16) indexes[nIndexes++] = offset+4;
|
||||
if( (intVal & 32) != 32) indexes[nIndexes++] = offset+5;
|
||||
if( (intVal & 64) != 64) indexes[nIndexes++] = offset+6;
|
||||
if( (intVal & 128) != 128) indexes[nIndexes++] = offset+7;
|
||||
}
|
||||
++idx;
|
||||
}
|
||||
public Object deref() {
|
||||
RoaringBitmap rb = new RoaringBitmap();
|
||||
rb.addN(indexes, 0, trimIndexes(indexes, nIndexes, nElems));
|
||||
return rb;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,65 @@
|
||||
package tech.v3.dataset;
|
||||
|
||||
|
||||
import tech.v3.datatype.IndexConsumer;
|
||||
import tech.v3.datatype.ECount;
|
||||
import ham_fisted.Reducible;
|
||||
import ham_fisted.IMutList;
|
||||
import org.roaringbitmap.RoaringBitmap;
|
||||
import clojure.lang.IDeref;
|
||||
import clojure.lang.IFn;
|
||||
import clojure.lang.Keyword;
|
||||
import clojure.lang.PersistentArrayMap;
|
||||
|
||||
/**
 * Column parser for integer data.  Values are appended to an IndexConsumer
 * while gaps in the row-index sequence (and explicit nulls) are tracked in a
 * RoaringBitmap of missing rows.  deref() produces the standard
 * {:tech.v3.dataset/data, :tech.v3.dataset/missing, :tech.v3.dataset/name}
 * column map.
 */
public class IntColParser
  implements IDeref, PParser, ECount {

  /** Consumer the parsed integer values are appended to. */
  public final IndexConsumer data;
  /** Row indexes that had no value or an explicit null. */
  public final RoaringBitmap missing;
  /** Column name placed in the deref'd column map. */
  public final Object colname;
  // One past the highest row index seen; doubles as the row count.
  long lastidx;

  public IntColParser(IFn rangeFn, IMutList dlist, Object colname) {
    data = new IndexConsumer(rangeFn, dlist);
    missing = new RoaringBitmap();
    this.colname = colname;
  }

  /** Row count processed so far (ECount). */
  public long lsize() { return lastidx; }

  /**
   * Mark every skipped row in [lastidx, idx) as missing, then advance
   * lastidx past idx.  Called both per-value and from finalize.
   */
  public void addMissing(long idx) {
    if(lastidx < idx) {
      // Range add [lastidx, idx) — rows that never received addValue.
      missing.add(lastidx, idx);
    }
    lastidx = idx+1;
  }

  /**
   * Record the value for row idx.  Accepts any boxed integral type; null
   * marks the row missing; any other type throws.
   * NOTE(review): missing.add((int)idx) truncates the row index to 32 bits —
   * presumably datasets stay below Integer.MAX_VALUE rows; confirm upstream.
   */
  public void addValue(long idx, Object val) {
    addMissing(idx);

    if (val instanceof Long)
      data.accept((Long)val);
    else if (val instanceof Integer)
      data.accept((Integer)val);
    else if (val instanceof Short)
      data.accept((Short)val);
    else if (val instanceof Byte)
      data.accept((Byte)val);
    else if (val == null)
      missing.add((int)idx);
    else
      throw new RuntimeException("Value " + String.valueOf(val) + " is not an integer value");
  }

  /** Build the column map from the current parse state. */
  public Object deref() {
    return new PersistentArrayMap(new Object[] {
      Keyword.intern("tech.v3.dataset", "data"), data.deref(),
      Keyword.intern("tech.v3.dataset", "missing"), missing,
      Keyword.intern("tech.v3.dataset", "name"), colname});
  }

  /**
   * Pad trailing unseen rows out to rowcount as missing and return the
   * finished column map (PParser).
   */
  public Object finalize(long rowcount) {
    addMissing(rowcount);
    return deref();
  }
}
|
||||
@@ -0,0 +1,193 @@
|
||||
/*
|
||||
* Copyright (c) 2010-2020 Haifeng Li. All rights reserved.
|
||||
*
|
||||
* Smile is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of
|
||||
* the License, or (at your option) any later version.
|
||||
*
|
||||
* Smile is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Smile. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package tech.v3.dataset;
|
||||
|
||||
import java.io.EOFException;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.RandomAccessFile;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.file.Path;
|
||||
import org.apache.parquet.io.InputFile;
|
||||
import org.apache.parquet.io.SeekableInputStream;
|
||||
|
||||
/**
|
||||
* Parquet InputFile with a local java.nio.Path.
|
||||
* Adapted from https://github.com/tideworks/arvo2parquet
|
||||
*
|
||||
* @author Haifeng Li
|
||||
* @hacks Chris Nuernberger
|
||||
*/
|
||||
public class LocalInputFile implements InputFile {
  /**
   * Local file object.  All streams created by newStream() delegate to this
   * single RandomAccessFile and therefore share one file pointer.
   */
  private final RandomAccessFile input;

  /**
   * Constructor.
   * @param path the input file path.
   * @throws FileNotFoundException when file cannot be found.
   */
  public LocalInputFile(Path path) throws FileNotFoundException {
    input = new RandomAccessFile(path.toFile(), "r");
  }

  /** @return the file's total length in bytes. */
  @Override
  public long getLength() throws IOException {
    return input.length();
  }

  /**
   * Create a SeekableInputStream view of the file.
   * NOTE(review): every returned stream shares the RandomAccessFile's file
   * pointer, so two concurrently-used streams would corrupt each other's
   * position — confirm callers use only one stream at a time.
   */
  @Override
  public SeekableInputStream newStream() throws IOException {
    return new SeekableInputStream() {
      // Scratch buffer for ByteBuffer reads — RandomAccessFile cannot fill a
      // (possibly direct) ByteBuffer itself, so bytes are staged here.
      private final byte[] page = new byte[8192];
      // File position saved by mark(), restored by reset().
      private long markPos = 0;

      /** Read one byte; -1 at end of file. */
      @Override
      public int read() throws IOException {
        return input.read();
      }

      /** Read up to b.length bytes; returns count read or -1 at EOF. */
      @Override
      public int read(byte[] b) throws IOException {
        return input.read(b);
      }

      /** Read up to len bytes into b at off; returns count read or -1 at EOF. */
      @Override
      public int read(byte[] b, int off, int len) throws IOException {
        return input.read(b, off, len);
      }

      /**
       * Skip forward up to n bytes, clamped to the bytes remaining in the
       * file; returns the number actually skipped.
       * NOTE(review): a negative n is not clamped and would seek backwards —
       * confirm callers never pass negative values.
       */
      @Override
      public long skip(long n) throws IOException {
        final long savPos = input.getFilePointer();
        final long amtLeft = input.length() - savPos;
        n = Math.min(n, amtLeft);
        final long newPos = savPos + n;
        input.seek(newPos);
        final long curPos = input.getFilePointer();
        return curPos - savPos;
      }

      /** Always reports 0 available (no non-blocking estimate is offered). */
      @Override
      public int available() {
        return 0;
      }

      /** Closes the SHARED RandomAccessFile, not just this stream view. */
      @Override
      public void close() throws IOException {
        input.close();
      }

      // Sneaky-throw helper: rethrows a checked Throwable as unchecked so
      // mark(int) below can surface an IOException despite its signature.
      @SuppressWarnings({"unchecked", "unused", "UnusedReturnValue"})
      private <T extends Throwable, R> R uncheckedExceptionThrow(Throwable t) throws T {
        throw (T) t;
      }

      /**
       * Remember the current file position.  readlimit is ignored — the mark
       * never becomes invalid because reset() simply seeks.
       */
      @Override
      public synchronized void mark(int readlimit) {
        try {
          markPos = input.getFilePointer();
        } catch (IOException e) {
          // mark(int) cannot declare IOException; rethrow it unchecked.
          uncheckedExceptionThrow(e);
        }
      }

      /** Seek back to the last marked position (0 if mark was never called). */
      @Override
      public synchronized void reset() throws IOException {
        input.seek(markPos);
      }

      @Override
      public boolean markSupported() {
        return true;
      }

      /** Current absolute position in the file. */
      @Override
      public long getPos() throws IOException {
        return input.getFilePointer();
      }

      /** Seek to absolute position l. */
      @Override
      public void seek(long l) throws IOException {
        input.seek(l);
      }

      /** Read exactly bytes.length bytes or throw EOFException. */
      @Override
      public void readFully(byte[] bytes) throws IOException {
        input.readFully(bytes);
      }

      /** Read exactly i1 bytes into bytes at offset i or throw EOFException. */
      @Override
      public void readFully(byte[] bytes, int i, int i1) throws IOException {
        input.readFully(bytes, i, i1);
      }

      /** Fill as much of byteBuffer as a single logical read allows; -1 at EOF. */
      @Override
      public int read(ByteBuffer byteBuffer) throws IOException {
        return readDirectBuffer(byteBuffer, page, input::read);
      }

      /** Fill byteBuffer completely or throw EOFException. */
      @Override
      public void readFully(ByteBuffer byteBuffer) throws IOException {
        readFullyDirectBuffer(byteBuffer, page, input::read);
      }
    };
  }

  /** Functional adapter matching the read(byte[],int,int) shape. */
  private interface ByteBufferReader {
    int read(byte[] b, int off, int len) throws IOException;
  }

  /**
   * Copy bytes from reader into byteBuffer via the scratch page.
   * @return total bytes copied, or -1 if EOF was hit before any byte.
   */
  private int readDirectBuffer(ByteBuffer byteBuffer, byte[] page, ByteBufferReader reader) throws IOException {
    // copy all the bytes that return immediately, stopping at the first
    // read that doesn't return a full buffer.
    int nextReadLength = Math.min(byteBuffer.remaining(), page.length);
    int totalBytesRead = 0;
    int bytesRead;

    while ((bytesRead = reader.read(page, 0, nextReadLength)) == page.length) {
      byteBuffer.put(page);
      totalBytesRead += bytesRead;
      nextReadLength = Math.min(byteBuffer.remaining(), page.length);
    }

    if (bytesRead < 0) {
      // return -1 if nothing was read
      return totalBytesRead == 0 ? -1 : totalBytesRead;
    } else {
      // copy the last partial buffer
      byteBuffer.put(page, 0, bytesRead);
      totalBytesRead += bytesRead;
      return totalBytesRead;
    }
  }

  /**
   * Copy bytes until byteBuffer is full; throws EOFException if the reader
   * is exhausted first.
   */
  private static void readFullyDirectBuffer(ByteBuffer byteBuffer, byte[] page, ByteBufferReader reader) throws IOException {
    int nextReadLength = Math.min(byteBuffer.remaining(), page.length);
    int bytesRead = 0;

    while (nextReadLength > 0 && (bytesRead = reader.read(page, 0, nextReadLength)) >= 0) {
      byteBuffer.put(page, 0, bytesRead);
      nextReadLength = Math.min(byteBuffer.remaining(), page.length);
    }

    if (bytesRead < 0 && byteBuffer.remaining() > 0) {
      throw new EOFException("Reached the end of stream with " + byteBuffer.remaining() + " bytes left to read");
    }
  }
}
|
||||
@@ -0,0 +1,22 @@
|
||||
package tech.v3.dataset;
|
||||
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
 * InputStream decorator that forwards every operation to a wrapped stream
 * EXCEPT close(), which is intentionally a no-op.  Useful when passing a
 * stream to an API that closes its argument while the caller still needs
 * the underlying stream open.
 */
public class NoCloseInputStream extends InputStream {
  /** The wrapped stream; public so callers can close it themselves. */
  public final InputStream stream;

  public NoCloseInputStream(InputStream _stream) {
    stream = _stream;
  }

  @Override
  public int available() throws IOException { return stream.available(); }

  /** Explicitly do not forward the close call. */
  @Override
  public void close() {}

  @Override
  public void mark(int maxBytes) { stream.mark(maxBytes); }

  @Override
  public boolean markSupported() { return stream.markSupported(); }

  @Override
  public int read() throws IOException { return stream.read(); }

  @Override
  public int read(byte[] data) throws IOException { return stream.read(data); }

  @Override
  public int read(byte[] data, int off, int len) throws IOException { return stream.read(data,off,len); }

  @Override
  public void reset() throws IOException { stream.reset(); }

  @Override
  public long skip(long n) throws IOException { return stream.skip(n); }
}
|
||||
@@ -0,0 +1,20 @@
|
||||
package tech.v3.dataset;
|
||||
|
||||
|
||||
import java.io.OutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
|
||||
/**
 * OutputStream decorator that forwards every operation to a wrapped stream
 * EXCEPT close(), which is intentionally a no-op so the wrapped stream can
 * outlive APIs that close their argument.
 *
 * NOTE(review): close() also does not flush; callers should flush
 * explicitly before handing the wrapper to closing code if buffering matters.
 */
public class NoCloseOutputStream extends OutputStream {
  /** The wrapped stream; public so callers can close it themselves. */
  public final OutputStream stream;

  public NoCloseOutputStream(OutputStream os) {
    stream = os;
  }

  /** Explicitly do not forward the close call. */
  @Override
  public void close() throws IOException {}

  /**
   * Forward flush to the wrapped stream.  Fixes the original misspelled
   * {@code fluse()}, which left {@link OutputStream#flush()} a no-op.
   */
  @Override
  public void flush() throws IOException { stream.flush(); }

  /** @deprecated misspelled alias kept for source compatibility; use {@link #flush()}. */
  @Deprecated
  public void fluse() throws IOException { flush(); }

  @Override
  public void write(byte[] b) throws IOException { stream.write(b); }

  @Override
  public void write(byte[] b, int off, int len) throws IOException {
    stream.write(b,off,len);
  }

  @Override
  public void write(int b) throws IOException { stream.write(b); }
}
|
||||
@@ -0,0 +1,7 @@
|
||||
package tech.v3.dataset;
|
||||
|
||||
|
||||
/**
 * Minimal column-parser contract: consume (row index, value) pairs and
 * produce the finished column representation once the total row count is
 * known.  See e.g. IntColParser for an implementation that treats gaps in
 * the index sequence as missing rows.
 */
public interface PParser {
  /** Record {@code value} at row {@code idx}. */
  public void addValue(long idx, Object value);
  /** Finish parsing given the final {@code rowcount} and return the column data. */
  public Object finalize(long rowcount);
}
|
||||
@@ -0,0 +1,55 @@
|
||||
package tech.v3.dataset;
|
||||
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
import org.apache.parquet.io.api.RecordConsumer;
|
||||
import org.apache.parquet.hadoop.ParquetWriter;
|
||||
import org.apache.parquet.hadoop.api.WriteSupport;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.parquet.io.OutputFile;
|
||||
import clojure.lang.IFn;
|
||||
import java.util.Map;
|
||||
|
||||
public class ParquetRowWriter extends WriteSupport<Long>
|
||||
{
|
||||
public final IFn rowWriter;
|
||||
public final MessageType schema;
|
||||
public final Map<String,String> metadata;
|
||||
public RecordConsumer consumer;
|
||||
public Object dataset;
|
||||
public ParquetRowWriter(IFn _writer, MessageType _schema, Map<String,String> _meta) {
|
||||
rowWriter = _writer;
|
||||
schema = _schema;
|
||||
metadata = _meta;
|
||||
consumer = null;
|
||||
//Outside forces must set dataset
|
||||
dataset = null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public WriteContext init(Configuration configuration) {
|
||||
return new WriteContext( schema, metadata );
|
||||
}
|
||||
|
||||
@Override
|
||||
public void prepareForWrite(RecordConsumer recordConsumer) {
|
||||
consumer = recordConsumer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(Long record) {
|
||||
rowWriter.invoke(dataset,record,consumer);
|
||||
}
|
||||
public static class WriterBuilder extends ParquetWriter.Builder<Long,WriterBuilder>
|
||||
{
|
||||
public final ParquetRowWriter writer;
|
||||
public WriterBuilder(OutputFile outf, ParquetRowWriter _writer) {
|
||||
super(outf);
|
||||
writer = _writer;
|
||||
}
|
||||
public WriterBuilder self() { return this; }
|
||||
protected WriteSupport<Long> getWriteSupport(Configuration conf) {
|
||||
return writer;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,32 @@
|
||||
package tech.v3.dataset;
|
||||
|
||||
import clojure.lang.Keyword;
|
||||
import java.lang.Iterable;
|
||||
import java.lang.AutoCloseable;
|
||||
|
||||
/**
 * Namespace class collecting the abstract spreadsheet interfaces
 * (workbook/sheet/row/cell) implemented by concrete backends.
 * Implementations are not visible from this file.
 */
public class Spreadsheet
{
  /** A workbook: iterable of sheets; must be closed when finished. */
  public interface Workbook extends Iterable, AutoCloseable
  {
  }
  /** A sheet: iterable of rows. */
  public interface Sheet extends Iterable
  {
    /** Human-readable sheet name. */
    public String name();

    /** Backend-specific identifier.
     * NOTE(review): the distinction between id() and stableId() is not
     * determinable from this file — confirm against implementations. */
    public String id();

    /** Identifier intended to be stable across reads — TODO confirm. */
    public String stableId();
  }
  /** A row: iterable of cells. */
  public interface Row extends Iterable
  {
    /** Row number within the sheet (0- or 1-based per backend — confirm). */
    public int getRowNum();
  }
  /** A single spreadsheet cell. */
  public interface Cell
  {
    /** True when the cell has no usable value. */
    boolean missing();
    /** Column number of this cell within its row. */
    int getColumnNum();
    /** Boxed cell value. */
    Object value();
    /** Cell value coerced to double. */
    double doubleValue();
    /** Cell value coerced to boolean. */
    boolean boolValue();
  }
}
|
||||
+32
@@ -0,0 +1,32 @@
|
||||
package tech.v3.dataset;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
 * Immutable wrapper marking a String as text data.  Equality and ordering
 * interoperate with both String and Text operands.
 *
 * NOTE(review): equals is intentionally String-compatible but therefore
 * asymmetric — {@code new Text("a").equals("a")} is true while
 * {@code "a".equals(new Text("a"))} is false.  hashCode matches
 * String.hashCode so Text and String hash identically.
 */
public class Text implements Comparable<Object>
{
  public final String text;

  /**
   * @param data the wrapped string; must be non-null.
   * @throws NullPointerException when data is null.
   */
  public Text(String data) {
    Objects.requireNonNull(data);
    text = data;
  }

  public String toString() { return text; }

  public int hashCode() { return text.hashCode(); }

  /** Equal to a String with the same content or a Text wrapping one; anything else (including null) is unequal. */
  public boolean equals(Object other) {
    String strData = null;
    if (other instanceof String) {
      strData = (String)other;
    } else if (other instanceof Text) {
      strData = ((Text)other).text;
    }
    return text.equals(strData);
  }

  /**
   * Compare against a String or a Text.
   * @throws ClassCastException when other is neither a String nor a Text
   *         (per the Comparable contract; previously this surfaced as an
   *         uninformative NullPointerException from String.compareTo(null)).
   */
  public int compareTo(Object other) {
    final String strData;
    if (other instanceof String) {
      strData = (String)other;
    } else if (other instanceof Text) {
      strData = ((Text)other).text;
    } else {
      throw new ClassCastException(
        "Text is not comparable to " +
        (other == null ? "null" : other.getClass().getName()));
    }
    return text.compareTo(strData);
  }
}
|
||||
Reference in New Issue
Block a user