init research
This commit is contained in:
+274
@@ -0,0 +1,274 @@
|
||||
package org.jetbrains.kotlinx.dataframe.documentationCsv
|
||||
|
||||
import io.deephaven.csv.CsvSpecs
|
||||
import org.apache.commons.csv.CSVFormat
|
||||
import org.jetbrains.kotlinx.dataframe.DataFrame
|
||||
import org.jetbrains.kotlinx.dataframe.api.ParserOptions
|
||||
import org.jetbrains.kotlinx.dataframe.impl.io.typesDeephavenAlreadyParses
|
||||
import org.jetbrains.kotlinx.dataframe.io.AdjustCSVFormat
|
||||
import org.jetbrains.kotlinx.dataframe.io.AdjustCsvSpecs
|
||||
import org.jetbrains.kotlinx.dataframe.io.ColType
|
||||
import org.jetbrains.kotlinx.dataframe.io.Compression
|
||||
import org.jetbrains.kotlinx.dataframe.io.DefaultNullStringsContentLink
|
||||
import org.jetbrains.kotlinx.dataframe.io.QuoteMode
|
||||
import java.nio.charset.Charset
|
||||
|
||||
/**
 * Contains both the default values of csv/tsv parameters and the parameter KDocs.
 *
 * The `*_READ` / `*_WRITE` members are empty marker interfaces that exist solely to carry
 * parameter KDoc for the documentation processor. They are declared as nested interfaces
 * (not type aliases) because nested and local type aliases are not supported in Kotlin.
 */
@Suppress("ktlint:standard:class-naming", "ClassName", "KDocUnresolvedReference")
internal object DelimParams {

    /**
     * @param path The file path to read.
     * Use [charset] to specify the encoding.
     * Can also be compressed as `.gz` or `.zip`, see [Compression][Compression].
     */
    interface PATH_READ

    /**
     * @param file The file to read.
     * Use [charset] to specify the encoding.
     * Can also be compressed as `.gz` or `.zip`, see [Compression][Compression].
     */
    interface FILE_READ

    /**
     * @param url The URL from which to fetch the data.
     * Use [charset] to specify the encoding.
     * Can also be compressed as `.gz` or `.zip`, see [Compression][Compression].
     */
    interface URL_READ

    /**
     * @param fileOrUrl The file path or URL to read the data from.
     * Use [charset] to specify the encoding.
     * Can also be compressed as `.gz` or `.zip`, see [Compression][Compression].
     */
    interface FILE_OR_URL_READ

    /**
     * @param inputStream Represents the file to read.
     * Use [charset] to specify the encoding.
     */
    interface INPUT_STREAM_READ

    /** @param text The raw data to read in the form of a [String]. */
    interface TEXT_READ

    /** @param file The file to write to. */
    interface FILE_WRITE

    /** @param path The path pointing to a file to write to. */
    interface PATH_WRITE

    /** @param writer The [Appendable] to write to. */
    interface WRITER_WRITE

    /**
     * @param charset The [character set][java.nio.charset.Charset] the input is encoded in.
     * Default: `null`
     *
     * If `null`, the Charset will be read from the BOM of the provided input,
     * defaulting to [UTF-8][Charsets.UTF_8] if no BOM is found.
     */
    val CHARSET: Charset? = null

    /**
     * @param delimiter The field delimiter character. Default: ','.
     *
     * Ignored if [hasFixedWidthColumns] is `true`.
     */
    const val CSV_DELIMITER: Char = ','

    /**
     * @param delimiter The field delimiter character. Default: '\t'.
     *
     * Ignored if [hasFixedWidthColumns] is `true`.
     */
    const val TSV_DELIMITER: Char = '\t'

    /**
     * @param delimiter The field delimiter character. Default: ','.
     *
     * Ignored if [hasFixedWidthColumns] is `true`.
     */
    const val DELIM_DELIMITER: Char = ','

    /**
     * @param header Optional column titles. Default: empty list.
     *
     * If non-empty, the data will be read with [header] as the column titles
     * (use [skipLines] if there's a header in the data).
     * If empty (default), the header will be read from the data.
     */
    val HEADER: List<String> = emptyList()

    /**
     * @param hasFixedWidthColumns Whether the data has fixed-width columns instead of a single delimiter.
     * Default: `false`.
     *
     * Fixed-width columns can occur, for instance, in multi-space delimited data, where the columns are separated
     * by multiple spaces instead of a single delimiter, so columns are visually aligned.
     * Column widths are determined by the header in the data (if present), or manually by setting
     * [fixedColumnWidths].
     */
    const val HAS_FIXED_WIDTH_COLUMNS: Boolean = false

    /**
     * @param fixedColumnWidths The fixed column widths. Default: empty list.
     *
     * Requires [hasFixedWidthColumns]. If empty, the column widths will be determined by the header in the data
     * (if present), else, this manually sets the column widths.
     * The number of widths should match the number of columns.
     */
    val FIXED_COLUMN_WIDTHS: List<Int> = emptyList()

    /**
     * @param compression The compression of the data.
     * Default: [Compression.None], unless detected otherwise from the input file or url.
     */
    val COMPRESSION: Compression<*> = Compression.None

    /**
     * @param colTypes The expected [ColType] per column name. Default: empty map, a.k.a. infer every column type.
     *
     * If supplied for a certain column name (inferred from data or given by [header]),
     * the parser will parse the column with the specified name as the specified type, else it will infer the type.
     *
     * e.g. `colTypes = `[mapOf][mapOf]`("colName" `[to][to]` `[ColType][ColType]`.`[Int][ColType.Int]`)`.
     * You can also set [ColType][ColType]`.`[DEFAULT][ColType.DEFAULT]` `[to][to]` `[ColType][ColType]`.X`
     * to set a _default_ column type, like [ColType.String].
     */
    val COL_TYPES: Map<String, ColType> = emptyMap()

    /**
     * @param skipLines The number of lines to skip before reading the header and data. Default: `0`.
     *
     * Useful for files with metadata, or comments at the beginning, or to give a custom [header].
     */
    const val SKIP_LINES: Long = 0L

    /**
     * @param readLines The maximum number of lines to read from the data. Default: `null`.
     *
     * If `null`, all lines will be read.
     */
    val READ_LINES: Long? = null

    /**
     * @param parserOptions Optional [parsing options][ParserOptions] for columns initially read as [String].
     * Default, `null`.
     *
     * Can configure locale, date format, double parsing, skipping types, etc.
     *
     * If [parserOptions] or any of the arguments are `null`, the global parser configuration
     * ([DataFrame.parser][DataFrame.Companion.parser]) will be queried.
     *
     * The only exceptions are:
     * - [nullStrings][ParserOptions.nullStrings], which, if `null`,
     *   will take the global setting + [["", "NA", "N/A", "null", "NULL", "None", "none", "NIL", "nil"]][org.jetbrains.kotlinx.dataframe.io.DEFAULT_DELIM_NULL_STRINGS].
     * - [skipTypes][ParserOptions.skipTypes], which will always add [typesDeephavenAlreadyParses] to
     *   the given types or the global setting.
     */
    val PARSER_OPTIONS: ParserOptions? = null

    /**
     * @param ignoreEmptyLines Whether to skip intermediate empty lines. Default: `false`.
     *
     * If `false`, empty lines will be interpreted as having _empty_ values if [allowMissingColumns].
     */
    const val IGNORE_EMPTY_LINES: Boolean = false

    /**
     * @param allowMissingColumns Whether to allow rows with fewer columns than the header. Default: `true`.
     *
     * If `true`, rows that are too short will be interpreted as _empty_ values.
     */
    const val ALLOW_MISSING_COLUMNS: Boolean = true

    /**
     * @param ignoreExcessColumns Whether to ignore rows with more columns than the header. Default: `true`.
     *
     * If `true`, rows that are too long will have those columns dropped.
     */
    const val IGNORE_EXCESS_COLUMNS: Boolean = true

    /**
     * @param quote The quote character. Default: `"`.
     *
     * Used when field- or line delimiters should be interpreted as literal text.
     *
     * For example: `123,"hello, there",456,` would correspond to: `123`; `hello, there`; `456`.
     */
    const val QUOTE: Char = '"'

    /**
     * @param ignoreSurroundingSpaces Whether to ignore leading and trailing blanks around non-quoted fields.
     * Default: `true`.
     */
    const val IGNORE_SURROUNDING_SPACES: Boolean = true

    /**
     * @param trimInsideQuoted Whether to ignore leading and trailing blanks inside quoted fields.
     * Default: `false`.
     */
    const val TRIM_INSIDE_QUOTED: Boolean = false

    /**
     * @param parseParallel Whether to parse the data in parallel. Default: `true`.
     *
     * If `true`, the data will be read and parsed in parallel by the Deephaven parser.
     * This is usually faster but can be turned off for debugging.
     */
    const val PARSE_PARALLEL: Boolean = true

    /**
     * @param adjustCsvSpecs Optional extra [CsvSpecs] configuration. Default: `{ it }`.
     *
     * Before instantiating the [CsvSpecs], the [CsvSpecs.Builder] will be passed to this lambda.
     * This will allow you to configure/overwrite any CSV / TSV parsing options.
     */
    val ADJUST_CSV_SPECS: AdjustCsvSpecs = { it }

    /** @param includeHeader Whether to include the header in the output. Default: `true`. */
    const val INCLUDE_HEADER: Boolean = true

    /**
     * @param quoteMode The [QuoteMode] to use when writing CSV / TSV files.
     * Default: [QuoteMode.MINIMAL].
     */
    val QUOTE_MODE: QuoteMode = QuoteMode.MINIMAL

    /**
     * @param escapeChar The escape character to use when writing CSV / TSV files with [QuoteMode.NONE].
     * Default: `null`. This will double-quote the value.
     */
    val ESCAPE_CHAR: Char? = null

    /**
     * @param commentChar The character that indicates a comment line in a CSV / TSV file.
     * Default: `'#'`.
     */
    const val COMMENT_CHAR: Char = '#'

    /**
     * @param recordSeparator The string that separates records in a CSV / TSV file.
     * Default: `"\n"`, a Unix-newline.
     */
    const val RECORD_SEPARATOR: String = "\n"

    /**
     * @param headerComments A list of comments to include at the beginning of the CSV / TSV file.
     * Default: empty list.
     */
    val HEADER_COMMENTS: List<String> = emptyList()

    /**
     * @param adjustCsvFormat Optional extra [CSVFormat] configuration. Default: `{ it }`.
     *
     * Before instantiating the [CSVFormat], the [CSVFormat.Builder] will be passed to this lambda.
     * This will allow you to configure/overwrite any CSV / TSV writing options.
     */
    val ADJUST_CSV_FORMAT: AdjustCSVFormat = { it }
}
|
||||
+69
@@ -0,0 +1,69 @@
|
||||
package org.jetbrains.kotlinx.dataframe.documentationCsv
|
||||
|
||||
import kotlin.annotation.AnnotationTarget.ANNOTATION_CLASS
|
||||
import kotlin.annotation.AnnotationTarget.CLASS
|
||||
import kotlin.annotation.AnnotationTarget.CONSTRUCTOR
|
||||
import kotlin.annotation.AnnotationTarget.FIELD
|
||||
import kotlin.annotation.AnnotationTarget.FILE
|
||||
import kotlin.annotation.AnnotationTarget.FUNCTION
|
||||
import kotlin.annotation.AnnotationTarget.LOCAL_VARIABLE
|
||||
import kotlin.annotation.AnnotationTarget.PROPERTY
|
||||
import kotlin.annotation.AnnotationTarget.PROPERTY_GETTER
|
||||
import kotlin.annotation.AnnotationTarget.PROPERTY_SETTER
|
||||
import kotlin.annotation.AnnotationTarget.TYPE
|
||||
import kotlin.annotation.AnnotationTarget.TYPEALIAS
|
||||
import kotlin.annotation.AnnotationTarget.VALUE_PARAMETER
|
||||
|
||||
/**
 * Marks a `Documentable` so the documentation processor excludes it from the generated sources.
 *
 * **NOTE: DO NOT RENAME!**
 */
@Target(
    CLASS, ANNOTATION_CLASS, PROPERTY, FIELD, LOCAL_VARIABLE, VALUE_PARAMETER,
    CONSTRUCTOR, FUNCTION, PROPERTY_GETTER, PROPERTY_SETTER, TYPE, TYPEALIAS, FILE,
)
internal annotation class ExcludeFromSources
|
||||
|
||||
/**
 * Marks a `Documentable` so the documentation processor exports its doc to HTML.
 *
 * Use @exportAsHtmlStart and @exportAsHtmlEnd inside the doc to limit the exported
 * range to a part of the documentation.
 *
 * **NOTE: DO NOT RENAME!**
 *
 * @param theme Whether to include a simple theme in the HTML file. Default is `true`.
 * @param stripReferences Whether to strip `[references]` from the HTML file. Default is `true`.
 *   This is useful when you want to include the HTML file in a website, where the references are not
 *   needed or would break.
 */
@Target(
    CLASS, ANNOTATION_CLASS, PROPERTY, FIELD, LOCAL_VARIABLE, VALUE_PARAMETER,
    CONSTRUCTOR, FUNCTION, PROPERTY_GETTER, PROPERTY_SETTER, TYPE, TYPEALIAS, FILE,
)
internal annotation class ExportAsHtml(val theme: Boolean = true, val stripReferences: Boolean = true)
|
||||
+24
@@ -0,0 +1,24 @@
|
||||
package org.jetbrains.kotlinx.dataframe.impl.io
|
||||
|
||||
import io.deephaven.csv.containers.ByteSlice
|
||||
import io.deephaven.csv.tokenization.Tokenizer.CustomDoubleParser
|
||||
import org.jetbrains.kotlinx.dataframe.api.ParserOptions
|
||||
|
||||
/**
 * Adapter that exposes [FastDoubleParser] through Deephaven's [CustomDoubleParser] interface.
 *
 * Both [parse] overloads throw [NumberFormatException] when the input cannot be parsed,
 * as required by the [CustomDoubleParser] contract.
 */
internal class DataFrameCustomDoubleParser(parserOptions: ParserOptions? = null) : CustomDoubleParser {

    // the actual parser doing the work; configured once from the given options
    private val delegate = FastDoubleParser(parserOptions)

    override fun parse(bs: ByteSlice): Double {
        // parseOrNull may itself throw on malformed bytes; treat that the same as a null result
        val parsed = try {
            delegate.parseOrNull(bs.data(), bs.begin(), bs.size())
        } catch (_: Exception) {
            null
        }
        return parsed ?: throw NumberFormatException()
    }

    override fun parse(cs: CharSequence): Double =
        delegate.parseOrNull(cs.toString()) ?: throw NumberFormatException()
}
|
||||
+204
@@ -0,0 +1,204 @@
|
||||
package org.jetbrains.kotlinx.dataframe.impl.io
|
||||
|
||||
import io.deephaven.csv.parsers.DataType
|
||||
import io.deephaven.csv.parsers.DataType.BOOLEAN_AS_BYTE
|
||||
import io.deephaven.csv.parsers.DataType.BYTE
|
||||
import io.deephaven.csv.parsers.DataType.CHAR
|
||||
import io.deephaven.csv.parsers.DataType.DATETIME_AS_LONG
|
||||
import io.deephaven.csv.parsers.DataType.DOUBLE
|
||||
import io.deephaven.csv.parsers.DataType.FLOAT
|
||||
import io.deephaven.csv.parsers.DataType.INT
|
||||
import io.deephaven.csv.parsers.DataType.LONG
|
||||
import io.deephaven.csv.parsers.DataType.SHORT
|
||||
import io.deephaven.csv.parsers.DataType.STRING
|
||||
import io.deephaven.csv.parsers.DataType.TIMESTAMP_AS_LONG
|
||||
import io.deephaven.csv.sinks.Sink
|
||||
import io.deephaven.csv.sinks.SinkFactory
|
||||
import io.deephaven.csv.sinks.Source
|
||||
import kotlinx.datetime.toKotlinLocalDateTime
|
||||
import java.time.LocalDateTime
|
||||
import java.time.ZoneOffset
|
||||
import kotlin.time.Duration.Companion.nanoseconds
|
||||
|
||||
/** Combines Deephaven's [Sink] and [Source] into a single type, so one object can serve as both. */
internal interface SinkSource<T : Any> :
    Sink<T>,
    Source<T>
|
||||
|
||||
/**
 * Implementation of Deephaven's [Sink] and [Source] that stores data in an [ArrayList].
 *
 * The implementation is based on [Writing Your Own Data Sinks](https://github.com/deephaven/deephaven-csv/blob/main/ADVANCED.md).
 *
 * If we ever store column data unboxed / primitively, this needs to be modified.
 *
 * @param columnIndex index of the column this sink collects (not read inside this class;
 *   presumably used by the caller to map sinks back to columns — TODO confirm).
 * @param dataType the Deephaven [DataType] the parser settled on for this column; determines
 *   how raw source arrays are interpreted in [getValue] and [read].
 */
@Suppress("UNCHECKED_CAST")
internal class ListSink(val columnIndex: Int, val dataType: DataType) : SinkSource<Any> {

    @Suppress("ktlint:standard:comment-wrapping", "ktlint:standard:no-consecutive-comments")
    companion object {
        // One supplier per Deephaven data type; each simply creates a ListSink tagged with that type.
        // The unchecked casts are safe only because ListSink never uses the typed array parameter
        // beyond what dataType dictates.
        val SINK_FACTORY: SinkFactory = SinkFactory.of(
            // unused in Parsers.DEFAULT:
            /* byteSinkSupplier = */ { ListSink(it, BYTE) as SinkSource<ByteArray> },
            /* shortSinkSupplier = */ { ListSink(it, SHORT) as SinkSource<ShortArray> },
            /* intSinkSupplier = */ { ListSink(it, INT) as SinkSource<IntArray> },
            /* longSinkSupplier = */ { ListSink(it, LONG) as SinkSource<LongArray> },
            // unused in Parsers.COMPLETE and Parsers.DEFAULT:
            /* floatSinkSupplier = */ { ListSink(it, FLOAT) as SinkSource<FloatArray> },
            /* doubleSinkSupplier = */ { ListSink(it, DOUBLE) as SinkSource<DoubleArray> },
            /* booleanAsByteSinkSupplier = */ { ListSink(it, BOOLEAN_AS_BYTE) as SinkSource<ByteArray> },
            /* charSinkSupplier = */ { ListSink(it, CHAR) as SinkSource<CharArray> },
            /* stringSinkSupplier = */ { ListSink(it, STRING) as SinkSource<Array<String>> },
            /* dateTimeAsLongSinkSupplier = */ { ListSink(it, DATETIME_AS_LONG) as SinkSource<LongArray> },
            // unused in Parsers.COMPLETE and Parsers.DEFAULT:
            /* timestampAsLongSinkSupplier = */ { ListSink(it, TIMESTAMP_AS_LONG) as SinkSource<LongArray> },
        )
    }

    // Backing storage of boxed column values; may contain nulls. Grown by write().
    private val _data: MutableList<Any?> = ArrayList(1000)

    // Read-only view of the collected values.
    val data: List<Any?>
        get() = _data

    // True once at least one null has been written (via isNull flags or padding in writeAppending).
    var hasNulls: Boolean = false
        private set

    /**
     * Boxes one value from the raw source array [src] at [srcIndex], interpreting it according
     * to [dataType], or returns `null` (and records [hasNulls]) when [isNull] marks it null.
     * DATETIME/TIMESTAMP values arrive as nanoseconds-since-epoch longs and are converted to
     * a kotlinx [LocalDateTime] in UTC.
     */
    private fun getValue(src: Any, srcIndex: Int, isNull: BooleanArray): Any? =
        if (isNull[srcIndex]) {
            hasNulls = true
            null
        } else {
            when (dataType) {
                BOOLEAN_AS_BYTE -> (src as ByteArray)[srcIndex] == 1.toByte()

                // unused in Parsers.DEFAULT
                BYTE -> (src as ByteArray)[srcIndex]

                // unused in Parsers.DEFAULT
                SHORT -> (src as ShortArray)[srcIndex]

                INT -> (src as IntArray)[srcIndex]

                LONG -> (src as LongArray)[srcIndex]

                // unused in Parsers.COMPLETE and Parsers.DEFAULT
                FLOAT -> (src as FloatArray)[srcIndex]

                DOUBLE -> (src as DoubleArray)[srcIndex]

                CHAR -> (src as CharArray)[srcIndex]

                STRING -> (src as Array<String>)[srcIndex]

                DATETIME_AS_LONG -> (src as LongArray)[srcIndex].nanoseconds
                    .toComponents { seconds, nanoseconds ->
                        LocalDateTime.ofEpochSecond(seconds, nanoseconds, ZoneOffset.UTC)
                    }.toKotlinLocalDateTime()

                // unused in Parsers.COMPLETE and Parsers.DEFAULT
                TIMESTAMP_AS_LONG -> (src as LongArray)[srcIndex].nanoseconds
                    .toComponents { seconds, nanoseconds ->
                        LocalDateTime.ofEpochSecond(seconds, nanoseconds, ZoneOffset.UTC)
                    }.toKotlinLocalDateTime()

                else -> error("unsupported parser")
            }
        }

    /**
     * Appends values from [src] so they end up at positions [destBegin]..<[destEnd] of [_data],
     * first padding with nulls if [_data] is shorter than [destBegin].
     */
    private fun writeAppending(
        src: Any,
        destBegin: Int,
        destEnd: Int,
        isNull: BooleanArray,
    ) {
        // pad the gap with nulls so the appended values land at destBegin
        while (data.size < destBegin) {
            _data += null
            hasNulls = true
        }
        // srcIndex is the 0-based counter from withIndex(); src is read from offset 0
        for ((srcIndex, _) in (destBegin..<destEnd).withIndex()) {
            _data += getValue(src, srcIndex, isNull)
        }
    }

    /**
     * Overwrites existing positions [destBegin]..<[destEnd] of [_data] with values read from
     * [src] starting at offset 0.
     */
    private fun writeReplacing(
        src: Any,
        destBegin: Int,
        destEnd: Int,
        isNull: BooleanArray,
    ) {
        // withIndex() yields (counter, rangeValue): counter indexes src from 0, rangeValue is the dest slot
        for ((srcIndex, destIndex) in (destBegin..<destEnd).withIndex()) {
            _data[destIndex] = getValue(src, srcIndex, isNull)
        }
    }

    /**
     * [Sink.write]: stores values from [src] (with null flags in [isNull]) into positions
     * [destBegin]..<[destEnd], either appending past the current end or replacing in place.
     */
    override fun write(
        src: Any,
        isNull: BooleanArray,
        destBegin: Long,
        destEnd: Long,
        appending: Boolean,
    ) {
        if (destBegin == destEnd) return
        // positions are kept as Int because the backing store is an ArrayList
        val destBeginAsInt = destBegin.toInt()
        val destEndAsInt = destEnd.toInt()
        if (appending) {
            writeAppending(src = src, destBegin = destBeginAsInt, destEnd = destEndAsInt, isNull = isNull)
        } else {
            writeReplacing(src = src, destBegin = destBeginAsInt, destEnd = destEndAsInt, isNull = isNull)
        }
    }

    /**
     * [Source.read]: copies previously written values back into Deephaven's typed buffer [dest],
     * setting [isNull] for positions holding null. Only the integral types Deephaven re-reads
     * during numeric type inference are supported.
     *
     * NOTE(review): `withIndex()` yields (0-based counter, range value); here the counter is bound
     * to `srcIndex` and the range value to `destIndex`, i.e. `data` is read from offset 0 while
     * `dest`/`isNull` are written at srcBegin..<srcEnd. Verify against Deephaven's `Source.read`
     * contract (which direction carries the offset) — TODO confirm the intended index mapping.
     */
    override fun read(
        dest: Any,
        isNull: BooleanArray,
        srcBegin: Long,
        srcEnd: Long,
    ) {
        if (srcBegin == srcEnd) return
        val srcBeginAsInt = srcBegin.toInt()
        val srcEndAsInt = srcEnd.toInt()

        when (dataType) {
            BYTE -> {
                dest as ByteArray
                for ((srcIndex, destIndex) in (srcBeginAsInt..<srcEndAsInt).withIndex()) {
                    val value = data[srcIndex] as Byte?
                    if (value != null) dest[destIndex] = value
                    isNull[destIndex] = value == null
                }
            }

            SHORT -> {
                dest as ShortArray
                for ((srcIndex, destIndex) in (srcBeginAsInt..<srcEndAsInt).withIndex()) {
                    val value = data[srcIndex] as Short?
                    if (value != null) dest[destIndex] = value
                    isNull[destIndex] = value == null
                }
            }

            INT -> {
                dest as IntArray
                for ((srcIndex, destIndex) in (srcBeginAsInt..<srcEndAsInt).withIndex()) {
                    val value = data[srcIndex] as Int?
                    if (value != null) dest[destIndex] = value
                    isNull[destIndex] = value == null
                }
            }

            LONG -> {
                dest as LongArray
                for ((srcIndex, destIndex) in (srcBeginAsInt..<srcEndAsInt).withIndex()) {
                    val value = data[srcIndex] as Long?
                    if (value != null) dest[destIndex] = value
                    isNull[destIndex] = value == null
                }
            }

            // Deephaven's fast path for numeric type inference supports only byte, short, int, and long
            // so this should never be reached
            else -> error("unsupported sink state")
        }
    }

    // Deephaven asks for the "underlying" sink; we are our own underlying representation.
    override fun getUnderlying(): ListSink = this
}
|
||||
+448
@@ -0,0 +1,448 @@
|
||||
@file:JvmName("ReadDelimDeephavenKt")
|
||||
|
||||
package org.jetbrains.kotlinx.dataframe.impl.io
|
||||
|
||||
import io.deephaven.csv.CsvSpecs
|
||||
import io.deephaven.csv.parsers.DataType
|
||||
import io.deephaven.csv.parsers.DataType.BOOLEAN_AS_BYTE
|
||||
import io.deephaven.csv.parsers.DataType.BYTE
|
||||
import io.deephaven.csv.parsers.DataType.CHAR
|
||||
import io.deephaven.csv.parsers.DataType.DATETIME_AS_LONG
|
||||
import io.deephaven.csv.parsers.DataType.DOUBLE
|
||||
import io.deephaven.csv.parsers.DataType.FLOAT
|
||||
import io.deephaven.csv.parsers.DataType.INT
|
||||
import io.deephaven.csv.parsers.DataType.LONG
|
||||
import io.deephaven.csv.parsers.DataType.SHORT
|
||||
import io.deephaven.csv.parsers.DataType.STRING
|
||||
import io.deephaven.csv.parsers.DataType.TIMESTAMP_AS_LONG
|
||||
import io.deephaven.csv.parsers.Parser
|
||||
import io.deephaven.csv.parsers.Parsers
|
||||
import io.deephaven.csv.reading.CsvReader
|
||||
import io.deephaven.csv.util.CsvReaderException
|
||||
import kotlinx.datetime.LocalDate
|
||||
import kotlinx.datetime.LocalDateTime
|
||||
import kotlinx.datetime.LocalTime
|
||||
import org.apache.commons.io.input.BOMInputStream
|
||||
import org.jetbrains.kotlinx.dataframe.DataColumn
|
||||
import org.jetbrains.kotlinx.dataframe.DataFrame
|
||||
import org.jetbrains.kotlinx.dataframe.DataRow
|
||||
import org.jetbrains.kotlinx.dataframe.api.ParserOptions
|
||||
import org.jetbrains.kotlinx.dataframe.api.convertTo
|
||||
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
|
||||
import org.jetbrains.kotlinx.dataframe.api.parse
|
||||
import org.jetbrains.kotlinx.dataframe.api.parser
|
||||
import org.jetbrains.kotlinx.dataframe.api.tryParse
|
||||
import org.jetbrains.kotlinx.dataframe.columns.ValueColumn
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ADJUST_CSV_SPECS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ALLOW_MISSING_COLUMNS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.CHARSET
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.COL_TYPES
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.COMPRESSION
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.FIXED_COLUMN_WIDTHS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.HAS_FIXED_WIDTH_COLUMNS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.HEADER
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.IGNORE_EMPTY_LINES
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.IGNORE_EXCESS_COLUMNS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.IGNORE_SURROUNDING_SPACES
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.INPUT_STREAM_READ
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.PARSER_OPTIONS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.PARSE_PARALLEL
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.QUOTE
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.READ_LINES
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.SKIP_LINES
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.TRIM_INSIDE_QUOTED
|
||||
import org.jetbrains.kotlinx.dataframe.impl.ColumnNameGenerator
|
||||
import org.jetbrains.kotlinx.dataframe.io.AdjustCsvSpecs
|
||||
import org.jetbrains.kotlinx.dataframe.io.ColType
|
||||
import org.jetbrains.kotlinx.dataframe.io.Compression
|
||||
import org.jetbrains.kotlinx.dataframe.io.DEFAULT_DELIM_NULL_STRINGS
|
||||
import org.jetbrains.kotlinx.dataframe.io.skippingBomCharacters
|
||||
import org.jetbrains.kotlinx.dataframe.io.toKType
|
||||
import org.jetbrains.kotlinx.dataframe.io.useDecompressed
|
||||
import java.io.InputStream
|
||||
import java.math.BigDecimal
|
||||
import java.math.BigInteger
|
||||
import java.net.URL
|
||||
import java.nio.charset.Charset
|
||||
import kotlin.reflect.KType
|
||||
import kotlin.reflect.full.withNullability
|
||||
import kotlin.reflect.typeOf
|
||||
import kotlin.time.Duration
|
||||
import kotlin.time.Instant as StdlibInstant
|
||||
import kotlinx.datetime.Instant as DeprecatedInstant
|
||||
|
||||
/**
|
||||
* Implementation to read delimiter-separated data from an [InputStream] based on the Deephaven CSV library.
|
||||
*
|
||||
* @param inputStream Represents the file to read.
|
||||
* Use [charset] to specify the encoding.
|
||||
* @param charset The [character set][java.nio.charset.Charset] the input is encoded in.
|
||||
* Default: `null`
|
||||
*
|
||||
* If `null`, the Charset will be read from the BOM of the provided input,
|
||||
* defaulting to [UTF-8][Charsets.UTF_8] if no BOM is found.
|
||||
* @param delimiter The field delimiter character. The default is ',' for CSV, 't' for TSV.
|
||||
* @param header Optional column titles. Default: empty list.
|
||||
*
|
||||
* If non-empty, the data will be read with [header] as the column titles
|
||||
* (use [skipLines] if there's a header in the data).
|
||||
* If empty (default), the header will be read from the data.
|
||||
* @param colTypes The expected [ColType] per column name. Default: empty map, a.k.a. infer every column type.
|
||||
*
|
||||
* If supplied for a certain column name (inferred from data or given by [header]),
|
||||
* the parser will parse the column with the specified name as the specified type, else it will infer the type.
|
||||
*
|
||||
* e.g. `colTypes = `[mapOf][mapOf]`("colName" `[to][to]` `[ColType][ColType]`.`[Int][ColType.Int]`)`.
|
||||
* You can also set [ColType][ColType]`.`[DEFAULT][ColType.DEFAULT]` `[to][to]` `[ColType][ColType]`.X`
|
||||
* to set a _default_ column type, like [ColType.String].
|
||||
* @param skipLines The number of lines to skip before reading the header and data. Default: `0`.
|
||||
*
|
||||
* Useful for files with metadata, or comments at the beginning, or to give a custom [header].
|
||||
* @param readLines The maximum number of lines to read from the data. Default: `null`.
|
||||
*
|
||||
* If `null`, all lines will be read.
|
||||
* @param hasFixedWidthColumns Whether the data has fixed-width columns instead of a single delimiter.
|
||||
* Default: `false`.
|
||||
*
|
||||
* Fixed-width columns can occur, for instance, in multi-space delimited data, where the columns are separated
|
||||
* by multiple spaces instead of a single delimiter, so columns are visually aligned.
|
||||
* Column widths are determined by the header in the data (if present), or manually by setting
|
||||
* [fixedColumnWidths].
|
||||
* @param fixedColumnWidths The fixed column widths. Default: empty list.
|
||||
*
|
||||
* Requires [hasFixedWidthColumns]. If empty, the column widths will be determined by the header in the data
|
||||
* (if present), else, this manually sets the column widths.
|
||||
* The number of widths should match the number of columns.
|
||||
* @param parserOptions Optional [parsing options][ParserOptions] for columns initially read as [String].
|
||||
* Default, `null`.
|
||||
*
|
||||
* Can configure locale, date format, double parsing, skipping types, etc.
|
||||
*
|
||||
* If [parserOptions] or any of the arguments are `null`, the global parser configuration
|
||||
* ([DataFrame.parser][DataFrame.Companion.parser]) will be queried.
|
||||
*
|
||||
* The only exceptions are:
|
||||
* - [nullStrings][ParserOptions.nullStrings], which, if `null`,
|
||||
* will take the global setting + [["", "NA", "N/A", "null", "NULL", "None", "none", "NIL", "nil"]][org.jetbrains.kotlinx.dataframe.io.DEFAULT_DELIM_NULL_STRINGS].
|
||||
* - [skipTypes][ParserOptions.skipTypes], which will always add [typesDeephavenAlreadyParses][org.jetbrains.kotlinx.dataframe.impl.io.typesDeephavenAlreadyParses] to
|
||||
* the given types or the global setting.
|
||||
* @param ignoreEmptyLines Whether to skip intermediate empty lines. Default: `false`.
|
||||
*
|
||||
* If `false`, empty lines will be interpreted as having _empty_ values if [allowMissingColumns].
|
||||
* @param allowMissingColumns Whether to allow rows with fewer columns than the header. Default: `true`.
|
||||
*
|
||||
* If `true`, rows that are too short will be interpreted as _empty_ values.
|
||||
* @param ignoreExcessColumns Whether to ignore rows with more columns than the header. Default: `true`.
|
||||
*
|
||||
* If `true`, rows that are too long will have those columns dropped.
|
||||
* @param quote The quote character. Default: `"`.
|
||||
*
|
||||
* Used when field- or line delimiters should be interpreted as literal text.
|
||||
*
|
||||
* For example: `123,"hello, there",456,` would correspond to: `123`; `hello, there`; `456`.
|
||||
* @param ignoreSurroundingSpaces Whether to ignore leading and trailing blanks around non-quoted fields.
|
||||
* Default: `true`.
|
||||
* @param trimInsideQuoted Whether to ignore leading and trailing blanks inside quoted fields.
|
||||
* Default: `false`.
|
||||
* @param parseParallel Whether to parse the data in parallel. Default: `true`.
|
||||
*
|
||||
* If `true`, the data will be read and parsed in parallel by the Deephaven parser.
|
||||
* This is usually faster but can be turned off for debugging.
|
||||
* @param compression The compression of the data.
|
||||
* Default: [Compression.None], unless detected otherwise from the input file or url.
|
||||
* @param adjustCsvSpecs Optional extra [CsvSpecs] configuration. Default: `{ it }`.
|
||||
*
|
||||
* Before instantiating the [CsvSpecs], the [CsvSpecs.Builder] will be passed to this lambda.
|
||||
* This will allow you to configure/overwrite any CSV / TSV parsing options.
|
||||
*/
|
||||
internal fun readDelimImpl(
    inputStream: InputStream,
    delimiter: Char,
    header: List<String>,
    charset: Charset?,
    hasFixedWidthColumns: Boolean,
    fixedColumnWidths: List<Int>,
    colTypes: Map<String, ColType>,
    skipLines: Long,
    readLines: Long?,
    parserOptions: ParserOptions?,
    ignoreEmptyLines: Boolean,
    allowMissingColumns: Boolean,
    ignoreExcessColumns: Boolean,
    quote: Char,
    ignoreSurroundingSpaces: Boolean,
    trimInsideQuoted: Boolean,
    parseParallel: Boolean,
    compression: Compression<*>,
    adjustCsvSpecs: AdjustCsvSpecs,
): DataFrame<*> {
    // set up the csv specs for the Deephaven CSV reader
    val csvSpecs = with(CsvSpecs.builder()) {
        customDoubleParser(DataFrameCustomDoubleParser(parserOptions))

        // use the given nullStrings if provided, else take the global ones + some extras
        val nullStrings = parserOptions?.nullStrings ?: (DataFrame.parser.nulls + DEFAULT_DELIM_NULL_STRINGS)
        nullValueLiterals(nullStrings)
        // legalizeHeader de-duplicates clashing column titles
        headerLegalizer(::legalizeHeader)
        numRows(readLines ?: Long.MAX_VALUE)
        ignoreEmptyLines(ignoreEmptyLines)
        allowMissingColumns(allowMissingColumns)
        ignoreExcessColumns(ignoreExcessColumns)
        // a field delimiter is meaningless when columns are fixed-width
        if (!hasFixedWidthColumns) delimiter(delimiter)
        quote(quote)
        ignoreSurroundingSpaces(ignoreSurroundingSpaces)
        trim(trimInsideQuoted)
        concurrent(parseParallel)
        header(header)
        hasFixedWidthColumns(hasFixedWidthColumns)
        if (hasFixedWidthColumns && fixedColumnWidths.isNotEmpty()) fixedColumnWidths(fixedColumnWidths)
        // when the header comes from the csv itself, skipped lines count as header rows
        skipLines(takeHeaderFromCsv = header.isEmpty(), skipLines = skipLines)
        parsers(parserOptions, colTypes)

        // user hook: may configure/overwrite any of the settings above
        adjustCsvSpecs(this, this)
    }.build()

    val csvReaderResult = inputStream.useDecompressed(compression) { decompressedInputStream ->
        // read the csv
        try {
            val deBommedInputString = decompressedInputStream.skippingBomCharacters()

            // choose charset like: provided? -> from BOM? -> UTF-8
            val streamCharset = charset
                ?: (deBommedInputString as? BOMInputStream)?.bom?.let { Charset.forName(it.charsetName) }
                ?: Charsets.UTF_8

            @Suppress("ktlint:standard:comment-wrapping")
            CsvReader.read(
                /* specs = */ csvSpecs,
                /* stream = */ deBommedInputString,
                /* streamCharset = */ streamCharset,
                /* sinkFactory = */ ListSink.SINK_FACTORY,
            )
        } catch (e: CsvReaderException) {
            // catch case when the file is empty and header needs to be inferred from it.
            // NOTE(review): this matches the exact exception message, which is brittle across
            // Deephaven versions — confirm when upgrading the dependency.
            if (e.message ==
                "Can't proceed because hasHeaderRow is set but input file is empty or shorter than skipHeaderRows"
            ) {
                return@readDelimImpl DataFrame.empty()
            }
            throw IllegalStateException(
                "Could not read delimiter-separated data: CsvReaderException: ${e.message}: ${e.cause?.message ?: ""}",
                e,
            )
        }
    }

    // applies to every column that has no explicit entry in colTypes
    val defaultColType = colTypes[ColType.DEFAULT]

    // convert each ResultColumn to a DataColumn
    val cols = csvReaderResult.map {
        it.toDataColumn(
            parserOptions = parserOptions,
            desiredColType = colTypes[it.name()] ?: defaultColType,
        )
    }

    return dataFrameOf(cols)
}
|
||||
|
||||
/**
 * Converts a Deephaven [CsvReader.ResultColumn] to a DataFrame [DataColumn].
 *
 * Columns Deephaven already parsed to a non-String type are wrapped as-is.
 * String columns get a second pass: an explicit conversion when [desiredColType] is given,
 * else a best-effort `tryParse` that skips the types Deephaven handles itself.
 */
@Suppress("UNCHECKED_CAST")
private fun CsvReader.ResultColumn.toDataColumn(
    parserOptions: ParserOptions?,
    desiredColType: ColType?,
): DataColumn<*> {
    // the sink factory used in readDelimImpl is ListSink.SINK_FACTORY, so data() is always a ListSink
    val listSink = data()!! as ListSink
    val columnData = listSink.data
    val dataType = listSink.dataType
    val hasNulls = listSink.hasNulls
    // fix: `dataType` is the val read from the sink above (compared to STRING below), not a callable;
    // invoking it as `dataType()` does not compile
    val type = dataType.toKType().withNullability(hasNulls)

    val column = DataColumn.createValueColumn(
        name = name(),
        values = columnData,
        type = type,
    )
    if (dataType != STRING) return column

    // attempt to perform additional parsing if necessary, will remain String if it fails
    column as ValueColumn<String?>

    return when {
        desiredColType != null ->
            column.convertTo(
                newType = desiredColType.toKType().withNullability(true),
                parserOptions = parserOptions,
            )

        else -> {
            val givenSkipTypes = parserOptions?.skipTypes ?: DataFrame.parser.skipTypes
            // no need to check for types that Deephaven already parses, skip those too
            val adjustedSkipTypes = givenSkipTypes + typesDeephavenAlreadyParses
            val adjustedParserOptions = (parserOptions ?: ParserOptions())
                .copy(skipTypes = adjustedSkipTypes)

            column.tryParse(adjustedParserOptions)
        }
    }
}
|
||||
|
||||
/** Maps a Deephaven [DataType] to the Kotlin type its [ListSink] values carry. */
private fun DataType?.toKType(): KType =
    when (this) {
        null -> error("null data type")

        DataType.CUSTOM -> error("custom data type")

        BOOLEAN_AS_BYTE -> typeOf<Boolean>()

        // BYTE, SHORT, FLOAT: unused in Parsers.DEFAULT (FLOAT also unused in Parsers.COMPLETE)
        BYTE -> typeOf<Byte>()

        SHORT -> typeOf<Short>()

        FLOAT -> typeOf<Float>()

        INT -> typeOf<Int>()

        LONG -> typeOf<Long>()

        DOUBLE -> typeOf<Double>()

        DATETIME_AS_LONG -> typeOf<LocalDateTime>()

        // unused in Parsers.COMPLETE and Parsers.DEFAULT
        TIMESTAMP_AS_LONG -> typeOf<LocalDateTime>()

        CHAR -> typeOf<Char>()

        STRING -> typeOf<String>()
    }
|
||||
|
||||
/** Runs every header title through [ColumnNameGenerator.addUnique] so column names don't clash. */
private fun legalizeHeader(header: Array<String>): Array<String> {
    val nameGenerator = ColumnNameGenerator()
    return Array(header.size) { index -> nameGenerator.addUnique(header[index]) }
}
|
||||
|
||||
/**
 * Skips [skipLines] lines: as header rows when the header is taken from the csv itself,
 * else as plain data rows.
 */
private fun CsvSpecs.Builder.skipLines(takeHeaderFromCsv: Boolean, skipLines: Long): CsvSpecs.Builder =
    when {
        takeHeaderFromCsv -> skipHeaderRows(skipLines)
        else -> skipRows(skipLines)
    }
|
||||
|
||||
/**
 * Sets the correct parsers for the csv, based on [colTypes] and [ParserOptions.skipTypes].
 * If [ColType.DEFAULT] is present, it sets the default parser.
 *
 * Logic overview:
 *
 * - if no [colTypes] are given
 *     - let deephaven use all its [default parsers][Parsers.DEFAULT] minus [Parsers.DATETIME]
 *     - subtract parsers of [skipTypes][ParserOptions.skipTypes] if those are supplied
 * - if [colTypes] are supplied
 *     - if [ColType.DEFAULT] is among the values
 *         - set the parser for each supplied column+colType
 *         - let deephaven use _only_ the parser given as [ColType.DEFAULT] type
 *     - if [ColType.DEFAULT] is not among the values
 *         - set the parser for each supplied column+colType
 *         - let deephaven use all its [default parsers][Parsers.DEFAULT] minus [Parsers.DATETIME]
 *         - subtract parsers of [skipTypes][ParserOptions.skipTypes] if those are supplied
 *
 * We will not use [Deephaven's DateTime parser][Parsers.DATETIME].
 * This is done to avoid different behavior compared to [DataFrame.parse];
 * Deephaven parses [Instant] as [LocalDateTime]. [Issue #1047](https://github.com/Kotlin/dataframe/issues/1047)
 *
 * Note that `skipTypes` will never skip a type explicitly set by `colTypes`.
 * This is intended.
 */
|
||||
private fun CsvSpecs.Builder.parsers(parserOptions: ParserOptions?, colTypes: Map<String, ColType>): CsvSpecs.Builder {
    // every explicitly typed column gets its dedicated parser
    colTypes
        .asSequence()
        .filter { (colName, _) -> colName != ColType.DEFAULT }
        .forEach { (colName, colType) -> putParserForName(colName, colType.toCsvParser()) }

    // BOOLEAN, INT, LONG, DOUBLE, CHAR, STRING
    val defaultParsers = Parsers.DEFAULT - Parsers.DATETIME
    val skipTypes = parserOptions?.skipTypes ?: DataFrame.parser.skipTypes

    val parsersToUse = when {
        // a given DEFAULT type restricts inference to that parser (plus STRING as fallback)
        ColType.DEFAULT in colTypes ->
            listOf(colTypes[ColType.DEFAULT]!!.toCsvParser(), Parsers.STRING)

        // drop the parsers of all types the caller wants to skip
        skipTypes.isNotEmpty() -> {
            val parsersToSkip = skipTypes.mapNotNull { it.toColType().toCsvParserOrNull() }
            defaultParsers.toSet() - parsersToSkip.toSet()
        }

        else -> defaultParsers
    }
    parsers(parsersToUse)
    return this
}
|
||||
|
||||
/** Uses the given [header] as column titles, or reads the header from the csv when empty. */
private fun CsvSpecs.Builder.header(header: List<String>): CsvSpecs.Builder =
    if (header.isNotEmpty()) {
        // explicit titles given: don't consume a header row from the data
        hasHeaderRow(false)
            .headers(header)
    } else {
        // take header from csv
        hasHeaderRow(true)
    }
|
||||
|
||||
/**
 * Converts a [ColType] to a [Parser] from the Deephaven CSV library.
 * If no direct [Parser] exists, it returns `null`.
 */
|
||||
internal fun ColType.toCsvParserOrNull(): Parser<*>? =
    when (this) {
        ColType.Boolean -> Parsers.BOOLEAN
        ColType.Char -> Parsers.CHAR
        ColType.Int -> Parsers.INT
        ColType.Long -> Parsers.LONG
        ColType.Double -> Parsers.DOUBLE
        ColType.String -> Parsers.STRING
        // Deephaven has no direct parser for the remaining ColTypes
        else -> null
    }
|
||||
|
||||
/**
 * Converts a [ColType] to a [Parser] from the Deephaven CSV library.
 * If no direct [Parser] exists, it defaults to [Parsers.STRING] so that [DataFrame.parse] can handle it.
 */
|
||||
internal fun ColType.toCsvParser(): Parser<*> {
    // fall back to STRING so the column can still be handled by DataFrame.parse later
    return toCsvParserOrNull() ?: Parsers.STRING
}
|
||||
|
||||
/** Maps a [KType] (nullability ignored) to the closest [ColType], falling back to [ColType.String]. */
internal fun KType.toColType(): ColType =
    when (this.withNullability(false)) {
        // primitives
        typeOf<Boolean>() -> ColType.Boolean
        typeOf<Char>() -> ColType.Char
        typeOf<Int>() -> ColType.Int
        typeOf<Long>() -> ColType.Long
        typeOf<Double>() -> ColType.Double

        // big numbers
        typeOf<BigDecimal>() -> ColType.BigDecimal
        typeOf<BigInteger>() -> ColType.BigInteger

        // date and time
        typeOf<LocalDate>() -> ColType.LocalDate
        typeOf<LocalTime>() -> ColType.LocalTime
        typeOf<LocalDateTime>() -> ColType.LocalDateTime
        typeOf<DeprecatedInstant>() -> ColType.DeprecatedInstant
        typeOf<StdlibInstant>() -> ColType.StdlibInstant
        typeOf<Duration>() -> ColType.Duration

        // misc
        typeOf<URL>() -> ColType.Url
        typeOf<DataFrame<*>>() -> ColType.JsonArray
        typeOf<DataRow<*>>() -> ColType.JsonObject
        typeOf<String>() -> ColType.String

        else -> ColType.String
    }
|
||||
|
||||
/**
 * Types that Deephaven already parses, so we can skip them when
 * defaulting to DataFrame's String parsers.
 *
 * Mirrors the default parser set configured in `parsers()` (Boolean, Int, Long, Double, Char),
 * minus String, which is the fallback.
 *
 * [LocalDateTime] and [java.time.LocalDateTime] are not included because Deephaven cannot recognize all formats.
 */
internal val typesDeephavenAlreadyParses: Set<KType> =
    setOf(
        typeOf<Int>(),
        typeOf<Long>(),
        typeOf<Double>(),
        typeOf<Char>(),
        typeOf<Boolean>(),
    )
|
||||
+112
@@ -0,0 +1,112 @@
|
||||
@file:JvmName("WriteDelimDeephavenKt")
|
||||
|
||||
package org.jetbrains.kotlinx.dataframe.impl.io
|
||||
|
||||
import org.apache.commons.csv.CSVFormat
|
||||
import org.jetbrains.kotlinx.dataframe.AnyFrame
|
||||
import org.jetbrains.kotlinx.dataframe.AnyRow
|
||||
import org.jetbrains.kotlinx.dataframe.api.forEach
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.CommonWriteDelimDocs
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ADJUST_CSV_FORMAT
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.CSV_DELIMITER
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.WRITER_WRITE
|
||||
import org.jetbrains.kotlinx.dataframe.io.AdjustCSVFormat
|
||||
import org.jetbrains.kotlinx.dataframe.io.QuoteMode
|
||||
import org.jetbrains.kotlinx.dataframe.io.toJson
|
||||
import org.apache.commons.csv.QuoteMode as ApacheQuoteMode
|
||||
|
||||
/**
|
||||
* Writes [df] to [writer] in a delimiter-separated format.
|
||||
*
|
||||
* @param df The data to write.
|
||||
* @param writer The [Appendable] to write to.
|
||||
* @param delimiter The field delimiter character. Default: ','.
|
||||
*
|
||||
* Ignored if [hasFixedWidthColumns] is `true`.
|
||||
* @param includeHeader Whether to include the header in the output. Default: `true`.
|
||||
* @param quote The quote character. Default: `"`.
|
||||
*
|
||||
* Used when field- or line delimiters should be interpreted as literal text.
|
||||
*
|
||||
* For example: `123,"hello, there",456,` would correspond to: `123`; `hello, there`; `456`.
|
||||
* @param quoteMode The [QuoteMode][org.jetbrains.kotlinx.dataframe.io.QuoteMode] to use when writing CSV / TSV files.
|
||||
* Default: [QuoteMode.MINIMAL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.MINIMAL].
|
||||
* @param escapeChar The escape character to use when writing CSV / TSV files with [QuoteMode.NONE][org.jetbrains.kotlinx.dataframe.io.QuoteMode.NONE].
|
||||
* Default: `null`. This will double-quote the value.
|
||||
* @param commentChar The character that indicates a comment line in a CSV / TSV file.
|
||||
* Default: `'#'`.
|
||||
* @param headerComments A list of comments to include at the beginning of the CSV / TSV file.
|
||||
* Default: empty list.
|
||||
* @param recordSeparator The character that separates records in a CSV / TSV file.
|
||||
* Default: `'\n'`, a Unix-newline.
|
||||
* @param adjustCsvFormat Optional extra [CSVFormat] configuration. Default: `{ it }`.
|
||||
*
|
||||
* Before instantiating the [CSVFormat], the [CSVFormat.Builder] will be passed to this lambda.
|
||||
* This will allow you to configure/overwrite any CSV / TSV writing options.
|
||||
*/
|
||||
internal fun writeDelimImpl(
    df: AnyFrame,
    writer: Appendable,
    delimiter: Char,
    includeHeader: Boolean,
    quote: Char?,
    quoteMode: QuoteMode,
    escapeChar: Char?,
    commentChar: Char?,
    headerComments: List<String>,
    recordSeparator: String,
    adjustCsvFormat: AdjustCSVFormat,
) {
    // Converts a single cell for printing: AnyRow/AnyFrame become JSON, everything else passes through.
    fun serializeCell(cell: Any?): Any? =
        when (cell) {
            is AnyRow -> try {
                cell.toJson()
            } catch (_: NoClassDefFoundError) {
                error(
                    "Encountered a DataRow value when writing to csv/tsv/delim. It must be serialized to JSON, requiring the 'dataframe-json' dependency.",
                )
            }

            is AnyFrame -> try {
                cell.toJson()
            } catch (_: NoClassDefFoundError) {
                error(
                    "Encountered a DataFrame value when writing to csv/tsv/delim. It must be serialized to JSON, requiring the 'dataframe-json' dependency.",
                )
            }

            else -> cell
        }

    // set up the Apache Commons CSVFormat, letting the caller adjust it before it's built
    val format = CSVFormat.Builder.create(CSVFormat.DEFAULT)
        .apply {
            setDelimiter(delimiter)
            setQuote(quote)
            setSkipHeaderRecord(!includeHeader)
            setQuoteMode(quoteMode.toApache())
            setRecordSeparator(recordSeparator)
            setEscape(escapeChar)
            setCommentMarker(commentChar)
            setHeaderComments(*headerComments.toTypedArray())
        }
        .let { builder -> adjustCsvFormat(builder, builder) }
        .get()

    // the format does the actual writing; we only serialize nested rows/frames to JSON
    format.print(writer).use { printer ->
        if (includeHeader) {
            printer.printRecord(df.columnNames())
        }
        df.forEach { row ->
            printer.printRecord(row.values().map(::serializeCell))
        }
    }
}
|
||||
|
||||
/** Maps this DataFrame [QuoteMode] onto the equivalent Apache Commons CSV [ApacheQuoteMode]. */
internal fun QuoteMode.toApache(): ApacheQuoteMode =
    when (this) {
        QuoteMode.MINIMAL -> ApacheQuoteMode.MINIMAL
        QuoteMode.ALL -> ApacheQuoteMode.ALL
        QuoteMode.ALL_NON_NULL -> ApacheQuoteMode.ALL_NON_NULL
        QuoteMode.NON_NUMERIC -> ApacheQuoteMode.NON_NUMERIC
        QuoteMode.NONE -> ApacheQuoteMode.NONE
    }
|
||||
+26
@@ -0,0 +1,26 @@
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
/**
 * Defines quoting behavior when writing CSV / TSV files.
 *
 * Each entry maps one-to-one onto [org.apache.commons.csv.QuoteMode].
 */
public enum class QuoteMode {

    /** Quotes all fields. */
    ALL,

    /** Quotes all non-null fields. */
    ALL_NON_NULL,

    /**
     * Quotes fields that contain special characters such as a field delimiter, quote character, or any of the
     * characters in the line separator string.
     */
    MINIMAL,

    /** Quotes all non-numeric fields. */
    NON_NUMERIC,

    /**
     * Never quotes fields. When the delimiter occurs in data, the printer prefixes it with the escape character. If the
     * escape character is not set, format validation throws an exception.
     */
    NONE,
}
|
||||
Vendored
+39
@@ -0,0 +1,39 @@
|
||||
@file:JvmName("CsvDeephavenKt")
|
||||
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import org.jetbrains.kotlinx.dataframe.DataFrame
|
||||
import org.jetbrains.kotlinx.dataframe.codeGen.AbstractDefaultReadMethod
|
||||
import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams
|
||||
import java.io.File
|
||||
import java.io.InputStream
|
||||
import java.nio.file.Path
|
||||
import kotlin.reflect.typeOf
|
||||
|
||||
/**
 * [SupportedDataFrameFormat] implementation for CSV, delegating every read to [DataFrame.readCsv].
 *
 * @param delimiter the field delimiter character used for all reads.
 */
public class CsvDeephaven(private val delimiter: Char = DelimParams.CSV_DELIMITER) : SupportedDataFrameFormat {
    override fun readDataFrame(stream: InputStream, header: List<String>): DataFrame<*> =
        DataFrame.readCsv(inputStream = stream, delimiter = delimiter, header = header)

    override fun readDataFrame(file: File, header: List<String>): DataFrame<*> =
        DataFrame.readCsv(file = file, delimiter = delimiter, header = header)

    override fun readDataFrame(path: Path, header: List<String>): DataFrame<*> =
        DataFrame.readCsv(path = path, delimiter = delimiter, header = header)

    override fun acceptsExtension(ext: String): Boolean = ext == "csv"

    override fun acceptsSample(sample: SupportedFormatSample): Boolean = true // Extension is enough

    // NOTE(review): presumably the relative priority when probing formats — confirm against SupportedDataFrameFormat
    override val testOrder: Int = 20_000

    override fun createDefaultReadMethod(pathRepresentation: String?): DefaultReadDfMethod {
        val arguments = MethodArguments().add("delimiter", typeOf<Char>(), "'%L'", delimiter)
        return DefaultReadCsvMethod(pathRepresentation, arguments)
    }
}
|
||||
|
||||
// Name of the generated read method; matches the public `DataFrame.readCsv` entry point.
private const val READ_CSV = "readCsv"

/** Code-generation descriptor for a default `readCsv` call with the given arguments. */
internal class DefaultReadCsvMethod(path: String?, arguments: MethodArguments) :
    AbstractDefaultReadMethod(path, arguments, READ_CSV)
|
||||
+1053
File diff suppressed because it is too large
Load Diff
+175
@@ -0,0 +1,175 @@
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import org.jetbrains.kotlinx.dataframe.DataFrame
|
||||
import org.jetbrains.kotlinx.dataframe.api.ParserOptions
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.CommonReadDelimDocs
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ADJUST_CSV_SPECS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ALLOW_MISSING_COLUMNS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.COL_TYPES
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.CSV_DELIMITER
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.FIXED_COLUMN_WIDTHS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.HAS_FIXED_WIDTH_COLUMNS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.HEADER
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.IGNORE_EMPTY_LINES
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.IGNORE_EXCESS_COLUMNS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.IGNORE_SURROUNDING_SPACES
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.PARSER_OPTIONS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.PARSE_PARALLEL
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.QUOTE
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.READ_LINES
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.SKIP_LINES
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.TEXT_READ
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.TRIM_INSIDE_QUOTED
|
||||
import org.jetbrains.kotlinx.dataframe.impl.io.readDelimImpl
|
||||
|
||||
/**
|
||||
* ### Read CSV String to [DataFrame]
|
||||
*
|
||||
* Reads any CSV [String] to a [DataFrame][DataFrame].
|
||||
*
|
||||
* Parameters you can use to customize the reading process include, for instance, [delimiter],
|
||||
* [header], [colTypes], [readLines], and [parserOptions].
|
||||
* See the param list below for all settings.
|
||||
*
|
||||
* The integration is built upon [Deephaven CSV](https://github.com/deephaven/deephaven-csv).
|
||||
*
|
||||
* ##### Similar Functions
|
||||
* With the overloads of [DataFrame.readCsv][readCsv]`()`, you can read any CSV by [File][File],
|
||||
* [Path][java.nio.file.Path], [URL][URL], or [InputStream][InputStream].
|
||||
* Reading by file path or URL can also be done by passing a [String].
|
||||
*
|
||||
* For example, [DataFrame.readCsv][readCsv]`("input.csv")` or with some options:
|
||||
*
|
||||
* [DataFrame.readCsv][readCsv]`(`
|
||||
*
|
||||
* `file = `[File][File]`("input.csv"),`
|
||||
*
|
||||
* `parserOptions = `[ParserOptions][org.jetbrains.kotlinx.dataframe.api.ParserOptions]`(locale = `[Locale][java.util.Locale]`.`[US][java.util.Locale.US]`),`
|
||||
*
|
||||
* `colTypes = `[mapOf][mapOf]`("a" `[to][to]` `[ColType][ColType]`.`[Int][ColType.Int]`, `[ColType][ColType]`.`[DEFAULT][ColType.DEFAULT]` `[to][to]` `[ColType][ColType]`.`[String][ColType.String]`),`
|
||||
*
|
||||
* `readLines = 1000L,`
|
||||
*
|
||||
* `)`
|
||||
*
|
||||
* ZIP (.zip) or GZIP (.gz) files are supported by default. [compression] is automatically detected.
|
||||
*
|
||||
* You can also read "raw" CSV data from a [String] like this:
|
||||
*
|
||||
* [DataFrame.readCsvStr][readCsvStr]`("a,b,c", delimiter = ",")`
|
||||
*
|
||||
* @param text The raw data to read in the form of a [String].
|
||||
* @param delimiter The field delimiter character. Default: ','.
|
||||
*
|
||||
* Ignored if [hasFixedWidthColumns] is `true`.
|
||||
* @param charset The [character set][java.nio.charset.Charset] the input is encoded in.
|
||||
* Default: `null`
|
||||
*
|
||||
* If `null`, the Charset will be read from the BOM of the provided input,
|
||||
* defaulting to [UTF-8][Charsets.UTF_8] if no BOM is found.
|
||||
* @param header Optional column titles. Default: empty list.
|
||||
*
|
||||
* If non-empty, the data will be read with [header] as the column titles
|
||||
* (use [skipLines] if there's a header in the data).
|
||||
* If empty (default), the header will be read from the data.
|
||||
* @param hasFixedWidthColumns Whether the data has fixed-width columns instead of a single delimiter.
|
||||
* Default: `false`.
|
||||
*
|
||||
* Fixed-width columns can occur, for instance, in multi-space delimited data, where the columns are separated
|
||||
* by multiple spaces instead of a single delimiter, so columns are visually aligned.
|
||||
* Column widths are determined by the header in the data (if present), or manually by setting
|
||||
* [fixedColumnWidths].
|
||||
* @param fixedColumnWidths The fixed column widths. Default: empty list.
|
||||
*
|
||||
* Requires [hasFixedWidthColumns]. If empty, the column widths will be determined by the header in the data
|
||||
* (if present), else, this manually sets the column widths.
|
||||
* The number of widths should match the number of columns.
|
||||
* @param colTypes The expected [ColType] per column name. Default: empty map, a.k.a. infer every column type.
|
||||
*
|
||||
* If supplied for a certain column name (inferred from data or given by [header]),
|
||||
* the parser will parse the column with the specified name as the specified type, else it will infer the type.
|
||||
*
|
||||
* e.g. `colTypes = `[mapOf][mapOf]`("colName" `[to][to]` `[ColType][ColType]`.`[Int][ColType.Int]`)`.
|
||||
* You can also set [ColType][ColType]`.`[DEFAULT][ColType.DEFAULT]` `[to][to]` `[ColType][ColType]`.X`
|
||||
* to set a _default_ column type, like [ColType.String].
|
||||
* @param skipLines The number of lines to skip before reading the header and data. Default: `0`.
|
||||
*
|
||||
* Useful for files with metadata, or comments at the beginning, or to give a custom [header].
|
||||
* @param readLines The maximum number of lines to read from the data. Default: `null`.
|
||||
*
|
||||
* If `null`, all lines will be read.
|
||||
* @param parserOptions Optional [parsing options][ParserOptions] for columns initially read as [String].
|
||||
* Default, `null`.
|
||||
*
|
||||
* Can configure locale, date format, double parsing, skipping types, etc.
|
||||
*
|
||||
* If [parserOptions] or any of the arguments are `null`, the global parser configuration
|
||||
* ([DataFrame.parser][DataFrame.Companion.parser]) will be queried.
|
||||
*
|
||||
* The only exceptions are:
|
||||
* - [nullStrings][ParserOptions.nullStrings], which, if `null`,
|
||||
* will take the global setting + [["", "NA", "N/A", "null", "NULL", "None", "none", "NIL", "nil"]][org.jetbrains.kotlinx.dataframe.io.DEFAULT_DELIM_NULL_STRINGS].
|
||||
* - [skipTypes][ParserOptions.skipTypes], which will always add [typesDeephavenAlreadyParses][org.jetbrains.kotlinx.dataframe.impl.io.typesDeephavenAlreadyParses] to
|
||||
* the given types or the global setting.
|
||||
* @param ignoreEmptyLines Whether to skip intermediate empty lines. Default: `false`.
|
||||
*
|
||||
* If `false`, empty lines will be interpreted as having _empty_ values if [allowMissingColumns].
|
||||
* @param allowMissingColumns Whether to allow rows with fewer columns than the header. Default: `true`.
|
||||
*
|
||||
* If `true`, rows that are too short will be interpreted as _empty_ values.
|
||||
* @param ignoreExcessColumns Whether to ignore rows with more columns than the header. Default: `true`.
|
||||
*
|
||||
* If `true`, rows that are too long will have those columns dropped.
|
||||
* @param quote The quote character. Default: `"`.
|
||||
*
|
||||
* Used when field- or line delimiters should be interpreted as literal text.
|
||||
*
|
||||
* For example: `123,"hello, there",456,` would correspond to: `123`; `hello, there`; `456`.
|
||||
* @param ignoreSurroundingSpaces Whether to ignore leading and trailing blanks around non-quoted fields.
|
||||
* Default: `true`.
|
||||
* @param trimInsideQuoted Whether to ignore leading and trailing blanks inside quoted fields.
|
||||
* Default: `false`.
|
||||
* @param parseParallel Whether to parse the data in parallel. Default: `true`.
|
||||
*
|
||||
* If `true`, the data will be read and parsed in parallel by the Deephaven parser.
|
||||
* This is usually faster but can be turned off for debugging.
|
||||
*/
|
||||
public fun DataFrame.Companion.readCsvStr(
    text: String,
    delimiter: Char = CSV_DELIMITER,
    header: List<String> = HEADER,
    hasFixedWidthColumns: Boolean = HAS_FIXED_WIDTH_COLUMNS,
    fixedColumnWidths: List<Int> = FIXED_COLUMN_WIDTHS,
    colTypes: Map<String, ColType> = COL_TYPES,
    skipLines: Long = SKIP_LINES,
    readLines: Long? = READ_LINES,
    parserOptions: ParserOptions? = PARSER_OPTIONS,
    ignoreEmptyLines: Boolean = IGNORE_EMPTY_LINES,
    allowMissingColumns: Boolean = ALLOW_MISSING_COLUMNS,
    ignoreExcessColumns: Boolean = IGNORE_EXCESS_COLUMNS,
    quote: Char = QUOTE,
    ignoreSurroundingSpaces: Boolean = IGNORE_SURROUNDING_SPACES,
    trimInsideQuoted: Boolean = TRIM_INSIDE_QUOTED,
    parseParallel: Boolean = PARSE_PARALLEL,
): DataFrame<*> =
    readDelimImpl(
        inputStream = text.byteInputStream(),
        delimiter = delimiter,
        header = header,
        // an in-memory Kotlin String is always UTF-8 encoded and never compressed
        charset = Charsets.UTF_8,
        hasFixedWidthColumns = hasFixedWidthColumns,
        fixedColumnWidths = fixedColumnWidths,
        colTypes = colTypes,
        skipLines = skipLines,
        readLines = readLines,
        parserOptions = parserOptions,
        ignoreEmptyLines = ignoreEmptyLines,
        allowMissingColumns = allowMissingColumns,
        ignoreExcessColumns = ignoreExcessColumns,
        quote = quote,
        ignoreSurroundingSpaces = ignoreSurroundingSpaces,
        trimInsideQuoted = trimInsideQuoted,
        parseParallel = parseParallel,
        compression = Compression.None,
        adjustCsvSpecs = ADJUST_CSV_SPECS,
    )
|
||||
+1059
File diff suppressed because it is too large
Load Diff
+175
@@ -0,0 +1,175 @@
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import org.jetbrains.kotlinx.dataframe.DataFrame
|
||||
import org.jetbrains.kotlinx.dataframe.api.ParserOptions
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.CommonReadDelimDocs
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ADJUST_CSV_SPECS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ALLOW_MISSING_COLUMNS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.COL_TYPES
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.DELIM_DELIMITER
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.FIXED_COLUMN_WIDTHS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.HAS_FIXED_WIDTH_COLUMNS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.HEADER
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.IGNORE_EMPTY_LINES
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.IGNORE_EXCESS_COLUMNS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.IGNORE_SURROUNDING_SPACES
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.PARSER_OPTIONS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.PARSE_PARALLEL
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.QUOTE
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.READ_LINES
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.SKIP_LINES
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.TEXT_READ
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.TRIM_INSIDE_QUOTED
|
||||
import org.jetbrains.kotlinx.dataframe.impl.io.readDelimImpl
|
||||
|
||||
/**
|
||||
* ### Read Delimiter-Separated Text String to [DataFrame]
|
||||
*
|
||||
* Reads any delimiter-separated text [String] to a [DataFrame][DataFrame].
|
||||
*
|
||||
* Parameters you can use to customize the reading process include, for instance, [delimiter],
|
||||
* [header], [colTypes], [readLines], and [parserOptions].
|
||||
* See the param list below for all settings.
|
||||
*
|
||||
* The integration is built upon [Deephaven CSV](https://github.com/deephaven/deephaven-csv).
|
||||
*
|
||||
* ##### Similar Functions
|
||||
* With the overloads of [DataFrame.readDelim][readDelim]`()`, you can read any delimiter-separated text by [File][File],
|
||||
* [Path][java.nio.file.Path], [URL][URL], or [InputStream][InputStream].
|
||||
* Reading by file path or URL can also be done by passing a [String].
|
||||
*
|
||||
* For example, [DataFrame.readDelim][readDelim]`("input.txt")` or with some options:
|
||||
*
|
||||
* [DataFrame.readDelim][readDelim]`(`
|
||||
*
|
||||
* `file = `[File][File]`("input.txt"),`
|
||||
*
|
||||
* `parserOptions = `[ParserOptions][org.jetbrains.kotlinx.dataframe.api.ParserOptions]`(locale = `[Locale][java.util.Locale]`.`[US][java.util.Locale.US]`),`
|
||||
*
|
||||
* `colTypes = `[mapOf][mapOf]`("a" `[to][to]` `[ColType][ColType]`.`[Int][ColType.Int]`, `[ColType][ColType]`.`[DEFAULT][ColType.DEFAULT]` `[to][to]` `[ColType][ColType]`.`[String][ColType.String]`),`
|
||||
*
|
||||
* `readLines = 1000L,`
|
||||
*
|
||||
* `)`
|
||||
*
|
||||
* ZIP (.zip) or GZIP (.gz) files are supported by default. [compression] is automatically detected.
|
||||
*
|
||||
* You can also read "raw" delimiter-separated text data from a [String] like this:
|
||||
*
|
||||
* [DataFrame.readDelimStr][readDelimStr]`("a,b,c", delimiter = ",")`
|
||||
*
|
||||
* @param text The raw data to read in the form of a [String].
|
||||
* @param delimiter The field delimiter character. Default: ','.
|
||||
*
|
||||
* Ignored if [hasFixedWidthColumns] is `true`.
|
||||
* @param charset The [character set][java.nio.charset.Charset] the input is encoded in.
|
||||
* Default: `null`
|
||||
*
|
||||
* If `null`, the Charset will be read from the BOM of the provided input,
|
||||
* defaulting to [UTF-8][Charsets.UTF_8] if no BOM is found.
|
||||
* @param header Optional column titles. Default: empty list.
|
||||
*
|
||||
* If non-empty, the data will be read with [header] as the column titles
|
||||
* (use [skipLines] if there's a header in the data).
|
||||
* If empty (default), the header will be read from the data.
|
||||
* @param hasFixedWidthColumns Whether the data has fixed-width columns instead of a single delimiter.
|
||||
* Default: `false`.
|
||||
*
|
||||
* Fixed-width columns can occur, for instance, in multi-space delimited data, where the columns are separated
|
||||
* by multiple spaces instead of a single delimiter, so columns are visually aligned.
|
||||
* Column widths are determined by the header in the data (if present), or manually by setting
|
||||
* [fixedColumnWidths].
|
||||
* @param fixedColumnWidths The fixed column widths. Default: empty list.
|
||||
*
|
||||
* Requires [hasFixedWidthColumns]. If empty, the column widths will be determined by the header in the data
|
||||
* (if present), else, this manually sets the column widths.
|
||||
* The number of widths should match the number of columns.
|
||||
* @param colTypes The expected [ColType] per column name. Default: empty map, a.k.a. infer every column type.
|
||||
*
|
||||
* If supplied for a certain column name (inferred from data or given by [header]),
|
||||
* the parser will parse the column with the specified name as the specified type, else it will infer the type.
|
||||
*
|
||||
* e.g. `colTypes = `[mapOf][mapOf]`("colName" `[to][to]` `[ColType][ColType]`.`[Int][ColType.Int]`)`.
|
||||
* You can also set [ColType][ColType]`.`[DEFAULT][ColType.DEFAULT]` `[to][to]` `[ColType][ColType]`.X`
|
||||
* to set a _default_ column type, like [ColType.String].
|
||||
* @param skipLines The number of lines to skip before reading the header and data. Default: `0`.
|
||||
*
|
||||
* Useful for files with metadata, or comments at the beginning, or to give a custom [header].
|
||||
* @param readLines The maximum number of lines to read from the data. Default: `null`.
|
||||
*
|
||||
* If `null`, all lines will be read.
|
||||
* @param parserOptions Optional [parsing options][ParserOptions] for columns initially read as [String].
|
||||
* Default, `null`.
|
||||
*
|
||||
* Can configure locale, date format, double parsing, skipping types, etc.
|
||||
*
|
||||
* If [parserOptions] or any of the arguments are `null`, the global parser configuration
|
||||
* ([DataFrame.parser][DataFrame.Companion.parser]) will be queried.
|
||||
*
|
||||
* The only exceptions are:
|
||||
* - [nullStrings][ParserOptions.nullStrings], which, if `null`,
|
||||
* will take the global setting + [["", "NA", "N/A", "null", "NULL", "None", "none", "NIL", "nil"]][org.jetbrains.kotlinx.dataframe.io.DEFAULT_DELIM_NULL_STRINGS].
|
||||
* - [skipTypes][ParserOptions.skipTypes], which will always add [typesDeephavenAlreadyParses][org.jetbrains.kotlinx.dataframe.impl.io.typesDeephavenAlreadyParses] to
|
||||
* the given types or the global setting.
|
||||
* @param ignoreEmptyLines Whether to skip intermediate empty lines. Default: `false`.
|
||||
*
|
||||
* If `false`, empty lines will be interpreted as having _empty_ values if [allowMissingColumns].
|
||||
* @param allowMissingColumns Whether to allow rows with fewer columns than the header. Default: `true`.
|
||||
*
|
||||
* If `true`, rows that are too short will be interpreted as _empty_ values.
|
||||
* @param ignoreExcessColumns Whether to ignore rows with more columns than the header. Default: `true`.
|
||||
*
|
||||
* If `true`, rows that are too long will have those columns dropped.
|
||||
* @param quote The quote character. Default: `"`.
|
||||
*
|
||||
* Used when field- or line delimiters should be interpreted as literal text.
|
||||
*
|
||||
* For example: `123,"hello, there",456,` would correspond to: `123`; `hello, there`; `456`.
|
||||
* @param ignoreSurroundingSpaces Whether to ignore leading and trailing blanks around non-quoted fields.
|
||||
* Default: `true`.
|
||||
* @param trimInsideQuoted Whether to ignore leading and trailing blanks inside quoted fields.
|
||||
* Default: `false`.
|
||||
* @param parseParallel Whether to parse the data in parallel. Default: `true`.
|
||||
*
|
||||
* If `true`, the data will be read and parsed in parallel by the Deephaven parser.
|
||||
* This is usually faster but can be turned off for debugging.
|
||||
*/
|
||||
public fun DataFrame.Companion.readDelimStr(
|
||||
text: String,
|
||||
delimiter: Char = DELIM_DELIMITER,
|
||||
header: List<String> = HEADER,
|
||||
hasFixedWidthColumns: Boolean = HAS_FIXED_WIDTH_COLUMNS,
|
||||
fixedColumnWidths: List<Int> = FIXED_COLUMN_WIDTHS,
|
||||
colTypes: Map<String, ColType> = COL_TYPES,
|
||||
skipLines: Long = SKIP_LINES,
|
||||
readLines: Long? = READ_LINES,
|
||||
parserOptions: ParserOptions? = PARSER_OPTIONS,
|
||||
ignoreEmptyLines: Boolean = IGNORE_EMPTY_LINES,
|
||||
allowMissingColumns: Boolean = ALLOW_MISSING_COLUMNS,
|
||||
ignoreExcessColumns: Boolean = IGNORE_EXCESS_COLUMNS,
|
||||
quote: Char = QUOTE,
|
||||
ignoreSurroundingSpaces: Boolean = IGNORE_SURROUNDING_SPACES,
|
||||
trimInsideQuoted: Boolean = TRIM_INSIDE_QUOTED,
|
||||
parseParallel: Boolean = PARSE_PARALLEL,
|
||||
): DataFrame<*> =
|
||||
readDelimImpl(
|
||||
inputStream = text.byteInputStream(),
|
||||
charset = Charsets.UTF_8,
|
||||
delimiter = delimiter,
|
||||
header = header,
|
||||
hasFixedWidthColumns = hasFixedWidthColumns,
|
||||
fixedColumnWidths = fixedColumnWidths,
|
||||
compression = Compression.None, // of course
|
||||
colTypes = colTypes,
|
||||
skipLines = skipLines,
|
||||
readLines = readLines,
|
||||
parserOptions = parserOptions,
|
||||
ignoreEmptyLines = ignoreEmptyLines,
|
||||
allowMissingColumns = allowMissingColumns,
|
||||
ignoreExcessColumns = ignoreExcessColumns,
|
||||
quote = quote,
|
||||
ignoreSurroundingSpaces = ignoreSurroundingSpaces,
|
||||
trimInsideQuoted = trimInsideQuoted,
|
||||
parseParallel = parseParallel,
|
||||
adjustCsvSpecs = ADJUST_CSV_SPECS,
|
||||
)
|
||||
+1053
File diff suppressed because it is too large
Load Diff
+175
@@ -0,0 +1,175 @@
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import org.jetbrains.kotlinx.dataframe.DataFrame
|
||||
import org.jetbrains.kotlinx.dataframe.api.ParserOptions
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.CommonReadDelimDocs
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ADJUST_CSV_SPECS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ALLOW_MISSING_COLUMNS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.COL_TYPES
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.FIXED_COLUMN_WIDTHS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.HAS_FIXED_WIDTH_COLUMNS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.HEADER
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.IGNORE_EMPTY_LINES
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.IGNORE_EXCESS_COLUMNS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.IGNORE_SURROUNDING_SPACES
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.PARSER_OPTIONS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.PARSE_PARALLEL
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.QUOTE
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.READ_LINES
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.SKIP_LINES
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.TEXT_READ
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.TRIM_INSIDE_QUOTED
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.TSV_DELIMITER
|
||||
import org.jetbrains.kotlinx.dataframe.impl.io.readDelimImpl
|
||||
|
||||
/**
|
||||
* ### Read TSV String to [DataFrame]
|
||||
*
|
||||
* Reads any TSV [String] to a [DataFrame][DataFrame].
|
||||
*
|
||||
* Parameters you can use to customize the reading process include, for instance, [delimiter],
|
||||
* [header], [colTypes], [readLines], and [parserOptions].
|
||||
* See the param list below for all settings.
|
||||
*
|
||||
* The integration is built upon [Deephaven CSV](https://github.com/deephaven/deephaven-csv).
|
||||
*
|
||||
* ##### Similar Functions
|
||||
* With the overloads of [DataFrame.readTsv][readTsv]`()`, you can read any TSV by [File][File],
|
||||
* [Path][java.nio.file.Path], [URL][URL], or [InputStream][InputStream].
|
||||
* Reading by file path or URL can also be done by passing a [String].
|
||||
*
|
||||
* For example, [DataFrame.readTsv][readTsv]`("input.tsv")` or with some options:
|
||||
*
|
||||
* [DataFrame.readTsv][readTsv]`(`
|
||||
*
|
||||
* `file = `[File][File]`("input.tsv"),`
|
||||
*
|
||||
* `parserOptions = `[ParserOptions][org.jetbrains.kotlinx.dataframe.api.ParserOptions]`(locale = `[Locale][java.util.Locale]`.`[US][java.util.Locale.US]`),`
|
||||
*
|
||||
* `colTypes = `[mapOf][mapOf]`("a" `[to][to]` `[ColType][ColType]`.`[Int][ColType.Int]`, `[ColType][ColType]`.`[DEFAULT][ColType.DEFAULT]` `[to][to]` `[ColType][ColType]`.`[String][ColType.String]`),`
|
||||
*
|
||||
* `readLines = 1000L,`
|
||||
*
|
||||
* `)`
|
||||
*
|
||||
* ZIP (.zip) or GZIP (.gz) files are supported by default. [compression] is automatically detected.
|
||||
*
|
||||
* You can also read "raw" TSV data from a [String] like this:
|
||||
*
|
||||
* [DataFrame.readTsvStr][readTsvStr]`("a,b,c", delimiter = ",")`
|
||||
*
|
||||
* @param text The raw data to read in the form of a [String].
|
||||
* @param delimiter The field delimiter character. Default: '\t'.
|
||||
*
|
||||
* Ignored if [hasFixedWidthColumns] is `true`.
|
||||
* @param charset The [character set][java.nio.charset.Charset] the input is encoded in.
|
||||
* Default: `null`
|
||||
*
|
||||
* If `null`, the Charset will be read from the BOM of the provided input,
|
||||
* defaulting to [UTF-8][Charsets.UTF_8] if no BOM is found.
|
||||
* @param header Optional column titles. Default: empty list.
|
||||
*
|
||||
* If non-empty, the data will be read with [header] as the column titles
|
||||
* (use [skipLines] if there's a header in the data).
|
||||
* If empty (default), the header will be read from the data.
|
||||
* @param hasFixedWidthColumns Whether the data has fixed-width columns instead of a single delimiter.
|
||||
* Default: `false`.
|
||||
*
|
||||
* Fixed-width columns can occur, for instance, in multi-space delimited data, where the columns are separated
|
||||
* by multiple spaces instead of a single delimiter, so columns are visually aligned.
|
||||
* Column widths are determined by the header in the data (if present), or manually by setting
|
||||
* [fixedColumnWidths].
|
||||
* @param fixedColumnWidths The fixed column widths. Default: empty list.
|
||||
*
|
||||
* Requires [hasFixedWidthColumns]. If empty, the column widths will be determined by the header in the data
|
||||
* (if present), else, this manually sets the column widths.
|
||||
* The number of widths should match the number of columns.
|
||||
* @param colTypes The expected [ColType] per column name. Default: empty map, a.k.a. infer every column type.
|
||||
*
|
||||
* If supplied for a certain column name (inferred from data or given by [header]),
|
||||
* the parser will parse the column with the specified name as the specified type, else it will infer the type.
|
||||
*
|
||||
* e.g. `colTypes = `[mapOf][mapOf]`("colName" `[to][to]` `[ColType][ColType]`.`[Int][ColType.Int]`)`.
|
||||
* You can also set [ColType][ColType]`.`[DEFAULT][ColType.DEFAULT]` `[to][to]` `[ColType][ColType]`.X`
|
||||
* to set a _default_ column type, like [ColType.String].
|
||||
* @param skipLines The number of lines to skip before reading the header and data. Default: `0`.
|
||||
*
|
||||
* Useful for files with metadata, or comments at the beginning, or to give a custom [header].
|
||||
* @param readLines The maximum number of lines to read from the data. Default: `null`.
|
||||
*
|
||||
* If `null`, all lines will be read.
|
||||
* @param parserOptions Optional [parsing options][ParserOptions] for columns initially read as [String].
|
||||
* Default, `null`.
|
||||
*
|
||||
* Can configure locale, date format, double parsing, skipping types, etc.
|
||||
*
|
||||
* If [parserOptions] or any of the arguments are `null`, the global parser configuration
|
||||
* ([DataFrame.parser][DataFrame.Companion.parser]) will be queried.
|
||||
*
|
||||
* The only exceptions are:
|
||||
* - [nullStrings][ParserOptions.nullStrings], which, if `null`,
|
||||
* will take the global setting + [["", "NA", "N/A", "null", "NULL", "None", "none", "NIL", "nil"]][org.jetbrains.kotlinx.dataframe.io.DEFAULT_DELIM_NULL_STRINGS].
|
||||
* - [skipTypes][ParserOptions.skipTypes], which will always add [typesDeephavenAlreadyParses][org.jetbrains.kotlinx.dataframe.impl.io.typesDeephavenAlreadyParses] to
|
||||
* the given types or the global setting.
|
||||
* @param ignoreEmptyLines Whether to skip intermediate empty lines. Default: `false`.
|
||||
*
|
||||
* If `false`, empty lines will be interpreted as having _empty_ values if [allowMissingColumns].
|
||||
* @param allowMissingColumns Whether to allow rows with fewer columns than the header. Default: `true`.
|
||||
*
|
||||
* If `true`, rows that are too short will be interpreted as _empty_ values.
|
||||
* @param ignoreExcessColumns Whether to ignore rows with more columns than the header. Default: `true`.
|
||||
*
|
||||
* If `true`, rows that are too long will have those columns dropped.
|
||||
* @param quote The quote character. Default: `"`.
|
||||
*
|
||||
* Used when field- or line delimiters should be interpreted as literal text.
|
||||
*
|
||||
* For example: `123,"hello, there",456,` would correspond to: `123`; `hello, there`; `456`.
|
||||
* @param ignoreSurroundingSpaces Whether to ignore leading and trailing blanks around non-quoted fields.
|
||||
* Default: `true`.
|
||||
* @param trimInsideQuoted Whether to ignore leading and trailing blanks inside quoted fields.
|
||||
* Default: `false`.
|
||||
* @param parseParallel Whether to parse the data in parallel. Default: `true`.
|
||||
*
|
||||
* If `true`, the data will be read and parsed in parallel by the Deephaven parser.
|
||||
* This is usually faster but can be turned off for debugging.
|
||||
*/
|
||||
public fun DataFrame.Companion.readTsvStr(
|
||||
text: String,
|
||||
delimiter: Char = TSV_DELIMITER,
|
||||
header: List<String> = HEADER,
|
||||
hasFixedWidthColumns: Boolean = HAS_FIXED_WIDTH_COLUMNS,
|
||||
fixedColumnWidths: List<Int> = FIXED_COLUMN_WIDTHS,
|
||||
colTypes: Map<String, ColType> = COL_TYPES,
|
||||
skipLines: Long = SKIP_LINES,
|
||||
readLines: Long? = READ_LINES,
|
||||
parserOptions: ParserOptions? = PARSER_OPTIONS,
|
||||
ignoreEmptyLines: Boolean = IGNORE_EMPTY_LINES,
|
||||
allowMissingColumns: Boolean = ALLOW_MISSING_COLUMNS,
|
||||
ignoreExcessColumns: Boolean = IGNORE_EXCESS_COLUMNS,
|
||||
quote: Char = QUOTE,
|
||||
ignoreSurroundingSpaces: Boolean = IGNORE_SURROUNDING_SPACES,
|
||||
trimInsideQuoted: Boolean = TRIM_INSIDE_QUOTED,
|
||||
parseParallel: Boolean = PARSE_PARALLEL,
|
||||
): DataFrame<*> =
|
||||
readDelimImpl(
|
||||
inputStream = text.byteInputStream(),
|
||||
charset = Charsets.UTF_8,
|
||||
delimiter = delimiter,
|
||||
header = header,
|
||||
hasFixedWidthColumns = hasFixedWidthColumns,
|
||||
fixedColumnWidths = fixedColumnWidths,
|
||||
compression = Compression.None, // of course
|
||||
colTypes = colTypes,
|
||||
skipLines = skipLines,
|
||||
readLines = readLines,
|
||||
parserOptions = parserOptions,
|
||||
ignoreEmptyLines = ignoreEmptyLines,
|
||||
allowMissingColumns = allowMissingColumns,
|
||||
ignoreExcessColumns = ignoreExcessColumns,
|
||||
quote = quote,
|
||||
ignoreSurroundingSpaces = ignoreSurroundingSpaces,
|
||||
trimInsideQuoted = trimInsideQuoted,
|
||||
parseParallel = parseParallel,
|
||||
adjustCsvSpecs = ADJUST_CSV_SPECS,
|
||||
)
|
||||
+83
@@ -0,0 +1,83 @@
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import org.jetbrains.kotlinx.dataframe.AnyFrame
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.CommonWriteDelimDocs
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ADJUST_CSV_FORMAT
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.COMMENT_CHAR
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.CSV_DELIMITER
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ESCAPE_CHAR
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.HEADER_COMMENTS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.INCLUDE_HEADER
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.QUOTE
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.QUOTE_MODE
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.RECORD_SEPARATOR
|
||||
import org.jetbrains.kotlinx.dataframe.impl.io.writeDelimImpl
|
||||
|
||||
/**
|
||||
* ### Convert [DataFrame] to CSV String
|
||||
*
|
||||
* Converts [this][this] [DataFrame][DataFrame] to a CSV [String].
|
||||
*
|
||||
* Parameters you can use to customize the process include, for instance, [delimiter],
|
||||
* [includeHeader], [quoteMode], and [headerComments].
|
||||
* See the param list below for all settings.
|
||||
*
|
||||
* The integration is built upon [Apache Commons CSV](https://commons.apache.org/proper/commons-csv/).
|
||||
*
|
||||
* ##### Similar Functions
|
||||
* With overloads of [DataFrame.writeCsv][writeCsv]`()`, you can write CSV to [File][File], [Path][java.nio.file.Path],
|
||||
* [Appendable], or [String].
|
||||
*
|
||||
* For example, [DataFrame.writeCsv][writeCsv]`("output.csv")`
|
||||
*
|
||||
* or [DataFrame.writeCsv][writeCsv]`(`[File][File]`("output.csv"), quoteMode = `[QuoteMode.ALL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.ALL]`)`
|
||||
*
|
||||
* Converting to a [String] can be done like this:
|
||||
*
|
||||
* [DataFrame.toCsvStr][toCsvStr]`(delimiter = ",")`
|
||||
*
|
||||
* @param delimiter The field delimiter character. Default: ','.
|
||||
*
|
||||
* Ignored if [hasFixedWidthColumns] is `true`.
|
||||
* @param includeHeader Whether to include the header in the output. Default: `true`.
|
||||
* @param quote The quote character. Default: `"`.
|
||||
*
|
||||
* Used when field- or line delimiters should be interpreted as literal text.
|
||||
*
|
||||
* For example: `123,"hello, there",456,` would correspond to: `123`; `hello, there`; `456`.
|
||||
* @param quoteMode The [QuoteMode][org.jetbrains.kotlinx.dataframe.io.QuoteMode] to use when writing CSV / TSV files.
|
||||
* Default: [QuoteMode.MINIMAL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.MINIMAL].
|
||||
* @param escapeChar The escape character to use when writing CSV / TSV files with [QuoteMode.NONE][org.jetbrains.kotlinx.dataframe.io.QuoteMode.NONE].
|
||||
* Default: `null`. This will double-quote the value.
|
||||
* @param commentChar The character that indicates a comment line in a CSV / TSV file.
|
||||
* Default: `'#'`.
|
||||
* @param headerComments A list of comments to include at the beginning of the CSV / TSV file.
|
||||
* Default: empty list.
|
||||
* @param recordSeparator The character that separates records in a CSV / TSV file.
|
||||
* Default: `'\n'`, a Unix-newline.
|
||||
*/
|
||||
public fun AnyFrame.toCsvStr(
|
||||
delimiter: Char = CSV_DELIMITER,
|
||||
includeHeader: Boolean = INCLUDE_HEADER,
|
||||
quote: Char? = QUOTE,
|
||||
quoteMode: QuoteMode = QUOTE_MODE,
|
||||
escapeChar: Char? = ESCAPE_CHAR,
|
||||
commentChar: Char? = COMMENT_CHAR,
|
||||
headerComments: List<String> = HEADER_COMMENTS,
|
||||
recordSeparator: String = RECORD_SEPARATOR,
|
||||
): String =
|
||||
buildString {
|
||||
writeDelimImpl(
|
||||
df = this@toCsvStr,
|
||||
writer = this,
|
||||
delimiter = delimiter,
|
||||
includeHeader = includeHeader,
|
||||
quote = quote,
|
||||
quoteMode = quoteMode,
|
||||
escapeChar = escapeChar,
|
||||
commentChar = commentChar,
|
||||
headerComments = headerComments,
|
||||
recordSeparator = recordSeparator,
|
||||
adjustCsvFormat = ADJUST_CSV_FORMAT,
|
||||
)
|
||||
}
|
||||
+83
@@ -0,0 +1,83 @@
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import org.jetbrains.kotlinx.dataframe.AnyFrame
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.CommonWriteDelimDocs
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ADJUST_CSV_FORMAT
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.COMMENT_CHAR
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.DELIM_DELIMITER
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ESCAPE_CHAR
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.HEADER_COMMENTS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.INCLUDE_HEADER
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.QUOTE
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.QUOTE_MODE
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.RECORD_SEPARATOR
|
||||
import org.jetbrains.kotlinx.dataframe.impl.io.writeDelimImpl
|
||||
|
||||
/**
|
||||
* ### Convert [DataFrame] to Delimiter-Separated Text String
|
||||
*
|
||||
* Converts [this][this] [DataFrame][DataFrame] to a delimiter-separated text [String].
|
||||
*
|
||||
* Parameters you can use to customize the process include, for instance, [delimiter],
|
||||
* [includeHeader], [quoteMode], and [headerComments].
|
||||
* See the param list below for all settings.
|
||||
*
|
||||
* The integration is built upon [Apache Commons CSV](https://commons.apache.org/proper/commons-csv/).
|
||||
*
|
||||
* ##### Similar Functions
|
||||
* With overloads of [DataFrame.writeDelim][writeDelim]`()`, you can write delimiter-separated text to [File][File], [Path][java.nio.file.Path],
|
||||
* [Appendable], or [String].
|
||||
*
|
||||
* For example, [DataFrame.writeDelim][writeDelim]`("output.txt")`
|
||||
*
|
||||
* or [DataFrame.writeDelim][writeDelim]`(`[File][File]`("output.txt"), quoteMode = `[QuoteMode.ALL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.ALL]`)`
|
||||
*
|
||||
* Converting to a [String] can be done like this:
|
||||
*
|
||||
* [DataFrame.toDelimStr][toDelimStr]`(delimiter = ",")`
|
||||
*
|
||||
* @param delimiter The field delimiter character. Default: ','.
|
||||
*
|
||||
* Ignored if [hasFixedWidthColumns] is `true`.
|
||||
* @param includeHeader Whether to include the header in the output. Default: `true`.
|
||||
* @param quote The quote character. Default: `"`.
|
||||
*
|
||||
* Used when field- or line delimiters should be interpreted as literal text.
|
||||
*
|
||||
* For example: `123,"hello, there",456,` would correspond to: `123`; `hello, there`; `456`.
|
||||
* @param quoteMode The [QuoteMode][org.jetbrains.kotlinx.dataframe.io.QuoteMode] to use when writing CSV / TSV files.
|
||||
* Default: [QuoteMode.MINIMAL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.MINIMAL].
|
||||
* @param escapeChar The escape character to use when writing CSV / TSV files with [QuoteMode.NONE][org.jetbrains.kotlinx.dataframe.io.QuoteMode.NONE].
|
||||
* Default: `null`. This will double-quote the value.
|
||||
* @param commentChar The character that indicates a comment line in a CSV / TSV file.
|
||||
* Default: `'#'`.
|
||||
* @param headerComments A list of comments to include at the beginning of the CSV / TSV file.
|
||||
* Default: empty list.
|
||||
* @param recordSeparator The character that separates records in a CSV / TSV file.
|
||||
* Default: `'\n'`, a Unix-newline.
|
||||
*/
|
||||
public fun AnyFrame.toDelimStr(
|
||||
delimiter: Char = DELIM_DELIMITER,
|
||||
includeHeader: Boolean = INCLUDE_HEADER,
|
||||
quote: Char? = QUOTE,
|
||||
quoteMode: QuoteMode = QUOTE_MODE,
|
||||
escapeChar: Char? = ESCAPE_CHAR,
|
||||
commentChar: Char? = COMMENT_CHAR,
|
||||
headerComments: List<String> = HEADER_COMMENTS,
|
||||
recordSeparator: String = RECORD_SEPARATOR,
|
||||
): String =
|
||||
buildString {
|
||||
writeDelimImpl(
|
||||
df = this@toDelimStr,
|
||||
writer = this,
|
||||
delimiter = delimiter,
|
||||
includeHeader = includeHeader,
|
||||
quote = quote,
|
||||
quoteMode = quoteMode,
|
||||
escapeChar = escapeChar,
|
||||
commentChar = commentChar,
|
||||
headerComments = headerComments,
|
||||
recordSeparator = recordSeparator,
|
||||
adjustCsvFormat = ADJUST_CSV_FORMAT,
|
||||
)
|
||||
}
|
||||
+83
@@ -0,0 +1,83 @@
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import org.jetbrains.kotlinx.dataframe.AnyFrame
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.CommonWriteDelimDocs
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ADJUST_CSV_FORMAT
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.COMMENT_CHAR
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ESCAPE_CHAR
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.HEADER_COMMENTS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.INCLUDE_HEADER
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.QUOTE
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.QUOTE_MODE
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.RECORD_SEPARATOR
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.TSV_DELIMITER
|
||||
import org.jetbrains.kotlinx.dataframe.impl.io.writeDelimImpl
|
||||
|
||||
/**
|
||||
* ### Convert [DataFrame] to TSV String
|
||||
*
|
||||
* Converts [this][this] [DataFrame][DataFrame] to a TSV [String].
|
||||
*
|
||||
* Parameters you can use to customize the process include, for instance, [delimiter],
|
||||
* [includeHeader], [quoteMode], and [headerComments].
|
||||
* See the param list below for all settings.
|
||||
*
|
||||
* The integration is built upon [Apache Commons CSV](https://commons.apache.org/proper/commons-csv/).
|
||||
*
|
||||
* ##### Similar Functions
|
||||
* With overloads of [DataFrame.writeTsv][writeTsv]`()`, you can write TSV to [File][File], [Path][java.nio.file.Path],
|
||||
* [Appendable], or [String].
|
||||
*
|
||||
* For example, [DataFrame.writeTsv][writeTsv]`("output.tsv")`
|
||||
*
|
||||
* or [DataFrame.writeTsv][writeTsv]`(`[File][File]`("output.tsv"), quoteMode = `[QuoteMode.ALL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.ALL]`)`
|
||||
*
|
||||
* Converting to a [String] can be done like this:
|
||||
*
|
||||
* [DataFrame.toTsvStr][toTsvStr]`(delimiter = ",")`
|
||||
*
|
||||
* @param delimiter The field delimiter character. Default: '\t'.
|
||||
*
|
||||
* Ignored if [hasFixedWidthColumns] is `true`.
|
||||
* @param includeHeader Whether to include the header in the output. Default: `true`.
|
||||
* @param quote The quote character. Default: `"`.
|
||||
*
|
||||
* Used when field- or line delimiters should be interpreted as literal text.
|
||||
*
|
||||
* For example: `123,"hello, there",456,` would correspond to: `123`; `hello, there`; `456`.
|
||||
* @param quoteMode The [QuoteMode][org.jetbrains.kotlinx.dataframe.io.QuoteMode] to use when writing CSV / TSV files.
|
||||
* Default: [QuoteMode.MINIMAL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.MINIMAL].
|
||||
* @param escapeChar The escape character to use when writing CSV / TSV files with [QuoteMode.NONE][org.jetbrains.kotlinx.dataframe.io.QuoteMode.NONE].
|
||||
* Default: `null`. This will double-quote the value.
|
||||
* @param commentChar The character that indicates a comment line in a CSV / TSV file.
|
||||
* Default: `'#'`.
|
||||
* @param headerComments A list of comments to include at the beginning of the CSV / TSV file.
|
||||
* Default: empty list.
|
||||
* @param recordSeparator The character that separates records in a CSV / TSV file.
|
||||
* Default: `'\n'`, a Unix-newline.
|
||||
*/
|
||||
public fun AnyFrame.toTsvStr(
|
||||
includeHeader: Boolean = INCLUDE_HEADER,
|
||||
delimiter: Char = TSV_DELIMITER,
|
||||
quote: Char? = QUOTE,
|
||||
quoteMode: QuoteMode = QUOTE_MODE,
|
||||
escapeChar: Char? = ESCAPE_CHAR,
|
||||
commentChar: Char? = COMMENT_CHAR,
|
||||
headerComments: List<String> = HEADER_COMMENTS,
|
||||
recordSeparator: String = RECORD_SEPARATOR,
|
||||
): String =
|
||||
buildString {
|
||||
writeDelimImpl(
|
||||
df = this@toTsvStr,
|
||||
writer = this,
|
||||
delimiter = delimiter,
|
||||
includeHeader = includeHeader,
|
||||
quote = quote,
|
||||
quoteMode = quoteMode,
|
||||
escapeChar = escapeChar,
|
||||
commentChar = commentChar,
|
||||
headerComments = headerComments,
|
||||
recordSeparator = recordSeparator,
|
||||
adjustCsvFormat = ADJUST_CSV_FORMAT,
|
||||
)
|
||||
}
|
||||
Vendored
+39
@@ -0,0 +1,39 @@
|
||||
@file:JvmName("TsvDeephavenKt")
|
||||
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import org.jetbrains.kotlinx.dataframe.DataFrame
|
||||
import org.jetbrains.kotlinx.dataframe.codeGen.AbstractDefaultReadMethod
|
||||
import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams
|
||||
import java.io.File
|
||||
import java.io.InputStream
|
||||
import java.nio.file.Path
|
||||
import kotlin.reflect.typeOf
|
||||
|
||||
public class TsvDeephaven(private val delimiter: Char = DelimParams.TSV_DELIMITER) : SupportedDataFrameFormat {
|
||||
override fun readDataFrame(stream: InputStream, header: List<String>): DataFrame<*> =
|
||||
DataFrame.readTsv(inputStream = stream, header = header, delimiter = delimiter)
|
||||
|
||||
override fun readDataFrame(file: File, header: List<String>): DataFrame<*> =
|
||||
DataFrame.readTsv(file = file, header = header, delimiter = delimiter)
|
||||
|
||||
override fun readDataFrame(path: Path, header: List<String>): DataFrame<*> =
|
||||
DataFrame.readTsv(path = path, header = header, delimiter = delimiter)
|
||||
|
||||
override fun acceptsExtension(ext: String): Boolean = ext == "tsv"
|
||||
|
||||
override fun acceptsSample(sample: SupportedFormatSample): Boolean = true // Extension is enough
|
||||
|
||||
override val testOrder: Int = 30_000
|
||||
|
||||
override fun createDefaultReadMethod(pathRepresentation: String?): DefaultReadDfMethod {
|
||||
val arguments = MethodArguments().add("delimiter", typeOf<Char>(), "'%L'", delimiter)
|
||||
return DefaultReadTsvMethod(pathRepresentation, arguments)
|
||||
}
|
||||
}
|
||||
|
||||
private const val READ_TSV = "readTsv"
|
||||
|
||||
internal class DefaultReadTsvMethod(path: String?, arguments: MethodArguments) :
|
||||
AbstractDefaultReadMethod(path, arguments, READ_TSV)
|
||||
dataframe/dataframe-csv/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/util.kt
Vendored
+25
@@ -0,0 +1,25 @@
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import io.deephaven.csv.CsvSpecs
|
||||
import org.apache.commons.csv.CSVFormat
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.ExcludeFromSources
|
||||
|
||||
/**
|
||||
* Default strings that are considered null when reading CSV / TSV / delim files:
|
||||
*
|
||||
* [["", "NA", "N/A", "null", "NULL", "None", "none", "NIL", "nil"]][org.jetbrains.kotlinx.dataframe.io.DEFAULT_DELIM_NULL_STRINGS]
|
||||
*/
|
||||
public val DEFAULT_DELIM_NULL_STRINGS: Set<String> =
|
||||
setOf("", "NA", "N/A", "null", "NULL", "None", "none", "NIL", "nil")
|
||||
|
||||
/**
|
||||
* Typealias for `CsvSpecs.Builder.(CsvSpecs.Builder) -> CsvSpecs.Builder`.
|
||||
* A lambda where you can overwrite or adjust any of the CSV specs.
|
||||
*/
|
||||
public typealias AdjustCsvSpecs = CsvSpecs.Builder.(CsvSpecs.Builder) -> CsvSpecs.Builder
|
||||
|
||||
/**
|
||||
* Typealias for `CSVFormat.Builder.(CSVFormat.Builder) -> CSVFormat.Builder`.
|
||||
* A lambda where you can overwrite or adjust any of the CSV format options.
|
||||
*/
|
||||
public typealias AdjustCSVFormat = CSVFormat.Builder.(CSVFormat.Builder) -> CSVFormat.Builder
|
||||
+305
@@ -0,0 +1,305 @@
|
||||
@file:JvmName("WriteCsvDeephavenKt")
|
||||
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import org.jetbrains.kotlinx.dataframe.AnyFrame
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.CommonWriteDelimDocs
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ADJUST_CSV_FORMAT
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.COMMENT_CHAR
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.CSV_DELIMITER
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ESCAPE_CHAR
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.FILE_WRITE
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.HEADER_COMMENTS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.INCLUDE_HEADER
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.PATH_WRITE
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.QUOTE
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.QUOTE_MODE
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.RECORD_SEPARATOR
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.WRITER_WRITE
|
||||
import org.jetbrains.kotlinx.dataframe.impl.io.writeDelimImpl
|
||||
import java.io.File
|
||||
import java.io.FileWriter
|
||||
import java.nio.file.Path
|
||||
import kotlin.io.path.writer
|
||||
|
||||
/**
|
||||
* ### Write [DataFrame] to CSV File
|
||||
*
|
||||
* Writes [this][this] [DataFrame][DataFrame] to a CSV file.
|
||||
*
|
||||
* Parameters you can use to customize the process include, for instance, [delimiter],
|
||||
* [includeHeader], [quoteMode], and [headerComments].
|
||||
* See the param list below for all settings.
|
||||
*
|
||||
* The integration is built upon [Apache Commons CSV](https://commons.apache.org/proper/commons-csv/).
|
||||
*
|
||||
* ##### Similar Functions
|
||||
* With overloads of [DataFrame.writeCsv][writeCsv]`()`, you can write CSV to [File][File], [Path][java.nio.file.Path],
|
||||
* [Appendable], or [String].
|
||||
*
|
||||
* For example, [DataFrame.writeCsv][writeCsv]`("output.csv")`
|
||||
*
|
||||
* or [DataFrame.writeCsv][writeCsv]`(`[File][File]`("output.csv"), quoteMode = `[QuoteMode.ALL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.ALL]`)`
|
||||
*
|
||||
* Converting to a [String] can be done like this:
|
||||
*
|
||||
* [DataFrame.toCsvStr][toCsvStr]`(delimiter = ",")`
|
||||
*
|
||||
* @param path The path pointing to a file to write to.
|
||||
* @param delimiter The field delimiter character. Default: ','.
|
||||
*
|
||||
* Ignored if [hasFixedWidthColumns] is `true`.
|
||||
* @param includeHeader Whether to include the header in the output. Default: `true`.
|
||||
* @param quote The quote character. Default: `"`.
|
||||
*
|
||||
* Used when field- or line delimiters should be interpreted as literal text.
|
||||
*
|
||||
* For example: `123,"hello, there",456,` would correspond to: `123`; `hello, there`; `456`.
|
||||
* @param quoteMode The [QuoteMode][org.jetbrains.kotlinx.dataframe.io.QuoteMode] to use when writing CSV / TSV files.
|
||||
* Default: [QuoteMode.MINIMAL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.MINIMAL].
|
||||
* @param escapeChar The escape character to use when writing CSV / TSV files with [QuoteMode.NONE][org.jetbrains.kotlinx.dataframe.io.QuoteMode.NONE].
|
||||
* Default: `null`. This will double-quote the value.
|
||||
* @param commentChar The character that indicates a comment line in a CSV / TSV file.
|
||||
* Default: `'#'`.
|
||||
* @param headerComments A list of comments to include at the beginning of the CSV / TSV file.
|
||||
* Default: empty list.
|
||||
* @param recordSeparator The character that separates records in a CSV / TSV file.
|
||||
* Default: `'\n'`, a Unix-newline.
|
||||
*/
|
||||
public fun AnyFrame.writeCsv(
|
||||
path: Path,
|
||||
delimiter: Char = CSV_DELIMITER,
|
||||
includeHeader: Boolean = INCLUDE_HEADER,
|
||||
quote: Char? = QUOTE,
|
||||
quoteMode: QuoteMode = QUOTE_MODE,
|
||||
escapeChar: Char? = ESCAPE_CHAR,
|
||||
commentChar: Char? = COMMENT_CHAR,
|
||||
headerComments: List<String> = HEADER_COMMENTS,
|
||||
recordSeparator: String = RECORD_SEPARATOR,
|
||||
): Unit =
|
||||
writeDelimImpl(
|
||||
df = this,
|
||||
writer = path.writer(),
|
||||
delimiter = delimiter,
|
||||
includeHeader = includeHeader,
|
||||
quote = quote,
|
||||
quoteMode = quoteMode,
|
||||
escapeChar = escapeChar,
|
||||
commentChar = commentChar,
|
||||
headerComments = headerComments,
|
||||
recordSeparator = recordSeparator,
|
||||
adjustCsvFormat = ADJUST_CSV_FORMAT,
|
||||
)
|
||||
|
||||
/**
|
||||
* ### Write [DataFrame] to CSV File
|
||||
*
|
||||
* Writes [this][this] [DataFrame][DataFrame] to a CSV file.
|
||||
*
|
||||
* Parameters you can use to customize the process include, for instance, [delimiter],
|
||||
* [includeHeader], [quoteMode], and [headerComments].
|
||||
* See the param list below for all settings.
|
||||
*
|
||||
* The integration is built upon [Apache Commons CSV](https://commons.apache.org/proper/commons-csv/).
|
||||
*
|
||||
* ##### Similar Functions
|
||||
* With overloads of [DataFrame.writeCsv][writeCsv]`()`, you can write CSV to [File][File], [Path][java.nio.file.Path],
|
||||
* [Appendable], or [String].
|
||||
*
|
||||
* For example, [DataFrame.writeCsv][writeCsv]`("output.csv")`
|
||||
*
|
||||
* or [DataFrame.writeCsv][writeCsv]`(`[File][File]`("output.csv"), quoteMode = `[QuoteMode.ALL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.ALL]`)`
|
||||
*
|
||||
* Converting to a [String] can be done like this:
|
||||
*
|
||||
* [DataFrame.toCsvStr][toCsvStr]`(delimiter = ",")`
|
||||
*
|
||||
* @param file The file to write to.
|
||||
* @param delimiter The field delimiter character. Default: ','.
|
||||
*
|
||||
* Ignored if [hasFixedWidthColumns] is `true`.
|
||||
* @param includeHeader Whether to include the header in the output. Default: `true`.
|
||||
* @param quote The quote character. Default: `"`.
|
||||
*
|
||||
* Used when field- or line delimiters should be interpreted as literal text.
|
||||
*
|
||||
* For example: `123,"hello, there",456,` would correspond to: `123`; `hello, there`; `456`.
|
||||
* @param quoteMode The [QuoteMode][org.jetbrains.kotlinx.dataframe.io.QuoteMode] to use when writing CSV / TSV files.
|
||||
* Default: [QuoteMode.MINIMAL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.MINIMAL].
|
||||
* @param escapeChar The escape character to use when writing CSV / TSV files with [QuoteMode.NONE][org.jetbrains.kotlinx.dataframe.io.QuoteMode.NONE].
|
||||
* Default: `null`. This will double-quote the value.
|
||||
* @param commentChar The character that indicates a comment line in a CSV / TSV file.
|
||||
* Default: `'#'`.
|
||||
* @param headerComments A list of comments to include at the beginning of the CSV / TSV file.
|
||||
* Default: empty list.
|
||||
* @param recordSeparator The character that separates records in a CSV / TSV file.
|
||||
* Default: `'\n'`, a Unix-newline.
|
||||
*/
|
||||
public fun AnyFrame.writeCsv(
|
||||
file: File,
|
||||
delimiter: Char = CSV_DELIMITER,
|
||||
includeHeader: Boolean = INCLUDE_HEADER,
|
||||
quote: Char? = QUOTE,
|
||||
quoteMode: QuoteMode = QUOTE_MODE,
|
||||
escapeChar: Char? = ESCAPE_CHAR,
|
||||
commentChar: Char? = COMMENT_CHAR,
|
||||
headerComments: List<String> = HEADER_COMMENTS,
|
||||
recordSeparator: String = RECORD_SEPARATOR,
|
||||
): Unit =
|
||||
writeDelimImpl(
|
||||
df = this,
|
||||
writer = FileWriter(file),
|
||||
delimiter = delimiter,
|
||||
includeHeader = includeHeader,
|
||||
quote = quote,
|
||||
quoteMode = quoteMode,
|
||||
escapeChar = escapeChar,
|
||||
commentChar = commentChar,
|
||||
headerComments = headerComments,
|
||||
recordSeparator = recordSeparator,
|
||||
adjustCsvFormat = ADJUST_CSV_FORMAT,
|
||||
)
|
||||
|
||||
/**
|
||||
* ### Write [DataFrame] to CSV File
|
||||
*
|
||||
* Writes [this][this] [DataFrame][DataFrame] to a CSV file.
|
||||
*
|
||||
* Parameters you can use to customize the process include, for instance, [delimiter],
|
||||
* [includeHeader], [quoteMode], and [headerComments].
|
||||
* See the param list below for all settings.
|
||||
*
|
||||
* The integration is built upon [Apache Commons CSV](https://commons.apache.org/proper/commons-csv/).
|
||||
*
|
||||
* ##### Similar Functions
|
||||
* With overloads of [DataFrame.writeCsv][writeCsv]`()`, you can write CSV to [File][File], [Path][java.nio.file.Path],
|
||||
* [Appendable], or [String].
|
||||
*
|
||||
* For example, [DataFrame.writeCsv][writeCsv]`("output.csv")`
|
||||
*
|
||||
* or [DataFrame.writeCsv][writeCsv]`(`[File][File]`("output.csv"), quoteMode = `[QuoteMode.ALL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.ALL]`)`
|
||||
*
|
||||
* Converting to a [String] can be done like this:
|
||||
*
|
||||
* [DataFrame.toCsvStr][toCsvStr]`(delimiter = ",")`
|
||||
*
|
||||
* @param path The path pointing to a file to write to.
|
||||
* @param delimiter The field delimiter character. Default: ','.
|
||||
*
|
||||
* Ignored if [hasFixedWidthColumns] is `true`.
|
||||
* @param includeHeader Whether to include the header in the output. Default: `true`.
|
||||
* @param quote The quote character. Default: `"`.
|
||||
*
|
||||
* Used when field- or line delimiters should be interpreted as literal text.
|
||||
*
|
||||
* For example: `123,"hello, there",456,` would correspond to: `123`; `hello, there`; `456`.
|
||||
* @param quoteMode The [QuoteMode][org.jetbrains.kotlinx.dataframe.io.QuoteMode] to use when writing CSV / TSV files.
|
||||
* Default: [QuoteMode.MINIMAL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.MINIMAL].
|
||||
* @param escapeChar The escape character to use when writing CSV / TSV files with [QuoteMode.NONE][org.jetbrains.kotlinx.dataframe.io.QuoteMode.NONE].
|
||||
* Default: `null`. This will double-quote the value.
|
||||
* @param commentChar The character that indicates a comment line in a CSV / TSV file.
|
||||
* Default: `'#'`.
|
||||
* @param headerComments A list of comments to include at the beginning of the CSV / TSV file.
|
||||
* Default: empty list.
|
||||
* @param recordSeparator The character that separates records in a CSV / TSV file.
|
||||
* Default: `'\n'`, a Unix-newline.
|
||||
*/
|
||||
public fun AnyFrame.writeCsv(
|
||||
path: String,
|
||||
delimiter: Char = CSV_DELIMITER,
|
||||
includeHeader: Boolean = INCLUDE_HEADER,
|
||||
quote: Char? = QUOTE,
|
||||
quoteMode: QuoteMode = QUOTE_MODE,
|
||||
escapeChar: Char? = ESCAPE_CHAR,
|
||||
commentChar: Char? = COMMENT_CHAR,
|
||||
headerComments: List<String> = HEADER_COMMENTS,
|
||||
recordSeparator: String = RECORD_SEPARATOR,
|
||||
): Unit =
|
||||
writeDelimImpl(
|
||||
df = this,
|
||||
writer = FileWriter(path),
|
||||
delimiter = delimiter,
|
||||
includeHeader = includeHeader,
|
||||
quote = quote,
|
||||
quoteMode = quoteMode,
|
||||
escapeChar = escapeChar,
|
||||
commentChar = commentChar,
|
||||
headerComments = headerComments,
|
||||
recordSeparator = recordSeparator,
|
||||
adjustCsvFormat = ADJUST_CSV_FORMAT,
|
||||
)
|
||||
|
||||
/**
|
||||
*
|
||||
* ### Write [DataFrame] to CSV Appendable
|
||||
*
|
||||
* Writes [this][this] [DataFrame][DataFrame] to a CSV [Appendable].
|
||||
*
|
||||
* Parameters you can use to customize the process include, for instance, [delimiter],
|
||||
* [includeHeader], [quoteMode], and [headerComments].
|
||||
* See the param list below for all settings.
|
||||
*
|
||||
* The integration is built upon [Apache Commons CSV](https://commons.apache.org/proper/commons-csv/).
|
||||
*
|
||||
* ##### Similar Functions
|
||||
* With overloads of [DataFrame.writeCsv][writeCsv]`()`, you can write CSV to [File][File], [Path][java.nio.file.Path],
|
||||
* [Appendable], or [String].
|
||||
*
|
||||
* For example, [DataFrame.writeCsv][writeCsv]`("output.csv")`
|
||||
*
|
||||
* or [DataFrame.writeCsv][writeCsv]`(`[File][File]`("output.csv"), quoteMode = `[QuoteMode.ALL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.ALL]`)`
|
||||
*
|
||||
* Converting to a [String] can be done like this:
|
||||
*
|
||||
* [DataFrame.toCsvStr][toCsvStr]`(delimiter = ",")`
|
||||
*
|
||||
* @param writer The [Appendable] to write to.
|
||||
* @param delimiter The field delimiter character. Default: ','.
|
||||
*
|
||||
* Ignored if [hasFixedWidthColumns] is `true`.
|
||||
* @param includeHeader Whether to include the header in the output. Default: `true`.
|
||||
* @param quote The quote character. Default: `"`.
|
||||
*
|
||||
* Used when field- or line delimiters should be interpreted as literal text.
|
||||
*
|
||||
* For example: `123,"hello, there",456,` would correspond to: `123`; `hello, there`; `456`.
|
||||
* @param quoteMode The [QuoteMode][org.jetbrains.kotlinx.dataframe.io.QuoteMode] to use when writing CSV / TSV files.
|
||||
* Default: [QuoteMode.MINIMAL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.MINIMAL].
|
||||
* @param escapeChar The escape character to use when writing CSV / TSV files with [QuoteMode.NONE][org.jetbrains.kotlinx.dataframe.io.QuoteMode.NONE].
|
||||
* Default: `null`. This will double-quote the value.
|
||||
* @param commentChar The character that indicates a comment line in a CSV / TSV file.
|
||||
* Default: `'#'`.
|
||||
* @param headerComments A list of comments to include at the beginning of the CSV / TSV file.
|
||||
* Default: empty list.
|
||||
* @param recordSeparator The character that separates records in a CSV / TSV file.
|
||||
* Default: `'\n'`, a Unix-newline.
|
||||
* @param adjustCsvFormat Optional extra [CSVFormat] configuration. Default: `{ it }`.
|
||||
*
|
||||
* Before instantiating the [CSVFormat], the [CSVFormat.Builder] will be passed to this lambda.
|
||||
* This will allow you to configure/overwrite any CSV / TSV writing options.
|
||||
*/
|
||||
public fun AnyFrame.writeCsv(
|
||||
writer: Appendable,
|
||||
delimiter: Char = CSV_DELIMITER,
|
||||
includeHeader: Boolean = INCLUDE_HEADER,
|
||||
quote: Char? = QUOTE,
|
||||
quoteMode: QuoteMode = QUOTE_MODE,
|
||||
escapeChar: Char? = ESCAPE_CHAR,
|
||||
commentChar: Char? = COMMENT_CHAR,
|
||||
headerComments: List<String> = HEADER_COMMENTS,
|
||||
recordSeparator: String = RECORD_SEPARATOR,
|
||||
adjustCsvFormat: AdjustCSVFormat = ADJUST_CSV_FORMAT,
|
||||
): Unit =
|
||||
writeDelimImpl(
|
||||
df = this,
|
||||
writer = writer,
|
||||
delimiter = delimiter,
|
||||
includeHeader = includeHeader,
|
||||
quote = quote,
|
||||
quoteMode = quoteMode,
|
||||
escapeChar = escapeChar,
|
||||
commentChar = commentChar,
|
||||
headerComments = headerComments,
|
||||
recordSeparator = recordSeparator,
|
||||
adjustCsvFormat = adjustCsvFormat,
|
||||
)
|
||||
+305
@@ -0,0 +1,305 @@
|
||||
@file:JvmName("WriteDelimDeephavenKt")
|
||||
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import org.jetbrains.kotlinx.dataframe.AnyFrame
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.CommonWriteDelimDocs
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ADJUST_CSV_FORMAT
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.COMMENT_CHAR
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.DELIM_DELIMITER
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ESCAPE_CHAR
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.FILE_WRITE
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.HEADER_COMMENTS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.INCLUDE_HEADER
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.PATH_WRITE
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.QUOTE
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.QUOTE_MODE
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.RECORD_SEPARATOR
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.WRITER_WRITE
|
||||
import org.jetbrains.kotlinx.dataframe.impl.io.writeDelimImpl
|
||||
import java.io.File
|
||||
import java.io.FileWriter
|
||||
import java.nio.file.Path
|
||||
import kotlin.io.path.writer
|
||||
|
||||
/**
|
||||
* ### Write [DataFrame] to Delimiter-Separated Text File
|
||||
*
|
||||
* Writes [this][this] [DataFrame][DataFrame] to a delimiter-separated text file.
|
||||
*
|
||||
* Parameters you can use to customize the process include, for instance, [delimiter],
|
||||
* [includeHeader], [quoteMode], and [headerComments].
|
||||
* See the param list below for all settings.
|
||||
*
|
||||
* The integration is built upon [Apache Commons CSV](https://commons.apache.org/proper/commons-csv/).
|
||||
*
|
||||
* ##### Similar Functions
|
||||
* With overloads of [DataFrame.writeDelim][writeDelim]`()`, you can write delimiter-separated text to [File][File], [Path][java.nio.file.Path],
|
||||
* [Appendable], or [String].
|
||||
*
|
||||
* For example, [DataFrame.writeDelim][writeDelim]`("output.txt")`
|
||||
*
|
||||
* or [DataFrame.writeDelim][writeDelim]`(`[File][File]`("output.txt"), quoteMode = `[QuoteMode.ALL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.ALL]`)`
|
||||
*
|
||||
* Converting to a [String] can be done like this:
|
||||
*
|
||||
* [DataFrame.toDelimStr][toDelimStr]`(delimiter = ",")`
|
||||
*
|
||||
* @param path The path pointing to a file to write to.
|
||||
* @param delimiter The field delimiter character. Default: ','.
|
||||
*
|
||||
* Ignored if [hasFixedWidthColumns] is `true`.
|
||||
* @param includeHeader Whether to include the header in the output. Default: `true`.
|
||||
* @param quote The quote character. Default: `"`.
|
||||
*
|
||||
* Used when field- or line delimiters should be interpreted as literal text.
|
||||
*
|
||||
* For example: `123,"hello, there",456,` would correspond to: `123`; `hello, there`; `456`.
|
||||
* @param quoteMode The [QuoteMode][org.jetbrains.kotlinx.dataframe.io.QuoteMode] to use when writing CSV / TSV files.
|
||||
* Default: [QuoteMode.MINIMAL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.MINIMAL].
|
||||
* @param escapeChar The escape character to use when writing CSV / TSV files with [QuoteMode.NONE][org.jetbrains.kotlinx.dataframe.io.QuoteMode.NONE].
|
||||
* Default: `null`. This will double-quote the value.
|
||||
* @param commentChar The character that indicates a comment line in a CSV / TSV file.
|
||||
* Default: `'#'`.
|
||||
* @param headerComments A list of comments to include at the beginning of the CSV / TSV file.
|
||||
* Default: empty list.
|
||||
* @param recordSeparator The character that separates records in a CSV / TSV file.
|
||||
* Default: `'\n'`, a Unix-newline.
|
||||
*/
|
||||
public fun AnyFrame.writeDelim(
|
||||
path: Path,
|
||||
delimiter: Char = DELIM_DELIMITER,
|
||||
includeHeader: Boolean = INCLUDE_HEADER,
|
||||
quote: Char? = QUOTE,
|
||||
quoteMode: QuoteMode = QUOTE_MODE,
|
||||
escapeChar: Char? = ESCAPE_CHAR,
|
||||
commentChar: Char? = COMMENT_CHAR,
|
||||
headerComments: List<String> = HEADER_COMMENTS,
|
||||
recordSeparator: String = RECORD_SEPARATOR,
|
||||
): Unit =
|
||||
writeDelimImpl(
|
||||
df = this,
|
||||
writer = path.writer(),
|
||||
delimiter = delimiter,
|
||||
includeHeader = includeHeader,
|
||||
quote = quote,
|
||||
quoteMode = quoteMode,
|
||||
escapeChar = escapeChar,
|
||||
commentChar = commentChar,
|
||||
headerComments = headerComments,
|
||||
recordSeparator = recordSeparator,
|
||||
adjustCsvFormat = ADJUST_CSV_FORMAT,
|
||||
)
|
||||
|
||||
/**
|
||||
* ### Write [DataFrame] to Delimiter-Separated Text File
|
||||
*
|
||||
* Writes [this][this] [DataFrame][DataFrame] to a delimiter-separated text file.
|
||||
*
|
||||
* Parameters you can use to customize the process include, for instance, [delimiter],
|
||||
* [includeHeader], [quoteMode], and [headerComments].
|
||||
* See the param list below for all settings.
|
||||
*
|
||||
* The integration is built upon [Apache Commons CSV](https://commons.apache.org/proper/commons-csv/).
|
||||
*
|
||||
* ##### Similar Functions
|
||||
* With overloads of [DataFrame.writeDelim][writeDelim]`()`, you can write delimiter-separated text to [File][File], [Path][java.nio.file.Path],
|
||||
* [Appendable], or [String].
|
||||
*
|
||||
* For example, [DataFrame.writeDelim][writeDelim]`("output.txt")`
|
||||
*
|
||||
* or [DataFrame.writeDelim][writeDelim]`(`[File][File]`("output.txt"), quoteMode = `[QuoteMode.ALL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.ALL]`)`
|
||||
*
|
||||
* Converting to a [String] can be done like this:
|
||||
*
|
||||
* [DataFrame.toDelimStr][toDelimStr]`(delimiter = ",")`
|
||||
*
|
||||
* @param file The file to write to.
|
||||
* @param delimiter The field delimiter character. Default: ','.
|
||||
*
|
||||
* Ignored if [hasFixedWidthColumns] is `true`.
|
||||
* @param includeHeader Whether to include the header in the output. Default: `true`.
|
||||
* @param quote The quote character. Default: `"`.
|
||||
*
|
||||
* Used when field- or line delimiters should be interpreted as literal text.
|
||||
*
|
||||
* For example: `123,"hello, there",456,` would correspond to: `123`; `hello, there`; `456`.
|
||||
* @param quoteMode The [QuoteMode][org.jetbrains.kotlinx.dataframe.io.QuoteMode] to use when writing CSV / TSV files.
|
||||
* Default: [QuoteMode.MINIMAL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.MINIMAL].
|
||||
* @param escapeChar The escape character to use when writing CSV / TSV files with [QuoteMode.NONE][org.jetbrains.kotlinx.dataframe.io.QuoteMode.NONE].
|
||||
* Default: `null`. This will double-quote the value.
|
||||
* @param commentChar The character that indicates a comment line in a CSV / TSV file.
|
||||
* Default: `'#'`.
|
||||
* @param headerComments A list of comments to include at the beginning of the CSV / TSV file.
|
||||
* Default: empty list.
|
||||
* @param recordSeparator The character that separates records in a CSV / TSV file.
|
||||
* Default: `'\n'`, a Unix-newline.
|
||||
*/
|
||||
public fun AnyFrame.writeDelim(
|
||||
file: File,
|
||||
delimiter: Char = DELIM_DELIMITER,
|
||||
includeHeader: Boolean = INCLUDE_HEADER,
|
||||
quote: Char? = QUOTE,
|
||||
quoteMode: QuoteMode = QUOTE_MODE,
|
||||
escapeChar: Char? = ESCAPE_CHAR,
|
||||
commentChar: Char? = COMMENT_CHAR,
|
||||
headerComments: List<String> = HEADER_COMMENTS,
|
||||
recordSeparator: String = RECORD_SEPARATOR,
|
||||
): Unit =
|
||||
writeDelimImpl(
|
||||
df = this,
|
||||
writer = FileWriter(file),
|
||||
delimiter = delimiter,
|
||||
includeHeader = includeHeader,
|
||||
quote = quote,
|
||||
quoteMode = quoteMode,
|
||||
escapeChar = escapeChar,
|
||||
commentChar = commentChar,
|
||||
headerComments = headerComments,
|
||||
recordSeparator = recordSeparator,
|
||||
adjustCsvFormat = ADJUST_CSV_FORMAT,
|
||||
)
|
||||
|
||||
/**
|
||||
* ### Write [DataFrame] to Delimiter-Separated Text File
|
||||
*
|
||||
* Writes [this][this] [DataFrame][DataFrame] to a delimiter-separated text file.
|
||||
*
|
||||
* Parameters you can use to customize the process include, for instance, [delimiter],
|
||||
* [includeHeader], [quoteMode], and [headerComments].
|
||||
* See the param list below for all settings.
|
||||
*
|
||||
* The integration is built upon [Apache Commons CSV](https://commons.apache.org/proper/commons-csv/).
|
||||
*
|
||||
* ##### Similar Functions
|
||||
* With overloads of [DataFrame.writeDelim][writeDelim]`()`, you can write delimiter-separated text to [File][File], [Path][java.nio.file.Path],
|
||||
* [Appendable], or [String].
|
||||
*
|
||||
* For example, [DataFrame.writeDelim][writeDelim]`("output.txt")`
|
||||
*
|
||||
* or [DataFrame.writeDelim][writeDelim]`(`[File][File]`("output.txt"), quoteMode = `[QuoteMode.ALL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.ALL]`)`
|
||||
*
|
||||
* Converting to a [String] can be done like this:
|
||||
*
|
||||
* [DataFrame.toDelimStr][toDelimStr]`(delimiter = ",")`
|
||||
*
|
||||
* @param path The path pointing to a file to write to.
|
||||
* @param delimiter The field delimiter character. Default: ','.
|
||||
*
|
||||
* Ignored if [hasFixedWidthColumns] is `true`.
|
||||
* @param includeHeader Whether to include the header in the output. Default: `true`.
|
||||
* @param quote The quote character. Default: `"`.
|
||||
*
|
||||
* Used when field- or line delimiters should be interpreted as literal text.
|
||||
*
|
||||
* For example: `123,"hello, there",456,` would correspond to: `123`; `hello, there`; `456`.
|
||||
* @param quoteMode The [QuoteMode][org.jetbrains.kotlinx.dataframe.io.QuoteMode] to use when writing CSV / TSV files.
|
||||
* Default: [QuoteMode.MINIMAL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.MINIMAL].
|
||||
* @param escapeChar The escape character to use when writing CSV / TSV files with [QuoteMode.NONE][org.jetbrains.kotlinx.dataframe.io.QuoteMode.NONE].
|
||||
* Default: `null`. This will double-quote the value.
|
||||
* @param commentChar The character that indicates a comment line in a CSV / TSV file.
|
||||
* Default: `'#'`.
|
||||
* @param headerComments A list of comments to include at the beginning of the CSV / TSV file.
|
||||
* Default: empty list.
|
||||
* @param recordSeparator The character that separates records in a CSV / TSV file.
|
||||
* Default: `'\n'`, a Unix-newline.
|
||||
*/
|
||||
public fun AnyFrame.writeDelim(
|
||||
path: String,
|
||||
delimiter: Char = DELIM_DELIMITER,
|
||||
includeHeader: Boolean = INCLUDE_HEADER,
|
||||
quote: Char? = QUOTE,
|
||||
quoteMode: QuoteMode = QUOTE_MODE,
|
||||
escapeChar: Char? = ESCAPE_CHAR,
|
||||
commentChar: Char? = COMMENT_CHAR,
|
||||
headerComments: List<String> = HEADER_COMMENTS,
|
||||
recordSeparator: String = RECORD_SEPARATOR,
|
||||
): Unit =
|
||||
writeDelimImpl(
|
||||
df = this,
|
||||
writer = FileWriter(path),
|
||||
delimiter = delimiter,
|
||||
includeHeader = includeHeader,
|
||||
quote = quote,
|
||||
quoteMode = quoteMode,
|
||||
escapeChar = escapeChar,
|
||||
commentChar = commentChar,
|
||||
headerComments = headerComments,
|
||||
recordSeparator = recordSeparator,
|
||||
adjustCsvFormat = ADJUST_CSV_FORMAT,
|
||||
)
|
||||
|
||||
/**
|
||||
*
|
||||
* ### Write [DataFrame] to Delimiter-Separated Text Appendable
|
||||
*
|
||||
* Writes [this][this] [DataFrame][DataFrame] to a delimiter-separated text [Appendable].
|
||||
*
|
||||
* Parameters you can use to customize the process include, for instance, [delimiter],
|
||||
* [includeHeader], [quoteMode], and [headerComments].
|
||||
* See the param list below for all settings.
|
||||
*
|
||||
* The integration is built upon [Apache Commons CSV](https://commons.apache.org/proper/commons-csv/).
|
||||
*
|
||||
* ##### Similar Functions
|
||||
* With overloads of [DataFrame.writeDelim][writeDelim]`()`, you can write delimiter-separated text to [File][File], [Path][java.nio.file.Path],
|
||||
* [Appendable], or [String].
|
||||
*
|
||||
* For example, [DataFrame.writeDelim][writeDelim]`("output.txt")`
|
||||
*
|
||||
* or [DataFrame.writeDelim][writeDelim]`(`[File][File]`("output.txt"), quoteMode = `[QuoteMode.ALL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.ALL]`)`
|
||||
*
|
||||
* Converting to a [String] can be done like this:
|
||||
*
|
||||
* [DataFrame.toDelimStr][toDelimStr]`(delimiter = ",")`
|
||||
*
|
||||
* @param writer The [Appendable] to write to.
|
||||
* @param delimiter The field delimiter character. Default: ','.
|
||||
*
|
||||
* Ignored if [hasFixedWidthColumns] is `true`.
|
||||
* @param includeHeader Whether to include the header in the output. Default: `true`.
|
||||
* @param quote The quote character. Default: `"`.
|
||||
*
|
||||
* Used when field- or line delimiters should be interpreted as literal text.
|
||||
*
|
||||
* For example: `123,"hello, there",456,` would correspond to: `123`; `hello, there`; `456`.
|
||||
* @param quoteMode The [QuoteMode][org.jetbrains.kotlinx.dataframe.io.QuoteMode] to use when writing CSV / TSV files.
|
||||
* Default: [QuoteMode.MINIMAL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.MINIMAL].
|
||||
* @param escapeChar The escape character to use when writing CSV / TSV files with [QuoteMode.NONE][org.jetbrains.kotlinx.dataframe.io.QuoteMode.NONE].
|
||||
* Default: `null`. This will double-quote the value.
|
||||
* @param commentChar The character that indicates a comment line in a CSV / TSV file.
|
||||
* Default: `'#'`.
|
||||
* @param headerComments A list of comments to include at the beginning of the CSV / TSV file.
|
||||
* Default: empty list.
|
||||
* @param recordSeparator The character that separates records in a CSV / TSV file.
|
||||
* Default: `'\n'`, a Unix-newline.
|
||||
* @param adjustCsvFormat Optional extra [CSVFormat] configuration. Default: `{ it }`.
|
||||
*
|
||||
* Before instantiating the [CSVFormat], the [CSVFormat.Builder] will be passed to this lambda.
|
||||
* This will allow you to configure/overwrite any CSV / TSV writing options.
|
||||
*/
|
||||
public fun AnyFrame.writeDelim(
|
||||
writer: Appendable,
|
||||
delimiter: Char = DELIM_DELIMITER,
|
||||
includeHeader: Boolean = INCLUDE_HEADER,
|
||||
quote: Char? = QUOTE,
|
||||
quoteMode: QuoteMode = QUOTE_MODE,
|
||||
escapeChar: Char? = ESCAPE_CHAR,
|
||||
commentChar: Char? = COMMENT_CHAR,
|
||||
headerComments: List<String> = HEADER_COMMENTS,
|
||||
recordSeparator: String = RECORD_SEPARATOR,
|
||||
adjustCsvFormat: AdjustCSVFormat = ADJUST_CSV_FORMAT,
|
||||
): Unit =
|
||||
writeDelimImpl(
|
||||
df = this,
|
||||
writer = writer,
|
||||
delimiter = delimiter,
|
||||
includeHeader = includeHeader,
|
||||
quote = quote,
|
||||
quoteMode = quoteMode,
|
||||
escapeChar = escapeChar,
|
||||
commentChar = commentChar,
|
||||
headerComments = headerComments,
|
||||
recordSeparator = recordSeparator,
|
||||
adjustCsvFormat = adjustCsvFormat,
|
||||
)
|
||||
+305
@@ -0,0 +1,305 @@
|
||||
@file:JvmName("WriteTsvDeephavenKt")
|
||||
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import org.jetbrains.kotlinx.dataframe.AnyFrame
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.CommonWriteDelimDocs
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ADJUST_CSV_FORMAT
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.COMMENT_CHAR
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.ESCAPE_CHAR
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.FILE_WRITE
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.HEADER_COMMENTS
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.INCLUDE_HEADER
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.PATH_WRITE
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.QUOTE
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.QUOTE_MODE
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.RECORD_SEPARATOR
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.TSV_DELIMITER
|
||||
import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams.WRITER_WRITE
|
||||
import org.jetbrains.kotlinx.dataframe.impl.io.writeDelimImpl
|
||||
import java.io.File
|
||||
import java.io.FileWriter
|
||||
import java.nio.file.Path
|
||||
import kotlin.io.path.writer
|
||||
|
||||
/**
|
||||
* ### Write [DataFrame] to TSV File
|
||||
*
|
||||
* Writes [this][this] [DataFrame][DataFrame] to a TSV file.
|
||||
*
|
||||
* Parameters you can use to customize the process include, for instance, [delimiter],
|
||||
* [includeHeader], [quoteMode], and [headerComments].
|
||||
* See the param list below for all settings.
|
||||
*
|
||||
* The integration is built upon [Apache Commons CSV](https://commons.apache.org/proper/commons-csv/).
|
||||
*
|
||||
* ##### Similar Functions
|
||||
* With overloads of [DataFrame.writeTsv][writeTsv]`()`, you can write TSV to [File][File], [Path][java.nio.file.Path],
|
||||
* [Appendable], or [String].
|
||||
*
|
||||
* For example, [DataFrame.writeTsv][writeTsv]`("output.tsv")`
|
||||
*
|
||||
* or [DataFrame.writeTsv][writeTsv]`(`[File][File]`("output.tsv"), quoteMode = `[QuoteMode.ALL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.ALL]`)`
|
||||
*
|
||||
* Converting to a [String] can be done like this:
|
||||
*
|
||||
* [DataFrame.toTsvStr][toTsvStr]`(delimiter = ",")`
|
||||
*
|
||||
* @param path The path pointing to a file to write to.
|
||||
* @param delimiter The field delimiter character. Default: '\t'.
|
||||
*
|
||||
* Ignored if [hasFixedWidthColumns] is `true`.
|
||||
* @param includeHeader Whether to include the header in the output. Default: `true`.
|
||||
* @param quote The quote character. Default: `"`.
|
||||
*
|
||||
* Used when field- or line delimiters should be interpreted as literal text.
|
||||
*
|
||||
* For example: `123,"hello, there",456,` would correspond to: `123`; `hello, there`; `456`.
|
||||
* @param quoteMode The [QuoteMode][org.jetbrains.kotlinx.dataframe.io.QuoteMode] to use when writing CSV / TSV files.
|
||||
* Default: [QuoteMode.MINIMAL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.MINIMAL].
|
||||
* @param escapeChar The escape character to use when writing CSV / TSV files with [QuoteMode.NONE][org.jetbrains.kotlinx.dataframe.io.QuoteMode.NONE].
|
||||
* Default: `null`. This will double-quote the value.
|
||||
* @param commentChar The character that indicates a comment line in a CSV / TSV file.
|
||||
* Default: `'#'`.
|
||||
* @param headerComments A list of comments to include at the beginning of the CSV / TSV file.
|
||||
* Default: empty list.
|
||||
* @param recordSeparator The character that separates records in a CSV / TSV file.
|
||||
* Default: `'\n'`, a Unix-newline.
|
||||
*/
|
||||
public fun AnyFrame.writeTsv(
|
||||
path: Path,
|
||||
delimiter: Char = TSV_DELIMITER,
|
||||
includeHeader: Boolean = INCLUDE_HEADER,
|
||||
quote: Char? = QUOTE,
|
||||
quoteMode: QuoteMode = QUOTE_MODE,
|
||||
escapeChar: Char? = ESCAPE_CHAR,
|
||||
commentChar: Char? = COMMENT_CHAR,
|
||||
headerComments: List<String> = HEADER_COMMENTS,
|
||||
recordSeparator: String = RECORD_SEPARATOR,
|
||||
): Unit =
|
||||
writeDelimImpl(
|
||||
df = this,
|
||||
writer = path.writer(),
|
||||
delimiter = delimiter,
|
||||
includeHeader = includeHeader,
|
||||
quote = quote,
|
||||
quoteMode = quoteMode,
|
||||
escapeChar = escapeChar,
|
||||
commentChar = commentChar,
|
||||
headerComments = headerComments,
|
||||
recordSeparator = recordSeparator,
|
||||
adjustCsvFormat = ADJUST_CSV_FORMAT,
|
||||
)
|
||||
|
||||
/**
|
||||
* ### Write [DataFrame] to TSV File
|
||||
*
|
||||
* Writes [this][this] [DataFrame][DataFrame] to a TSV file.
|
||||
*
|
||||
* Parameters you can use to customize the process include, for instance, [delimiter],
|
||||
* [includeHeader], [quoteMode], and [headerComments].
|
||||
* See the param list below for all settings.
|
||||
*
|
||||
* The integration is built upon [Apache Commons CSV](https://commons.apache.org/proper/commons-csv/).
|
||||
*
|
||||
* ##### Similar Functions
|
||||
* With overloads of [DataFrame.writeTsv][writeTsv]`()`, you can write TSV to [File][File], [Path][java.nio.file.Path],
|
||||
* [Appendable], or [String].
|
||||
*
|
||||
* For example, [DataFrame.writeTsv][writeTsv]`("output.tsv")`
|
||||
*
|
||||
* or [DataFrame.writeTsv][writeTsv]`(`[File][File]`("output.tsv"), quoteMode = `[QuoteMode.ALL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.ALL]`)`
|
||||
*
|
||||
* Converting to a [String] can be done like this:
|
||||
*
|
||||
* [DataFrame.toTsvStr][toTsvStr]`(delimiter = ",")`
|
||||
*
|
||||
* @param file The file to write to.
|
||||
* @param delimiter The field delimiter character. Default: '\t'.
|
||||
*
|
||||
* Ignored if [hasFixedWidthColumns] is `true`.
|
||||
* @param includeHeader Whether to include the header in the output. Default: `true`.
|
||||
* @param quote The quote character. Default: `"`.
|
||||
*
|
||||
* Used when field- or line delimiters should be interpreted as literal text.
|
||||
*
|
||||
* For example: `123,"hello, there",456,` would correspond to: `123`; `hello, there`; `456`.
|
||||
* @param quoteMode The [QuoteMode][org.jetbrains.kotlinx.dataframe.io.QuoteMode] to use when writing CSV / TSV files.
|
||||
* Default: [QuoteMode.MINIMAL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.MINIMAL].
|
||||
* @param escapeChar The escape character to use when writing CSV / TSV files with [QuoteMode.NONE][org.jetbrains.kotlinx.dataframe.io.QuoteMode.NONE].
|
||||
* Default: `null`. This will double-quote the value.
|
||||
* @param commentChar The character that indicates a comment line in a CSV / TSV file.
|
||||
* Default: `'#'`.
|
||||
* @param headerComments A list of comments to include at the beginning of the CSV / TSV file.
|
||||
* Default: empty list.
|
||||
* @param recordSeparator The character that separates records in a CSV / TSV file.
|
||||
* Default: `'\n'`, a Unix-newline.
|
||||
*/
|
||||
public fun AnyFrame.writeTsv(
|
||||
file: File,
|
||||
delimiter: Char = TSV_DELIMITER,
|
||||
includeHeader: Boolean = INCLUDE_HEADER,
|
||||
quote: Char? = QUOTE,
|
||||
quoteMode: QuoteMode = QUOTE_MODE,
|
||||
escapeChar: Char? = ESCAPE_CHAR,
|
||||
commentChar: Char? = COMMENT_CHAR,
|
||||
headerComments: List<String> = HEADER_COMMENTS,
|
||||
recordSeparator: String = RECORD_SEPARATOR,
|
||||
): Unit =
|
||||
writeDelimImpl(
|
||||
df = this,
|
||||
writer = FileWriter(file),
|
||||
delimiter = delimiter,
|
||||
includeHeader = includeHeader,
|
||||
quote = quote,
|
||||
quoteMode = quoteMode,
|
||||
escapeChar = escapeChar,
|
||||
commentChar = commentChar,
|
||||
headerComments = headerComments,
|
||||
recordSeparator = recordSeparator,
|
||||
adjustCsvFormat = ADJUST_CSV_FORMAT,
|
||||
)
|
||||
|
||||
/**
|
||||
* ### Write [DataFrame] to TSV File
|
||||
*
|
||||
* Writes [this][this] [DataFrame][DataFrame] to a TSV file.
|
||||
*
|
||||
* Parameters you can use to customize the process include, for instance, [delimiter],
|
||||
* [includeHeader], [quoteMode], and [headerComments].
|
||||
* See the param list below for all settings.
|
||||
*
|
||||
* The integration is built upon [Apache Commons CSV](https://commons.apache.org/proper/commons-csv/).
|
||||
*
|
||||
* ##### Similar Functions
|
||||
* With overloads of [DataFrame.writeTsv][writeTsv]`()`, you can write TSV to [File][File], [Path][java.nio.file.Path],
|
||||
* [Appendable], or [String].
|
||||
*
|
||||
* For example, [DataFrame.writeTsv][writeTsv]`("output.tsv")`
|
||||
*
|
||||
* or [DataFrame.writeTsv][writeTsv]`(`[File][File]`("output.tsv"), quoteMode = `[QuoteMode.ALL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.ALL]`)`
|
||||
*
|
||||
* Converting to a [String] can be done like this:
|
||||
*
|
||||
* [DataFrame.toTsvStr][toTsvStr]`(delimiter = ",")`
|
||||
*
|
||||
* @param path The path pointing to a file to write to.
|
||||
* @param delimiter The field delimiter character. Default: '\t'.
|
||||
*
|
||||
* Ignored if [hasFixedWidthColumns] is `true`.
|
||||
* @param includeHeader Whether to include the header in the output. Default: `true`.
|
||||
* @param quote The quote character. Default: `"`.
|
||||
*
|
||||
* Used when field- or line delimiters should be interpreted as literal text.
|
||||
*
|
||||
* For example: `123,"hello, there",456,` would correspond to: `123`; `hello, there`; `456`.
|
||||
* @param quoteMode The [QuoteMode][org.jetbrains.kotlinx.dataframe.io.QuoteMode] to use when writing CSV / TSV files.
|
||||
* Default: [QuoteMode.MINIMAL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.MINIMAL].
|
||||
* @param escapeChar The escape character to use when writing CSV / TSV files with [QuoteMode.NONE][org.jetbrains.kotlinx.dataframe.io.QuoteMode.NONE].
|
||||
* Default: `null`. This will double-quote the value.
|
||||
* @param commentChar The character that indicates a comment line in a CSV / TSV file.
|
||||
* Default: `'#'`.
|
||||
* @param headerComments A list of comments to include at the beginning of the CSV / TSV file.
|
||||
* Default: empty list.
|
||||
* @param recordSeparator The character that separates records in a CSV / TSV file.
|
||||
* Default: `'\n'`, a Unix-newline.
|
||||
*/
|
||||
public fun AnyFrame.writeTsv(
|
||||
path: String,
|
||||
delimiter: Char = TSV_DELIMITER,
|
||||
includeHeader: Boolean = INCLUDE_HEADER,
|
||||
quote: Char? = QUOTE,
|
||||
quoteMode: QuoteMode = QUOTE_MODE,
|
||||
escapeChar: Char? = ESCAPE_CHAR,
|
||||
commentChar: Char? = COMMENT_CHAR,
|
||||
headerComments: List<String> = HEADER_COMMENTS,
|
||||
recordSeparator: String = RECORD_SEPARATOR,
|
||||
): Unit =
|
||||
writeDelimImpl(
|
||||
df = this,
|
||||
writer = FileWriter(path),
|
||||
delimiter = delimiter,
|
||||
includeHeader = includeHeader,
|
||||
quote = quote,
|
||||
quoteMode = quoteMode,
|
||||
escapeChar = escapeChar,
|
||||
commentChar = commentChar,
|
||||
headerComments = headerComments,
|
||||
recordSeparator = recordSeparator,
|
||||
adjustCsvFormat = ADJUST_CSV_FORMAT,
|
||||
)
|
||||
|
||||
/**
|
||||
*
|
||||
* ### Write [DataFrame] to TSV Appendable
|
||||
*
|
||||
* Writes [this][this] [DataFrame][DataFrame] to a TSV [Appendable].
|
||||
*
|
||||
* Parameters you can use to customize the process include, for instance, [delimiter],
|
||||
* [includeHeader], [quoteMode], and [headerComments].
|
||||
* See the param list below for all settings.
|
||||
*
|
||||
* The integration is built upon [Apache Commons CSV](https://commons.apache.org/proper/commons-csv/).
|
||||
*
|
||||
* ##### Similar Functions
|
||||
* With overloads of [DataFrame.writeTsv][writeTsv]`()`, you can write TSV to [File][File], [Path][java.nio.file.Path],
|
||||
* [Appendable], or [String].
|
||||
*
|
||||
* For example, [DataFrame.writeTsv][writeTsv]`("output.tsv")`
|
||||
*
|
||||
* or [DataFrame.writeTsv][writeTsv]`(`[File][File]`("output.tsv"), quoteMode = `[QuoteMode.ALL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.ALL]`)`
|
||||
*
|
||||
* Converting to a [String] can be done like this:
|
||||
*
|
||||
* [DataFrame.toTsvStr][toTsvStr]`(delimiter = ",")`
|
||||
*
|
||||
* @param writer The [Appendable] to write to.
|
||||
* @param delimiter The field delimiter character. Default: '\t'.
|
||||
*
|
||||
* Ignored if [hasFixedWidthColumns] is `true`.
|
||||
* @param includeHeader Whether to include the header in the output. Default: `true`.
|
||||
* @param quote The quote character. Default: `"`.
|
||||
*
|
||||
* Used when field- or line delimiters should be interpreted as literal text.
|
||||
*
|
||||
* For example: `123,"hello, there",456,` would correspond to: `123`; `hello, there`; `456`.
|
||||
* @param quoteMode The [QuoteMode][org.jetbrains.kotlinx.dataframe.io.QuoteMode] to use when writing CSV / TSV files.
|
||||
* Default: [QuoteMode.MINIMAL][org.jetbrains.kotlinx.dataframe.io.QuoteMode.MINIMAL].
|
||||
* @param escapeChar The escape character to use when writing CSV / TSV files with [QuoteMode.NONE][org.jetbrains.kotlinx.dataframe.io.QuoteMode.NONE].
|
||||
* Default: `null`. This will double-quote the value.
|
||||
* @param commentChar The character that indicates a comment line in a CSV / TSV file.
|
||||
* Default: `'#'`.
|
||||
* @param headerComments A list of comments to include at the beginning of the CSV / TSV file.
|
||||
* Default: empty list.
|
||||
* @param recordSeparator The character that separates records in a CSV / TSV file.
|
||||
* Default: `'\n'`, a Unix-newline.
|
||||
* @param adjustCsvFormat Optional extra [CSVFormat] configuration. Default: `{ it }`.
|
||||
*
|
||||
* Before instantiating the [CSVFormat], the [CSVFormat.Builder] will be passed to this lambda.
|
||||
* This will allow you to configure/overwrite any CSV / TSV writing options.
|
||||
*/
|
||||
public fun AnyFrame.writeTsv(
|
||||
writer: Appendable,
|
||||
delimiter: Char = TSV_DELIMITER,
|
||||
includeHeader: Boolean = INCLUDE_HEADER,
|
||||
quote: Char? = QUOTE,
|
||||
quoteMode: QuoteMode = QUOTE_MODE,
|
||||
escapeChar: Char? = ESCAPE_CHAR,
|
||||
commentChar: Char? = COMMENT_CHAR,
|
||||
headerComments: List<String> = HEADER_COMMENTS,
|
||||
recordSeparator: String = RECORD_SEPARATOR,
|
||||
adjustCsvFormat: AdjustCSVFormat = ADJUST_CSV_FORMAT,
|
||||
): Unit =
|
||||
writeDelimImpl(
|
||||
df = this,
|
||||
writer = writer,
|
||||
delimiter = delimiter,
|
||||
includeHeader = includeHeader,
|
||||
quote = quote,
|
||||
quoteMode = quoteMode,
|
||||
escapeChar = escapeChar,
|
||||
commentChar = commentChar,
|
||||
headerComments = headerComments,
|
||||
recordSeparator = recordSeparator,
|
||||
adjustCsvFormat = adjustCsvFormat,
|
||||
)
|
||||
+30
@@ -0,0 +1,30 @@
|
||||
@file:JvmName("CsvDeprecationMessagesKt")
|
||||
|
||||
package org.jetbrains.kotlinx.dataframe.util
|
||||
|
||||
/*
|
||||
* This file contains deprecation messages for the whole core module.
|
||||
* After each release, all messages should be reviewed and updated.
|
||||
* Level.WARNING -> Level.ERROR
|
||||
* Level.ERROR -> Remove
|
||||
*/
|
||||
|
||||
// region WARNING in 0.15, ERROR in 1.0
|
||||
|
||||
private const val MESSAGE_1_0 = "Will be ERROR in 1.0."
|
||||
|
||||
internal const val READ_CSV_BINARY_COMPATIBILITY = "This overload is here to maintain binary compatibility."
|
||||
internal const val READ_TSV_BINARY_COMPATIBILITY = "This overload is here to maintain binary compatibility."
|
||||
internal const val READ_DELIM_BINARY_COMPATIBILITY = "This overload is here to maintain binary compatibility."
|
||||
|
||||
// endregion
|
||||
|
||||
// region WARNING in 1.0, ERROR in 1.1
|
||||
|
||||
private const val MESSAGE_1_1 = "Will be ERROR in 1.1."
|
||||
|
||||
// endregion
|
||||
|
||||
// region keep across releases
|
||||
|
||||
// endregion
|
||||
+54
@@ -0,0 +1,54 @@
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import org.jetbrains.kotlinx.dataframe.DataFrame
|
||||
import org.openjdk.jmh.annotations.Benchmark
|
||||
import org.openjdk.jmh.annotations.BenchmarkMode
|
||||
import org.openjdk.jmh.annotations.Measurement
|
||||
import org.openjdk.jmh.annotations.Mode
|
||||
import org.openjdk.jmh.annotations.Param
|
||||
import org.openjdk.jmh.annotations.Scope
|
||||
import org.openjdk.jmh.annotations.Setup
|
||||
import org.openjdk.jmh.annotations.State
|
||||
import org.openjdk.jmh.annotations.TearDown
|
||||
import org.openjdk.jmh.annotations.Warmup
|
||||
import java.io.File
|
||||
import java.util.concurrent.TimeUnit
|
||||
|
||||
@BenchmarkMode(Mode.SingleShotTime)
|
||||
@Warmup(iterations = 10, time = 5, timeUnit = TimeUnit.SECONDS)
|
||||
@Measurement(iterations = 10, timeUnit = TimeUnit.SECONDS)
|
||||
@State(Scope.Benchmark)
|
||||
open class BenchmarkTest {
|
||||
|
||||
@Param("small", "medium", "large")
|
||||
var type = ""
|
||||
var file: File? = null
|
||||
|
||||
@Setup
|
||||
fun setup() {
|
||||
System.setProperty("org.slf4j.simpleLogger.defaultLogLevel", "info")
|
||||
file = File(
|
||||
"src/test/resources/" + when (type) {
|
||||
"small" -> "testCSV.csv"
|
||||
"medium" -> "gross-domestic-product-june-2024-quarter.csv"
|
||||
"large" -> "largeCsv.csv.gz"
|
||||
else -> throw IllegalArgumentException("Invalid type")
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
@TearDown
|
||||
fun tearDown() {
|
||||
file = null
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
fun apache() {
|
||||
DataFrame.readCSV(file!!)
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
fun deephaven() {
|
||||
DataFrame.readCsv(file!!)
|
||||
}
|
||||
}
|
||||
+887
@@ -0,0 +1,887 @@
|
||||
package org.jetbrains.kotlinx.dataframe.io
|
||||
|
||||
import io.deephaven.csv.parsers.Parsers
|
||||
import io.kotest.assertions.throwables.shouldNotThrowAny
|
||||
import io.kotest.assertions.throwables.shouldThrow
|
||||
import io.kotest.matchers.collections.shouldContainInOrder
|
||||
import io.kotest.matchers.nulls.shouldNotBeNull
|
||||
import io.kotest.matchers.shouldBe
|
||||
import io.kotest.matchers.shouldNotBe
|
||||
import kotlinx.datetime.LocalDate
|
||||
import kotlinx.datetime.LocalDateTime
|
||||
import org.intellij.lang.annotations.Language
|
||||
import org.jetbrains.kotlinx.dataframe.DataFrame
|
||||
import org.jetbrains.kotlinx.dataframe.api.ParserOptions
|
||||
import org.jetbrains.kotlinx.dataframe.api.allNulls
|
||||
import org.jetbrains.kotlinx.dataframe.api.convert
|
||||
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
|
||||
import org.jetbrains.kotlinx.dataframe.api.group
|
||||
import org.jetbrains.kotlinx.dataframe.api.groupBy
|
||||
import org.jetbrains.kotlinx.dataframe.api.into
|
||||
import org.jetbrains.kotlinx.dataframe.api.isEmpty
|
||||
import org.jetbrains.kotlinx.dataframe.api.parser
|
||||
import org.jetbrains.kotlinx.dataframe.api.print
|
||||
import org.jetbrains.kotlinx.dataframe.api.schema
|
||||
import org.jetbrains.kotlinx.dataframe.api.toStr
|
||||
import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser
|
||||
import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema
|
||||
import org.junit.After
|
||||
import org.junit.Before
|
||||
import org.junit.Test
|
||||
import java.io.File
|
||||
import java.io.StringWriter
|
||||
import java.math.BigDecimal
|
||||
import java.net.URL
|
||||
import java.util.Locale
|
||||
import java.util.zip.GZIPInputStream
|
||||
import kotlin.reflect.KClass
|
||||
import kotlin.reflect.typeOf
|
||||
import kotlin.time.Instant as StdlibInstant
|
||||
import kotlinx.datetime.Instant as DeprecatedInstant
|
||||
|
||||
// can be enabled for showing logs for these tests
|
||||
private const val SHOW_LOGS = false
|
||||
|
||||
@Suppress("ktlint:standard:argument-list-wrapping")
|
||||
class DelimCsvTsvTests {
|
||||
|
||||
private val logLevel = "org.slf4j.simpleLogger.log.${FastDoubleParser::class.qualifiedName}"
|
||||
private var loggerBefore: String? = null
|
||||
|
||||
@Before
|
||||
fun setLogger() {
|
||||
if (!SHOW_LOGS) return
|
||||
loggerBefore = System.getProperty(logLevel)
|
||||
System.setProperty(logLevel, "trace")
|
||||
}
|
||||
|
||||
@After
|
||||
fun restoreLogger() {
|
||||
if (!SHOW_LOGS) return
|
||||
if (loggerBefore != null) {
|
||||
System.setProperty(logLevel, loggerBefore)
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
fun readNulls() {
|
||||
@Language("CSV")
|
||||
val src =
|
||||
"""
|
||||
first,second
|
||||
2,,
|
||||
3,,
|
||||
""".trimIndent()
|
||||
val df = DataFrame.readCsvStr(src)
|
||||
df.rowsCount() shouldBe 2
|
||||
df.columnsCount() shouldBe 2
|
||||
df["first"].type() shouldBe typeOf<Int>()
|
||||
df["second"].allNulls() shouldBe true
|
||||
df["second"].type() shouldBe typeOf<String?>()
|
||||
}
|
||||
|
||||
@Test
|
||||
fun write() {
|
||||
val df = dataFrameOf("col1", "col2")(
|
||||
1, null,
|
||||
2, null,
|
||||
).convert("col2").toStr()
|
||||
|
||||
val str = StringWriter()
|
||||
df.writeCsv(str)
|
||||
|
||||
val res = DataFrame.readCsvStr(str.buffer.toString())
|
||||
|
||||
res shouldBe df
|
||||
}
|
||||
|
||||
@Test
|
||||
fun readCsv() {
|
||||
val df = DataFrame.read(simpleCsv)
|
||||
|
||||
df.columnsCount() shouldBe 11
|
||||
df.rowsCount() shouldBe 5
|
||||
df.columnNames()[5] shouldBe "duplicate1"
|
||||
df.columnNames()[6] shouldBe "duplicate11"
|
||||
df["duplicate1"].type() shouldBe typeOf<Char?>()
|
||||
df["double"].type() shouldBe typeOf<Double?>()
|
||||
df["number"].type() shouldBe typeOf<Double>()
|
||||
df["time"].type() shouldBe typeOf<LocalDateTime>()
|
||||
|
||||
df.print(columnTypes = true, borders = true, title = true)
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `readCsv different charset`() {
|
||||
val df = DataFrame.readCsv(simpleCsv)
|
||||
|
||||
DataFrame.readCsv(simpleCsvUtf16le) shouldBe df
|
||||
DataFrame.readCsv(simpleCsvUtf16le, charset = Charsets.UTF_16LE) shouldBe df
|
||||
DataFrame.readCsv(simpleCsvUtf16le, charset = Charsets.UTF_16BE) shouldNotBe df
|
||||
DataFrame.readCsv(simpleCsvUtf16le, charset = Charsets.UTF_8) shouldNotBe df
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `readCsv gz compressed different charset`() {
|
||||
val df = DataFrame.readCsv(simpleCsv)
|
||||
|
||||
DataFrame.readCsv(simpleCsvUtf16leGz) shouldBe df
|
||||
DataFrame.readCsv(simpleCsvUtf16leGz, charset = Charsets.UTF_16LE) shouldBe df
|
||||
DataFrame.readCsv(simpleCsvUtf16leGz, charset = Charsets.UTF_16BE) shouldNotBe df
|
||||
DataFrame.readCsv(simpleCsvUtf16leGz, charset = Charsets.UTF_8) shouldNotBe df
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `readCsv zip compressed different charset`() {
|
||||
val df = DataFrame.readCsv(simpleCsv)
|
||||
|
||||
DataFrame.readCsv(simpleCsvUtf16leZip) shouldBe df
|
||||
DataFrame.readCsv(simpleCsvUtf16leZip, charset = Charsets.UTF_16LE) shouldBe df
|
||||
DataFrame.readCsv(simpleCsvUtf16leZip, charset = Charsets.UTF_16BE) shouldNotBe df
|
||||
DataFrame.readCsv(simpleCsvUtf16leZip, charset = Charsets.UTF_8) shouldNotBe df
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `read ZIP Csv`() {
|
||||
DataFrame.readCsv(simpleCsvZip) shouldBe DataFrame.readCsv(simpleCsv)
|
||||
|
||||
shouldThrow<IllegalStateException> {
|
||||
DataFrame.readCsv(notCsv)
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `read GZ Csv`() {
|
||||
DataFrame.readCsv(simpleCsvGz) shouldBe DataFrame.readCsv(simpleCsv)
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `read custom compression Csv`() {
|
||||
DataFrame.readCsv(
|
||||
simpleCsvGz,
|
||||
compression = Compression(::GZIPInputStream),
|
||||
) shouldBe DataFrame.readCsv(simpleCsv)
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `read 2 compressed Csv`() {
|
||||
shouldThrow<IllegalArgumentException> { DataFrame.readCsv(twoCsvsZip) }
|
||||
}
|
||||
|
||||
@Test
|
||||
fun readCsvWithFrenchLocaleAndAlternativeDelimiter() {
|
||||
val df = DataFrame.readCsv(
|
||||
url = csvWithFrenchLocale,
|
||||
delimiter = ';',
|
||||
parserOptions = ParserOptions(locale = Locale.FRENCH),
|
||||
)
|
||||
|
||||
df.columnsCount() shouldBe 11
|
||||
df.rowsCount() shouldBe 5
|
||||
df.columnNames()[5] shouldBe "duplicate1"
|
||||
df.columnNames()[6] shouldBe "duplicate11"
|
||||
df["duplicate1"].type() shouldBe typeOf<Char?>()
|
||||
df["double"].type() shouldBe typeOf<Double?>()
|
||||
df["number"].type() shouldBe typeOf<Double>()
|
||||
df["time"].type() shouldBe typeOf<LocalDateTime>()
|
||||
|
||||
println(df)
|
||||
}
|
||||
|
||||
private fun assertColumnType(columnName: String, kClass: KClass<*>, schema: DataFrameSchema) {
|
||||
val col = schema.columns[columnName]
|
||||
col.shouldNotBeNull()
|
||||
col.type.classifier shouldBe kClass
|
||||
}
|
||||
|
||||
@Test
|
||||
fun readCsvWithFloats() {
|
||||
val df = DataFrame.readCsv(wineCsv, delimiter = ';')
|
||||
val schema = df.schema()
|
||||
|
||||
assertColumnType("citric acid", Double::class, schema)
|
||||
assertColumnType("alcohol", Double::class, schema)
|
||||
assertColumnType("quality", Int::class, schema)
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `read standard CSV with floats when user has alternative locale`() {
|
||||
val currentLocale = Locale.getDefault()
|
||||
try {
|
||||
Locale.setDefault(Locale.forLanguageTag("ru-RU"))
|
||||
val df = DataFrame.readCsv(wineCsv, delimiter = ';')
|
||||
val schema = df.schema()
|
||||
|
||||
assertColumnType("citric acid", Double::class, schema)
|
||||
assertColumnType("alcohol", Double::class, schema)
|
||||
assertColumnType("quality", Int::class, schema)
|
||||
} finally {
|
||||
Locale.setDefault(currentLocale)
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `read with custom header`() {
|
||||
val header = ('A'..'K').map { it.toString() }
|
||||
val df = DataFrame.readCsv(simpleCsv, header = header, skipLines = 1)
|
||||
df.columnNames() shouldBe header
|
||||
df["B"].type() shouldBe typeOf<Int>()
|
||||
|
||||
val headerShort = ('A'..'E').map { it.toString() }
|
||||
val dfShort = DataFrame.readCsv(simpleCsv, header = headerShort, skipLines = 1)
|
||||
dfShort.columnsCount() shouldBe 5
|
||||
dfShort.columnNames() shouldBe headerShort
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `read first rows`() {
|
||||
val expected =
|
||||
listOf(
|
||||
"untitled",
|
||||
"user_id",
|
||||
"name",
|
||||
"duplicate",
|
||||
"username",
|
||||
"duplicate1",
|
||||
"duplicate11",
|
||||
"double",
|
||||
"number",
|
||||
"time",
|
||||
"empty",
|
||||
)
|
||||
val dfHeader = DataFrame.readCsv(simpleCsv, readLines = 0)
|
||||
dfHeader.rowsCount() shouldBe 0
|
||||
dfHeader.columnNames() shouldBe expected
|
||||
|
||||
val dfThree = DataFrame.readCsv(simpleCsv, readLines = 3)
|
||||
dfThree.rowsCount() shouldBe 3
|
||||
|
||||
val dfFull = DataFrame.readCsv(simpleCsv, readLines = 10)
|
||||
dfFull.rowsCount() shouldBe 5
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `if string starts with a number, it should be parsed as a string anyway`() {
|
||||
@Language("CSV")
|
||||
val df = DataFrame.readCsvStr(
|
||||
"""
|
||||
duration,floatDuration
|
||||
12 min,1.0
|
||||
15,12.98 sec
|
||||
1 Season,0.9 parsec
|
||||
""".trimIndent(),
|
||||
)
|
||||
df["duration"].type() shouldBe typeOf<String>()
|
||||
df["floatDuration"].type() shouldBe typeOf<String>()
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `if record has fewer columns than header then pad it with nulls`() {
|
||||
@Language("CSV")
|
||||
val csvContent =
|
||||
"""
|
||||
col1,col2,col3
|
||||
568,801,587
|
||||
780,588
|
||||
""".trimIndent()
|
||||
|
||||
val df = shouldNotThrowAny {
|
||||
DataFrame.readCsvStr(csvContent)
|
||||
}
|
||||
|
||||
df shouldBe dataFrameOf("col1", "col2", "col3")(
|
||||
568, 801, 587,
|
||||
780, 588, null,
|
||||
)
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `write and read frame column`() {
|
||||
val df = dataFrameOf("a", "b", "c")(
|
||||
1, 2, 3,
|
||||
1, 3, 2,
|
||||
2, 1, 3,
|
||||
)
|
||||
val grouped = df.groupBy("a").into("g")
|
||||
val str = grouped.toCsvStr(escapeChar = null)
|
||||
val res = DataFrame.readCsvStr(str, quote = '"')
|
||||
res shouldBe grouped
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `write and read column group`() {
|
||||
val df = dataFrameOf("a", "b", "c")(
|
||||
1, 2, 3,
|
||||
1, 3, 2,
|
||||
)
|
||||
val grouped = df.group("b", "c").into("d")
|
||||
val str = grouped.toCsvStr()
|
||||
val res = DataFrame.readCsvStr(str)
|
||||
res shouldBe grouped
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `CSV String of saved dataframe starts with column name`() {
|
||||
val df = dataFrameOf("a")(1)
|
||||
df.toCsvStr().first() shouldBe 'a'
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `guess tsv`() {
|
||||
val df = DataFrame.read(testResource("abc.tsv"))
|
||||
df.columnsCount() shouldBe 3
|
||||
df.rowsCount() shouldBe 2
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `write csv without header produce correct file`() {
|
||||
val df = dataFrameOf("a", "b", "c")(
|
||||
1, 2, 3,
|
||||
1, 3, 2,
|
||||
)
|
||||
df.writeCsv(
|
||||
path = "src/test/resources/without_header.csv",
|
||||
includeHeader = false,
|
||||
recordSeparator = "\r\n",
|
||||
)
|
||||
val producedFile = File("src/test/resources/without_header.csv")
|
||||
producedFile.exists() shouldBe true
|
||||
producedFile.readText() shouldBe "1,2,3\r\n1,3,2\r\n"
|
||||
producedFile.delete()
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `check integrity of example data`() {
|
||||
shouldThrow<IllegalStateException> {
|
||||
// cannot read file with blank line at the start
|
||||
DataFrame.readCsv("../data/jetbrains repositories.csv")
|
||||
}
|
||||
shouldThrow<IllegalStateException> {
|
||||
// ignoreEmptyLines only ignores intermediate empty lines
|
||||
DataFrame.readCsv("../data/jetbrains repositories.csv", ignoreEmptyLines = true)
|
||||
}
|
||||
|
||||
val df = DataFrame.readCsv(
|
||||
"../data/jetbrains repositories.csv",
|
||||
skipLines = 1, // we need to skip the empty lines manually
|
||||
)
|
||||
df.columnNames() shouldBe listOf("full_name", "html_url", "stargazers_count", "topics", "watchers")
|
||||
df.columnTypes() shouldBe listOf(
|
||||
typeOf<String>(),
|
||||
typeOf<URL>(),
|
||||
typeOf<Int>(),
|
||||
typeOf<String>(),
|
||||
typeOf<Int>(),
|
||||
)
|
||||
// same file without empty line at the beginning
|
||||
df shouldBe DataFrame.readCsv("../data/jetbrains_repositories.csv")
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `readCsvStr delimiter`() {
|
||||
@Language("TSV")
|
||||
val tsv =
|
||||
"""
|
||||
a b c
|
||||
1 2 3
|
||||
""".trimIndent()
|
||||
val df = DataFrame.readCsvStr(tsv, '\t')
|
||||
df shouldBe dataFrameOf("a", "b", "c")(1, 2, 3)
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `file with BOM`() {
|
||||
val df = DataFrame.readCsv(withBomCsv, delimiter = ';')
|
||||
df.columnNames() shouldBe listOf("Column1", "Column2")
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `read empty CSV`() {
|
||||
val emptyDelimStr = DataFrame.readCsvStr("")
|
||||
emptyDelimStr shouldBe DataFrame.empty()
|
||||
|
||||
val emptyWidthStr = DataFrame.readCsvStr("", hasFixedWidthColumns = true)
|
||||
emptyWidthStr shouldBe DataFrame.empty()
|
||||
|
||||
val emptyCsvFile = DataFrame.readCsv(File.createTempFile("empty", "csv"))
|
||||
emptyCsvFile shouldBe DataFrame.empty()
|
||||
|
||||
val emptyCsvFileManualHeader = DataFrame.readCsv(
|
||||
file = File.createTempFile("empty", "csv"),
|
||||
header = listOf("a", "b", "c"),
|
||||
)
|
||||
emptyCsvFileManualHeader.apply {
|
||||
isEmpty() shouldBe true
|
||||
columnNames() shouldBe listOf("a", "b", "c")
|
||||
columnTypes() shouldBe listOf(typeOf<String>(), typeOf<String>(), typeOf<String>())
|
||||
}
|
||||
|
||||
val emptyCsvFileWithHeader = DataFrame.readCsv(
|
||||
file = File.createTempFile("empty", "csv").also { it.writeText("a,b,c") },
|
||||
)
|
||||
emptyCsvFileWithHeader.apply {
|
||||
isEmpty() shouldBe true
|
||||
columnNames() shouldBe listOf("a", "b", "c")
|
||||
columnTypes() shouldBe listOf(typeOf<String>(), typeOf<String>(), typeOf<String>())
|
||||
}
|
||||
|
||||
val emptyTsvStr = DataFrame.readTsv(File.createTempFile("empty", "tsv"))
|
||||
emptyTsvStr shouldBe DataFrame.empty()
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `read Csv with comments`() {
|
||||
@Language("CSV")
|
||||
val csv =
|
||||
"""
|
||||
# This is a comment
|
||||
a,b,c
|
||||
1,2,3
|
||||
""".trimIndent()
|
||||
val df = DataFrame.readCsvStr(csv, skipLines = 1L)
|
||||
df shouldBe dataFrameOf("a", "b", "c")(1, 2, 3)
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `csv with empty lines`() {
|
||||
@Language("CSV")
|
||||
val csv =
|
||||
"""
|
||||
a,b,c
|
||||
1,2,3
|
||||
|
||||
4,5,6
|
||||
""".trimIndent()
|
||||
val df1 = DataFrame.readCsvStr(csv)
|
||||
df1 shouldBe dataFrameOf("a", "b", "c")(
|
||||
1, 2, 3,
|
||||
null, null, null,
|
||||
4, 5, 6,
|
||||
)
|
||||
|
||||
val df2 = DataFrame.readCsvStr(csv, ignoreEmptyLines = true)
|
||||
df2 shouldBe dataFrameOf("a", "b", "c")(
|
||||
1, 2, 3,
|
||||
4, 5, 6,
|
||||
)
|
||||
|
||||
shouldThrow<IllegalStateException> { DataFrame.readCsvStr(csv, allowMissingColumns = false) }
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `don't read folder`() {
|
||||
shouldThrow<IllegalArgumentException> { DataFrame.readCsv("") }
|
||||
shouldThrow<IllegalArgumentException> { DataFrame.readCsv("NON EXISTENT FILE") }
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `cannot auto-parse specific date string`() {
|
||||
@Language("csv")
|
||||
val frenchCsv =
|
||||
"""
|
||||
name; price; date;
|
||||
a;12,45; 05/06/2021;
|
||||
b;-13,35;14/07/2025;
|
||||
c;100 123,35;;
|
||||
d;-204 235,23;;
|
||||
e;NaN;;
|
||||
f;null;;
|
||||
""".trimIndent()
|
||||
|
||||
val dfDeephaven = DataFrame.readCsvStr(
|
||||
text = frenchCsv,
|
||||
delimiter = ';',
|
||||
)
|
||||
|
||||
// could not parse, remains String
|
||||
dfDeephaven["date"].type() shouldBe typeOf<String?>()
|
||||
|
||||
val dfDataFrame = DataFrame.readCsvStr(
|
||||
text = frenchCsv,
|
||||
delimiter = ';',
|
||||
// setting any locale skips deephaven's date parsing
|
||||
parserOptions = ParserOptions(locale = Locale.ROOT),
|
||||
)
|
||||
|
||||
// could not parse, remains String
|
||||
dfDataFrame["date"].type() shouldBe typeOf<String?>()
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `parse with other locales`() {
|
||||
@Language("csv")
|
||||
val frenchCsv =
|
||||
"""
|
||||
name; price; date;
|
||||
a;12,45; 05/06/2021;
|
||||
b;-13,35;14/07/2025;
|
||||
c;100 123,35;;
|
||||
d;-204 235,23;;
|
||||
e;NaN;;
|
||||
f;null;;
|
||||
""".trimIndent()
|
||||
|
||||
val frenchDf = DataFrame.readCsvStr(
|
||||
text = frenchCsv,
|
||||
delimiter = ';',
|
||||
parserOptions = ParserOptions(
|
||||
dateTimePattern = "dd/MM/yyyy",
|
||||
locale = Locale.FRENCH,
|
||||
),
|
||||
)
|
||||
|
||||
frenchDf["price"].type() shouldBe typeOf<Double?>()
|
||||
frenchDf["date"].type() shouldBe typeOf<LocalDate?>()
|
||||
|
||||
@Language("csv")
|
||||
val dutchCsv =
|
||||
"""
|
||||
name; price;
|
||||
a;12,45;
|
||||
b;-13,35;
|
||||
c;100.123,35;
|
||||
d;-204.235,23;
|
||||
e;NaN;
|
||||
f;null;
|
||||
""".trimIndent()
|
||||
|
||||
val dutchDf = DataFrame.readCsvStr(
|
||||
text = dutchCsv,
|
||||
delimiter = ';',
|
||||
parserOptions = ParserOptions(
|
||||
locale = Locale.forLanguageTag("nl-NL"),
|
||||
),
|
||||
)
|
||||
|
||||
dutchDf["price"].type() shouldBe typeOf<Double?>()
|
||||
|
||||
// skipping this test on windows due to lack of support for Arabic locales
|
||||
if (!System.getProperty("os.name").startsWith("Windows")) {
|
||||
// while negative numbers in RTL languages cannot be parsed thanks to Java, others work
|
||||
@Language("csv")
|
||||
val arabicCsv =
|
||||
"""
|
||||
الاسم; السعر;
|
||||
أ;١٢٫٤٥;
|
||||
ب;١٣٫٣٥;
|
||||
ج;١٠٠٫١٢٣;
|
||||
د;٢٠٤٫٢٣٥;
|
||||
هـ;ليس رقم;
|
||||
و;null;
|
||||
""".trimIndent()
|
||||
|
||||
val easternArabicDf = DataFrame.readCsvStr(
|
||||
arabicCsv,
|
||||
delimiter = ';',
|
||||
parserOptions = ParserOptions(
|
||||
locale = Locale.forLanguageTag("ar-001"),
|
||||
),
|
||||
)
|
||||
|
||||
easternArabicDf["السعر"].type() shouldBe typeOf<Double?>()
|
||||
easternArabicDf["الاسم"].type() shouldBe typeOf<String>() // apparently not a char
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `handle slightly mixed locales`() {
|
||||
@Language("csv")
|
||||
val estonianWrongMinus =
|
||||
"""
|
||||
name; price;
|
||||
a;12,45;
|
||||
b;-13,35;
|
||||
c;100 123,35;
|
||||
d;-204 235,23;
|
||||
e;NaN;
|
||||
f;null;
|
||||
""".trimIndent()
|
||||
|
||||
val estonianDf1 = DataFrame.readCsvStr(
|
||||
text = estonianWrongMinus,
|
||||
delimiter = ';',
|
||||
parserOptions = ParserOptions(
|
||||
locale = Locale.forLanguageTag("et-EE"),
|
||||
),
|
||||
)
|
||||
|
||||
estonianDf1["price"].type() shouldBe typeOf<Double?>()
|
||||
|
||||
// also test the global setting
|
||||
DataFrame.parser.locale = Locale.forLanguageTag("et-EE")
|
||||
|
||||
val estonianDf2 = DataFrame.readCsvStr(
|
||||
text = estonianWrongMinus,
|
||||
delimiter = ';',
|
||||
)
|
||||
estonianDf2 shouldBe estonianDf1
|
||||
|
||||
DataFrame.parser.resetToDefault()
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `NA and custom null string in double column`() {
|
||||
val df1 = DataFrame.readCsv(
|
||||
msleepCsv,
|
||||
parserOptions = ParserOptions(
|
||||
nullStrings = DEFAULT_DELIM_NULL_STRINGS + "nothing",
|
||||
),
|
||||
)
|
||||
|
||||
df1["name"].type() shouldBe typeOf<String>()
|
||||
df1["genus"].type() shouldBe typeOf<String>()
|
||||
df1["vore"].type() shouldBe typeOf<String?>()
|
||||
df1["order"].type() shouldBe typeOf<String>()
|
||||
df1["conservation"].type() shouldBe typeOf<String?>()
|
||||
df1["sleep_total"].type() shouldBe typeOf<Double>()
|
||||
df1["sleep_rem"].type() shouldBe typeOf<Double?>()
|
||||
df1["sleep_cycle"].type() shouldBe typeOf<Double?>()
|
||||
df1["awake"].type() shouldBe typeOf<Double>()
|
||||
df1["brainwt"].type() shouldBe typeOf<Double?>()
|
||||
df1["bodywt"].type() shouldBe typeOf<Double?>()
|
||||
|
||||
// Also test the global setting
|
||||
DataFrame.parser.addNullString("nothing")
|
||||
DEFAULT_DELIM_NULL_STRINGS.forEach {
|
||||
DataFrame.parser.addNullString(it)
|
||||
}
|
||||
|
||||
val df2 = DataFrame.readCsv(msleepCsv)
|
||||
df2 shouldBe df1
|
||||
|
||||
DataFrame.parser.resetToDefault()
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `multiple spaces as delimiter`() {
|
||||
@Language("csv")
|
||||
val csv =
|
||||
"""
|
||||
NAME STATUS AGE NUMBER LABELS
|
||||
argo-events Active 2y77d 1234 app.kubernetes.io/instance=argo-events,kubernetes.io/metadata.name=argo-events
|
||||
argo-workflows Active 2y77d 1234 app.kubernetes.io/instance=argo-workflows,kubernetes.io/metadata.name=argo-workflows
|
||||
argocd Active 5y18d 1234 kubernetes.io/metadata.name=argocd
|
||||
beta Active 4y235d 1234 kubernetes.io/metadata.name=beta
|
||||
""".trimIndent()
|
||||
|
||||
val df1 = DataFrame.readCsvStr(
|
||||
text = csv,
|
||||
hasFixedWidthColumns = true,
|
||||
)
|
||||
|
||||
df1["NAME"].type() shouldBe typeOf<String>()
|
||||
df1["STATUS"].type() shouldBe typeOf<String>()
|
||||
df1["AGE"].type() shouldBe typeOf<String>()
|
||||
df1["NUMBER"].type() shouldBe typeOf<Int>()
|
||||
df1["LABELS"].type() shouldBe typeOf<String>()
|
||||
|
||||
val df2 = DataFrame.readCsvStr(
|
||||
text = csv,
|
||||
hasFixedWidthColumns = true,
|
||||
fixedColumnWidths = listOf(25, 9, 9, 9, 100),
|
||||
skipLines = 1,
|
||||
header = listOf("name", "status", "age", "number", "labels"),
|
||||
)
|
||||
|
||||
df2["name"].type() shouldBe typeOf<String>()
|
||||
df2["status"].type() shouldBe typeOf<String>()
|
||||
df2["age"].type() shouldBe typeOf<String>()
|
||||
df2["number"].type() shouldBe typeOf<Int>()
|
||||
df2["labels"].type() shouldBe typeOf<String>()
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `handle default coltype with other parameters`() {
|
||||
val df = DataFrame.readCsv(
|
||||
simpleCsv,
|
||||
header = listOf("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"),
|
||||
skipLines = 2,
|
||||
colTypes = mapOf(
|
||||
"a" to ColType.Int,
|
||||
"b" to ColType.Double,
|
||||
ColType.DEFAULT to ColType.String,
|
||||
),
|
||||
)
|
||||
|
||||
df.columnTypes().shouldContainInOrder(
|
||||
typeOf<Int>(),
|
||||
typeOf<Double>(),
|
||||
typeOf<String>(),
|
||||
typeOf<String?>(),
|
||||
typeOf<String>(),
|
||||
typeOf<String?>(),
|
||||
typeOf<String?>(),
|
||||
typeOf<String?>(),
|
||||
typeOf<String>(),
|
||||
typeOf<String>(),
|
||||
typeOf<String?>(),
|
||||
)
|
||||
df.rowsCount() shouldBe 4
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `skipping types`() {
|
||||
val df1 = DataFrame.readCsv(
|
||||
irisDataset,
|
||||
colTypes = mapOf("sepal.length" to ColType.Double),
|
||||
parserOptions = ParserOptions(
|
||||
skipTypes = setOf(typeOf<Double>()),
|
||||
),
|
||||
)
|
||||
|
||||
df1["sepal.length"].type() shouldBe typeOf<Double>()
|
||||
df1["sepal.width"].type() shouldBe typeOf<BigDecimal>()
|
||||
df1["petal.length"].type() shouldBe typeOf<BigDecimal>()
|
||||
df1["petal.width"].type() shouldBe typeOf<BigDecimal>()
|
||||
df1["variety"].type() shouldBe typeOf<String>()
|
||||
|
||||
// Also test the global setting
|
||||
DataFrame.parser.addSkipType(typeOf<Double>())
|
||||
|
||||
val df2 = DataFrame.readCsv(
|
||||
irisDataset,
|
||||
colTypes = mapOf("sepal.length" to ColType.Double),
|
||||
)
|
||||
df2 shouldBe df1
|
||||
|
||||
DataFrame.parser.resetToDefault()
|
||||
}
|
||||
|
||||
// Issue #921
|
||||
@Test
|
||||
fun `read csv with custom null strings and given type`() {
|
||||
@Language("CSV")
|
||||
val csv =
|
||||
"""
|
||||
a,b
|
||||
noppes,2
|
||||
1.2,
|
||||
3,45
|
||||
,noppes
|
||||
1.3,1
|
||||
""".trimIndent()
|
||||
|
||||
val df1 = DataFrame.readCsvStr(
|
||||
csv,
|
||||
parserOptions = ParserOptions(
|
||||
nullStrings = setOf("noppes", ""),
|
||||
),
|
||||
colTypes = mapOf("a" to ColType.Double, "b" to ColType.Int),
|
||||
)
|
||||
df1 shouldBe dataFrameOf("a", "b")(
|
||||
null, 2,
|
||||
1.2, null,
|
||||
3.0, 45,
|
||||
null, null,
|
||||
1.3, 1,
|
||||
)
|
||||
|
||||
// Also test the global setting
|
||||
DataFrame.parser.addNullString("noppes")
|
||||
DataFrame.parser.addNullString("")
|
||||
|
||||
val df2 = DataFrame.readCsvStr(
|
||||
csv,
|
||||
colTypes = mapOf("a" to ColType.Double, "b" to ColType.Int),
|
||||
)
|
||||
|
||||
df2 shouldBe df1
|
||||
|
||||
DataFrame.parser.resetToDefault()
|
||||
}
|
||||
|
||||
// Issue #1047
|
||||
@Test
|
||||
fun `Only use Deephaven datetime parser with custom csv specs`() {
|
||||
@Language("csv")
|
||||
val csvContent =
|
||||
"""
|
||||
with_timezone_offset,without_timezone_offset
|
||||
2024-12-12T13:00:00+01:00,2024-12-12T13:00:00
|
||||
""".trimIndent()
|
||||
|
||||
// use DFs parsers by default for datetime-like columns
|
||||
val df1 = DataFrame.readCsvStr(csvContent)
|
||||
df1["with_timezone_offset"].let {
|
||||
it.type() shouldBe typeOf<StdlibInstant>()
|
||||
it[0] shouldBe StdlibInstant.parse("2024-12-12T13:00:00+01:00")
|
||||
}
|
||||
df1["without_timezone_offset"].let {
|
||||
it.type() shouldBe typeOf<LocalDateTime>()
|
||||
it[0] shouldBe LocalDateTime.parse("2024-12-12T13:00:00")
|
||||
}
|
||||
|
||||
// enable fast datetime parser for the first column with adjustCsvSpecs
|
||||
val df2 = DataFrame.readCsv(
|
||||
inputStream = csvContent.byteInputStream(),
|
||||
adjustCsvSpecs = {
|
||||
putParserForName("with_timezone_offset", Parsers.DATETIME)
|
||||
},
|
||||
)
|
||||
df2["with_timezone_offset"].let {
|
||||
it.type() shouldBe typeOf<LocalDateTime>()
|
||||
it[0] shouldBe LocalDateTime.parse("2024-12-12T12:00:00")
|
||||
}
|
||||
df2["without_timezone_offset"].let {
|
||||
it.type() shouldBe typeOf<LocalDateTime>()
|
||||
it[0] shouldBe LocalDateTime.parse("2024-12-12T13:00:00")
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `test parsing kotlin-time-Instant`() {
|
||||
@Language("csv")
|
||||
val csvContent =
|
||||
"""
|
||||
with_timezone_offset,without_timezone_offset
|
||||
2024-12-12T13:00:00+01:00,2024-12-12T13:00:00
|
||||
""".trimIndent()
|
||||
|
||||
DataFrame.parser.parseExperimentalInstant = true
|
||||
|
||||
// use DFs parsers by default for datetime-like columns
|
||||
val df1 = DataFrame.readCsvStr(csvContent)
|
||||
df1["with_timezone_offset"].let {
|
||||
it.type() shouldBe typeOf<StdlibInstant>()
|
||||
it[0] shouldBe StdlibInstant.parse("2024-12-12T13:00:00+01:00")
|
||||
}
|
||||
|
||||
DataFrame.parser.resetToDefault()
|
||||
}
|
||||
|
||||
@Test
|
||||
fun `json dependency test`() {
|
||||
val df = dataFrameOf("firstName", "lastName")(
|
||||
"John", "Doe",
|
||||
"Jane", "Doe",
|
||||
).group { "firstName" and "lastName" }.into { "name" }
|
||||
|
||||
df.toCsvStr(quote = '\'') shouldBe
|
||||
"""
|
||||
name
|
||||
'{"firstName":"John","lastName":"Doe"}'
|
||||
'{"firstName":"Jane","lastName":"Doe"}'
|
||||
|
||||
""".trimIndent()
|
||||
}
|
||||
|
||||
companion object {
|
||||
private val irisDataset = testCsv("irisDataset")
|
||||
private val simpleCsv = testCsv("testCSV")
|
||||
private val simpleCsvUtf16le = testCsv("testCSV-utf-16-le-bom")
|
||||
private val simpleCsvUtf16leGz = testResource("testCSV-utf16le-bom.csv.gz")
|
||||
private val simpleCsvUtf16leZip = testResource("testCSV-utf-16-le-bom.zip")
|
||||
private val simpleCsvZip = testResource("testCSV.zip")
|
||||
private val twoCsvsZip = testResource("two csvs.zip")
|
||||
private val simpleCsvGz = testResource("testCSV.csv.gz")
|
||||
private val csvWithFrenchLocale = testCsv("testCSVwithFrenchLocale")
|
||||
private val wineCsv = testCsv("wine")
|
||||
private val withBomCsv = testCsv("with-bom")
|
||||
private val msleepCsv = testCsv("msleep")
|
||||
private val notCsv = testResource("not-csv.zip")
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Resolves a test resource on the classpath.
 *
 * Replaces the bare `!!` so a missing fixture fails with a message naming the
 * offending path instead of an anonymous NullPointerException.
 *
 * @param resourcePath the classpath-relative path of the resource.
 * @return the resource's [URL].
 * @throws IllegalArgumentException if the resource is not on the classpath.
 */
fun testResource(resourcePath: String): URL =
    requireNotNull(DelimCsvTsvTests::class.java.classLoader.getResource(resourcePath)) {
        "Test resource not found on classpath: $resourcePath"
    }
|
||||
|
||||
/** Shorthand for a `.csv` fixture: resolves `"$csvName.csv"` via [testResource]. */
fun testCsv(csvName: String): URL = testResource("$csvName.csv")
|
||||