118 lines
16 KiB
HTML
Vendored
118 lines
16 KiB
HTML
Vendored
<!DOCTYPE html PUBLIC ""
|
|
"">
|
|
<html><head><meta charset="UTF-8" /><title>tech.v3.dataset.join documentation</title><script async="true" src="https://www.googletagmanager.com/gtag/js?id=G-RGTB4J7LGP"></script><script>window.dataLayer = window.dataLayer || [];
|
|
function gtag(){dataLayer.push(arguments);}
|
|
gtag('js', new Date());
|
|
|
|
gtag('config', 'G-95TVFC1FEB');</script><link rel="stylesheet" type="text/css" href="css/default.css" /><link rel="stylesheet" type="text/css" href="highlight/solarized-light.css" /><script type="text/javascript" src="highlight/highlight.min.js"></script><script type="text/javascript" src="js/jquery.min.js"></script><script type="text/javascript" src="js/page_effects.js"></script><script>hljs.initHighlightingOnLoad();</script></head><body><div id="header"><h2>Generated by <a href="https://github.com/weavejester/codox">Codox</a> with <a href="https://github.com/xsc/codox-theme-rdash">RDash UI</a> theme</h2><h1><a href="index.html"><span class="project-title"><span class="project-name">TMD</span> <span class="project-version">8.003</span></span></a></h1></div><div class="sidebar primary"><h3 class="no-link"><span class="inner">Project</span></h3><ul class="index-link"><li class="depth-1 "><a href="index.html"><div class="inner">Index</div></a></li></ul><h3 class="no-link"><span class="inner">Topics</span></h3><ul><li class="depth-1 "><a href="000-getting-started.html"><div class="inner"><span>tech.ml.dataset Getting Started</span></div></a></li><li class="depth-1 "><a href="100-walkthrough.html"><div class="inner"><span>tech.ml.dataset Walkthrough</span></div></a></li><li class="depth-1 "><a href="200-quick-reference.html"><div class="inner"><span>tech.ml.dataset Quick Reference</span></div></a></li><li class="depth-1 "><a href="columns-readers-and-datatypes.html"><div class="inner"><span>tech.ml.dataset Columns, Readers, and Datatypes</span></div></a></li><li class="depth-1 "><a href="nippy-serialization-rocks.html"><div class="inner"><span>tech.ml.dataset And nippy</span></div></a></li><li class="depth-1 "><a href="supported-datatypes.html"><div class="inner"><span>tech.ml.dataset Supported Datatypes</span></div></a></li></ul><h3 class="no-link"><span class="inner">Namespaces</span></h3><ul><li class="depth-1"><div class="no-link"><div class="inner"><span 
class="tree"><span class="top"></span><span class="bottom"></span></span><span>tech</span></div></div></li><li class="depth-2"><div class="no-link"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>v3</span></div></div></li><li class="depth-3"><a href="tech.v3.dataset.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>dataset</span></div></a></li><li class="depth-4 branch"><a href="tech.v3.dataset.categorical.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>categorical</span></div></a></li><li class="depth-4 branch"><a href="tech.v3.dataset.clipboard.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>clipboard</span></div></a></li><li class="depth-4 branch"><a href="tech.v3.dataset.column.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>column</span></div></a></li><li class="depth-4 branch"><a href="tech.v3.dataset.column-filters.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>column-filters</span></div></a></li><li class="depth-4"><div class="no-link"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>io</span></div></div></li><li class="depth-5 branch"><a href="tech.v3.dataset.io.csv.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>csv</span></div></a></li><li class="depth-5 branch"><a href="tech.v3.dataset.io.datetime.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>datetime</span></div></a></li><li class="depth-5 branch"><a href="tech.v3.dataset.io.string-row-parser.html"><div class="inner"><span class="tree"><span class="top"></span><span 
class="bottom"></span></span><span>string-row-parser</span></div></a></li><li class="depth-5"><a href="tech.v3.dataset.io.univocity.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>univocity</span></div></a></li><li class="depth-4 branch current"><a href="tech.v3.dataset.join.html"><div class="inner"><span class="tree" style="top: -145px;"><span class="top" style="height: 154px;"></span><span class="bottom"></span></span><span>join</span></div></a></li><li class="depth-4 branch"><a href="tech.v3.dataset.math.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>math</span></div></a></li><li class="depth-4 branch"><a href="tech.v3.dataset.metamorph.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>metamorph</span></div></a></li><li class="depth-4 branch"><a href="tech.v3.dataset.modelling.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>modelling</span></div></a></li><li class="depth-4 branch"><a href="tech.v3.dataset.print.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>print</span></div></a></li><li class="depth-4"><a href="tech.v3.dataset.reductions.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>reductions</span></div></a></li><li class="depth-5"><a href="tech.v3.dataset.reductions.apache-data-sketch.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>apache-data-sketch</span></div></a></li><li class="depth-4 branch"><a href="tech.v3.dataset.rolling.html"><div class="inner"><span class="tree" style="top: -52px;"><span class="top" style="height: 61px;"></span><span class="bottom"></span></span><span>rolling</span></div></a></li><li class="depth-4 
branch"><a href="tech.v3.dataset.set.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>set</span></div></a></li><li class="depth-4 branch"><a href="tech.v3.dataset.tensor.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>tensor</span></div></a></li><li class="depth-4"><a href="tech.v3.dataset.zip.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>zip</span></div></a></li><li class="depth-3"><div class="no-link"><div class="inner"><span class="tree" style="top: -641px;"><span class="top" style="height: 650px;"></span><span class="bottom"></span></span><span>libs</span></div></div></li><li class="depth-4 branch"><a href="tech.v3.libs.arrow.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>arrow</span></div></a></li><li class="depth-4 branch"><a href="tech.v3.libs.clj-transit.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>clj-transit</span></div></a></li><li class="depth-4 branch"><a href="tech.v3.libs.fastexcel.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>fastexcel</span></div></a></li><li class="depth-4"><div class="no-link"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>guava</span></div></div></li><li class="depth-5"><a href="tech.v3.libs.guava.cache.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>cache</span></div></a></li><li class="depth-4 branch"><a href="tech.v3.libs.parquet.html"><div class="inner"><span class="tree" style="top: -52px;"><span class="top" style="height: 61px;"></span><span class="bottom"></span></span><span>parquet</span></div></a></li><li class="depth-4 branch"><a 
href="tech.v3.libs.poi.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>poi</span></div></a></li><li class="depth-4"><a href="tech.v3.libs.tribuo.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>tribuo</span></div></a></li></ul></div><div class="sidebar secondary"><h3><a href="#top"><span class="inner">Public Vars</span></a></h3><ul><li class="depth-1"><a href="tech.v3.dataset.join.html#var-hash-join"><div class="inner"><span>hash-join</span></div></a></li><li class="depth-1"><a href="tech.v3.dataset.join.html#var-inner-join"><div class="inner"><span>inner-join</span></div></a></li><li class="depth-1"><a href="tech.v3.dataset.join.html#var-left-join"><div class="inner"><span>left-join</span></div></a></li><li class="depth-1"><a href="tech.v3.dataset.join.html#var-left-join-asof"><div class="inner"><span>left-join-asof</span></div></a></li><li class="depth-1"><a href="tech.v3.dataset.join.html#var-pd-merge"><div class="inner"><span>pd-merge</span></div></a></li><li class="depth-1"><a href="tech.v3.dataset.join.html#var-right-join"><div class="inner"><span>right-join</span></div></a></li></ul></div><div class="namespace-docs" id="content"><h1 class="anchor" id="top">tech.v3.dataset.join</h1><div class="doc"><div class="markdown"><p>implementation of join algorithms, both exact (hash-join) and near.</p>
|
|
</div></div><div class="public anchor" id="var-hash-join"><h3>hash-join</h3><div class="usage"><code>(hash-join colname lhs rhs)</code><code>(hash-join colname lhs rhs {:keys [operation-space], :or {operation-space :int32}, :as options})</code></div><div class="doc"><div class="markdown"><p>Join by column. For efficiency, lhs should be smaller than rhs.
|
|
colname - may be a single item or a tuple, in which case it is destructured as:
|
|
(let [[lhs-colname rhs-colname] colname] ...)
|
|
An options map can be passed in with optional arguments:
|
|
:lhs-missing? Calculate the missing lhs indexes and left outer join table.
|
|
:rhs-missing? Calculate the missing rhs indexes and right outer join table.
|
|
:operation-space - either :int32 or :int64. Defaults to :int32.
|
|
Returns
|
|
{:join-table - joined-table
|
|
:lhs-indexes - matched lhs indexes
|
|
:rhs-indexes - matched rhs indexes
|
|
;; -- when rhs-missing? is true --
|
|
:rhs-missing - missing indexes of rhs.
|
|
:rhs-outer-join - rhs outer join table.
|
|
;; -- when lhs-missing? is true --
|
|
:lhs-missing - missing indexes of lhs.
|
|
:lhs-outer-join - lhs outer join table.
|
|
}</p>
|
|
</div></div><div class="src-link"><a href="https://github.com/techascent/tech.ml.dataset/blob/master/src/tech/v3/dataset/join.clj#L270">view source</a></div></div><div class="public anchor" id="var-inner-join"><h3>inner-join</h3><div class="usage"><code>(inner-join colname lhs rhs)</code><code>(inner-join colname lhs rhs options)</code></div><div class="doc"><div class="markdown"><p>Inner join by column. For efficiency, lhs should be smaller than rhs.
|
|
colname - may be a single item or a tuple, in which case it is destructured as:
|
|
(let [[lhs-colname rhs-colname] colname] ...)
|
|
An options map can be passed in with optional arguments:
|
|
:operation-space - either :int32 or :int64. Defaults to :int32.
|
|
Returns the joined table</p>
|
|
</div></div><div class="src-link"><a href="https://github.com/techascent/tech.ml.dataset/blob/master/src/tech/v3/dataset/join.clj#L308">view source</a></div></div><div class="public anchor" id="var-left-join"><h3>left-join</h3><div class="usage"><code>(left-join colname lhs rhs)</code><code>(left-join colname lhs rhs options)</code></div><div class="doc"><div class="markdown"><p>Left join by column. For efficiency, lhs should be smaller than rhs.
|
|
colname - may be a single item or a tuple, in which case it is destructured as:
|
|
(let [[lhs-colname rhs-colname] colname] ...)
|
|
An options map can be passed in with optional arguments:
|
|
:operation-space - either :int32 or :int64. Defaults to :int32.
|
|
Returns the joined table</p>
|
|
</div></div><div class="src-link"><a href="https://github.com/techascent/tech.ml.dataset/blob/master/src/tech/v3/dataset/join.clj#L336">view source</a></div></div><div class="public anchor" id="var-left-join-asof"><h3>left-join-asof</h3><div class="usage"><code>(left-join-asof colname lhs rhs {:keys [asof-op], :or {asof-op :<=}})</code><code>(left-join-asof colname lhs rhs)</code></div><div class="doc"><div class="markdown"><p>Perform a left join asof. Similar to left join except this will join on nearest
|
|
value. lhs and rhs must be sorted by join-column. join columns must be either
|
|
datetime columns in which the join happens in millisecond space or they must be
|
|
numeric - integer or floating point datatypes.</p>
|
|
<p>Options:</p>
|
|
<ul>
|
|
<li><code>:asof-op</code> - may be one of <code>[:&lt; :&lt;= :nearest :&gt;= :&gt;]</code> - type of join operation. Defaults to
|
|
<code>:&lt;=</code>.</li>
|
|
</ul>
|
|
</div></div><div class="src-link"><a href="https://github.com/techascent/tech.ml.dataset/blob/master/src/tech/v3/dataset/join.clj#L747">view source</a></div></div><div class="public anchor" id="var-pd-merge"><h3>pd-merge</h3><div class="usage"><code>(pd-merge left-ds right-ds options)</code><code>(pd-merge left-ds right-ds)</code></div><div class="doc"><div class="markdown"><p>Pandas-style <a href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html">merge</a>.
|
|
This is similar to join except it can merge on multiple columns for the left and right
|
|
sides.</p>
|
|
<p>Options:</p>
|
|
<ul>
|
|
<li><code>:on</code> - Column name or list of column names. Names must be found in both datasets.</li>
|
|
<li><code>:left-on</code> - Column name or list of column names</li>
|
|
<li><code>:right-on</code> - Column name or list of column names</li>
|
|
<li><code>:how</code> - left, right, inner, outer, or cross. If cross, then none of on, left-on, or right-on can
|
|
be provided.</li>
|
|
</ul>
|
|
<p>Examples:</p>
|
|
<pre><code class="language-clojure">user> (require '[tech.v3.dataset :as ds])
|
|
nil
|
|
user> (require '[tech.v3.dataset.join :as ds-join])
|
|
nil
|
|
user> (def ds-a (ds/->dataset {:a [:a :b :b :a :c]
|
|
:b (range 5)
|
|
:c (range 5)}))
|
|
#'user/ds-a
|
|
user> (def ds-b (ds/->dataset {:a [:a :b :a :b :d]
|
|
:b (range 5)
|
|
:c (range 6 11)}))
|
|
#'user/ds-b
|
|
user> ds-a
|
|
_unnamed [5 3]:
|
|
|
|
| :a | :b | :c |
|
|
|----|---:|---:|
|
|
| :a | 0 | 0 |
|
|
| :b | 1 | 1 |
|
|
| :b | 2 | 2 |
|
|
| :a | 3 | 3 |
|
|
| :c | 4 | 4 |
|
|
user> ds-b
|
|
_unnamed [5 3]:
|
|
|
|
| :a | :b | :c |
|
|
|----|---:|---:|
|
|
| :a | 0 | 6 |
|
|
| :b | 1 | 7 |
|
|
| :a | 2 | 8 |
|
|
| :b | 3 | 9 |
|
|
| :d | 4 | 10 |
|
|
user> (ds-join/pd-merge ds-a ds-b {:on [:a :b] :how :inner})
|
|
inner-join [2 4]:
|
|
|
|
| :a | :b | :c | :right.c |
|
|
|----|---:|---:|---------:|
|
|
| :a | 0 | 0 | 6 |
|
|
| :b | 1 | 1 | 7 |
|
|
user> (ds-join/pd-merge ds-a ds-b {:on [:a :b] :how :outer})
|
|
outer-join [8 4]:
|
|
|
|
| :a | :b | :c | :right.c |
|
|
|----|---:|---:|---------:|
|
|
| :a | 0 | 0 | 6 |
|
|
| :b | 1 | 1 | 7 |
|
|
| :b | 2 | 2 | |
|
|
| :a | 3 | 3 | |
|
|
| :c | 4 | 4 | |
|
|
| :a | 2 | | 8 |
|
|
| :b | 3 | | 9 |
|
|
| :d | 4 | | 10 |
|
|
</code></pre>
|
|
</div></div><div class="src-link"><a href="https://github.com/techascent/tech.ml.dataset/blob/master/src/tech/v3/dataset/join.clj#L376">view source</a></div></div><div class="public anchor" id="var-right-join"><h3>right-join</h3><div class="usage"><code>(right-join colname lhs rhs)</code><code>(right-join colname lhs rhs options)</code></div><div class="doc"><div class="markdown"><p>Right join by column. For efficiency, lhs should be smaller than rhs.
|
|
colname - may be a single item or a tuple, in which case it is destructured as:
|
|
(let [[lhs-colname rhs-colname] colname] ...)
|
|
An options map can be passed in with optional arguments:
|
|
:operation-space - either :int32 or :int64. Defaults to :int32.
|
|
Returns the joined table</p>
|
|
</div></div><div class="src-link"><a href="https://github.com/techascent/tech.ml.dataset/blob/master/src/tech/v3/dataset/join.clj#L322">view source</a></div></div></div></body></html> |