(ns tech.v3.dataset.ames-test (:require [tech.v3.dataset.column :as ds-col] [tech.v3.dataset :as ds] [tech.v3.dataset.column-filters :as cf] [tech.v3.dataset.modelling :as ds-mod] [tech.v3.dataset.math :as ds-math] [tech.v3.dataset.neanderthal :as ds-nean] [tech.v3.datatype :as dtype] [tech.v3.datatype.functional :as dfn] [taoensso.nippy :as nippy] [clojure.set :as c-set] [clojure.pprint :as pp] [clojure.data :as data] [clojure.test :refer [deftest is]] [clojure.tools.logging :as log])) (deftest tablesaw-col-subset-test (let [test-col (ds-col/new-column "unnamed" (range 10)) select-vec [3 5 7 3 2 1] new-col (ds-col/select test-col select-vec)] (is (= select-vec (dtype/->vector new-col))))) (def src-ds (ds/->dataset "test/data/ames-house-prices/train.csv" {:parser-fn {"CentralAir" :boolean}})) (defn missing-pipeline [dataset] (ds/bind-> (ds/->dataset dataset) ds (ds/remove-column "Id") (ds/update cf/string ds/replace-missing-value "NA") (ds/update-elemwise cf/string #(get {"" "NA"} % %)) (ds/update cf/numeric ds/replace-missing-value 0) (ds/update cf/boolean ds/replace-missing-value false) (ds/update-columnwise (cf/union (cf/numeric ds) (cf/boolean ds)) #(dtype/elemwise-cast % :float64)))) (def original-missing #{"LotFrontage" "Alley" "MasVnrType" "MasVnrArea" "BsmtQual" "BsmtCond" "BsmtExposure" "BsmtFinType1" "BsmtFinType2" "Electrical" "FireplaceQu" "GarageType" "GarageYrBlt" "GarageFinish" "GarageQual" "GarageCond" "PoolQC" "Fence" "MiscFeature"}) (deftest basic-pipeline-test (let [dataset (missing-pipeline src-ds)] (is (= original-missing (set (map :column-name (ds/columns-with-missing-seq src-ds)))) (with-out-str (pp/pprint (data/diff original-missing (set (map :column-name (ds/columns-with-missing-seq src-ds))))))) (is (= 0 (count (ds/columns-with-missing-seq dataset)))) (is (= 42 (ds/column-count (cf/categorical dataset)))) (is (= #{:string :float64} (->> (ds/columns dataset) (map dtype/get-datatype) set))))) (deftest log1p-changes-datatype ;;This causes actual data corruption--if the column datatype gets clipped ;;back to an integer type you get values like 12 instead of 12.5. For this ;;dataset that destroys the accuracy so we make sure the log1p operation does ;;in fact change the datatype correctly. (is (dfn/equals [12.24769911637256 12.109016442313738 12.317171167298682 11.849404844423074 12.429220196836383] (-> (ds/update-columns src-ds ["SalePrice"] dfn/log1p) (ds/select-rows (range 5)) (ds/column "SalePrice") (vec))))) (defn skew-column-filter [dataset] (ds/bind-> (dissoc dataset "SalePrice") ds (cf/numeric) (cf/difference (cf/categorical ds)) (cf/column-filter #(> (Math/abs (dfn/skew %)) 0.5)))) (def old-cols #{"TotalBsmtSF" "YearRemodAdd" "LotFrontage" "PoolArea" "BsmtFinSF2" "YearBuilt" "LowQualFinSF" "GrLivArea" "MSSubClass" "WoodDeckSF" "KitchenAbvGr" "Fireplaces" "3SsnPorch" "OverallCond" "1stFlrSF" "EnclosedPorch" "MiscVal" "2ndFlrSF" "TotRmsAbvGrd" "GarageYrBlt" "BsmtHalfBath" "OpenPorchSF" "BsmtFinSF1" "LotArea" "MasVnrArea" "ScreenPorch" "BsmtFullBath" "BsmtUnfSF" "HalfBath"}) (deftest custom-colfilter-test (is (= old-cols (-> (skew-column-filter src-ds) (ds/column-names) (set))))) (defn string-and-math [dataset] (ds/column dataset "PoolQC") (let [initial-ds (-> dataset (ds/categorical->number ["Utilities"] [["NA" -1] "ELO" "NoSeWa" "NoSewr" "AllPub"]) (ds/categorical->number ["LandSlope"] ["Gtl" "Mod" "Sev" "NA"]) (ds/categorical->number ["ExterQual" "ExterCond" "BsmtQual" "BsmtCond" "HeatingQC" "KitchenQual" "FireplaceQu" "GarageQual" "GarageCond" "PoolQC"] ["Ex" "Gd" "TA" "Fa" "Po" "NA"]) (ds/assoc-metadata ["MSSubClass" "OverallQual" "OverallCond"] :categorical? true) (ds/update-column "MasVnrType" (fn [col] (map #(case % ("BrkCmn" "BrkFace" "CBlock" "Stone" ) "Brk" %) col))) (ds/update-column "SaleCondition" (fn [col] (map #(case % ("Abnorml" "Alloca" "AdjLand" "Family" "Normal" ) "sale-1" %) col))) (ds/categorical->number ["MasVnrType"] { "Brk" 1 "None" 0 "NA" -1}) (ds/categorical->number ["SaleCondition"] {"sale-1" 0 "Partial" 1 "NA" -1}) ;; ;;Auto convert the rest that are still string columns (ds/categorical->number cf/string))] (if (ds/has-column? initial-ds "SalePrice") (-> initial-ds (assoc "SalePriceDup" (initial-ds "SalePrice")) (ds/update-column "SalePrice" dfn/log1p) (ds-mod/set-inference-target "SalePrice")) initial-ds))) (deftest base-etl-test (let [src-dataset src-ds ;;For inference, we won't have the target but we will have everything else. inference-columns (c-set/difference (set (map ds-col/column-name (ds/columns src-dataset))) #{"SalePrice"}) inference-dataset (-> (ds/select src-dataset inference-columns (range 10)) (ds/rows {:nil-missing? true})) dataset (-> src-ds missing-pipeline string-and-math) post-pipeline-columns (c-set/difference inference-columns #{"Id"}) sane-dataset-for-flyweight (ds/select dataset post-pipeline-columns (range 10)) final-flyweight (-> sane-dataset-for-flyweight (ds/mapseq-reader))] (is (= [81 1460] (dtype/shape src-dataset))) (is (= [81 1460] (dtype/shape dataset))) (is (= 45 (ds/column-count (cf/categorical dataset)))) (is (= #{"MSSubClass" "OverallQual" "OverallCond"} (c-set/intersection #{"MSSubClass" "OverallQual" "OverallCond"} (set (ds/column-names (cf/categorical dataset)))))) (is (= 0 (ds/column-count (cf/string dataset)))) (is (= ["SalePrice"] (vec (ds/column-names (cf/target dataset))))) (is (= [] (vec (ds/column-names (cf/difference dataset (cf/numeric dataset)))))) (let [sale-price (ds/column dataset "SalePriceDup") sale-price-l1p (ds/column dataset "SalePrice") sp-stats (ds-col/stats sale-price [:mean :min :max]) sp1p-stats (ds-col/stats sale-price-l1p [:mean :min :max])] (is (dfn/equals (mapv sp-stats [:mean :min :max]) [180921.195890 34900 755000] 0.01)) (is (dfn/equals (mapv sp1p-stats [:mean :min :max]) [12.024 10.460 13.534] 0.01))) (is (= 10 (count inference-dataset))) (is (= 10 (count final-flyweight))) (let [pre-pipeline (map meta (ds/columns src-ds)) col-dtype-map (->> pre-pipeline (map (fn [{:keys [name datatype]}] [name datatype])) (into {})) exact-columns (ds/->dataset inference-dataset {:parser-fn col-dtype-map}) ;;Just checking that this works at all.. autoscan-columns (ds/->dataset inference-dataset {})] ;;And the definition of exact is... (is (every? #(= (dtype/get-datatype %) (get col-dtype-map (ds-col/column-name %))) (ds/columns exact-columns))) (let [inference-ds (-> exact-columns missing-pipeline string-and-math)] ;;spot check a few of the items (is (dfn/equals (dtype/->vector (ds/column sane-dataset-for-flyweight "MSSubClass")) (dtype/->vector (ds/column inference-ds "MSSubClass")))) ;;did categorical values get encoded identically? (is (dfn/equals (dtype/->vector (ds/column sane-dataset-for-flyweight "OverallQual")) (dtype/->vector (ds/column inference-ds "OverallQual")))))))) (defn full-ames-pt-1 [dataset] (ds/bind-> (missing-pipeline dataset) ds (ds/categorical->number ["Utilities"] [["NA" -1] "ELO" "NoSeWa" "NoSewr" "AllPub"]) (ds/categorical->number ["LandSlope"] ["Gtl" "Mod" "Sev" "NA"]) (ds/categorical->number ["ExterQual" "ExterCond" "BsmtQual" "BsmtCond" "HeatingQC" "KitchenQual" "FireplaceQu" "GarageQual" "GarageCond" "PoolQC"] ["Ex" "Gd" "TA" "Fa" "Po" "NA"]) (ds/assoc-metadata ["MSSubClass" "OverallQual" "OverallCond"] :categorical? true) (ds/update-column "MasVnrType" (fn [col] (map #(case % ("BrkCmn" "BrkFace" "CBlock" "Stone" ) "Brk" %) col))) (ds/update-column "SaleCondition" (fn [col] (map #(case % ("Abnorml" "Alloca" "AdjLand" "Family" "Normal" ) "sale-1" %) col))) (ds/categorical->number ["MasVnrType"] { "Brk" 1 "None" 0 "NA" -1}) (ds/categorical->number ["SaleCondition"] {"sale-1" 0 "Partial" 1 "NA" -1}) ;; ;;Auto convert the rest that are still string columns (ds/categorical->number cf/string) (ds/update-column "SalePrice" dfn/log1p) (ds-mod/set-inference-target "SalePrice") (assoc "OverallGrade" (dfn/* (ds "OverallQual") (ds "OverallCond"))) ;; Overall quality of the garage (assoc "GarageGrade" (dfn/* (ds "GarageQual") (ds "GarageCond"))) ;; Overall quality of the exterior (assoc "ExterGrade" (dfn/* (ds "ExterQual") (ds "ExterCond"))) ;; Overall kitchen score (assoc "KitchenScore" (dfn/* (ds "KitchenAbvGr") (ds "KitchenQual"))) ;; Overall fireplace score (assoc "FireplaceScore" (dfn/* (ds "Fireplaces") (ds "FireplaceQu"))) ;; Overall garage score (assoc "GarageScore" (dfn/* (ds "GarageArea") (ds "GarageQual"))) ;; Overall pool score (assoc "PoolScore" (dfn/* (ds "PoolArea") (ds "PoolQC"))) ;; Simplified overall quality of the house (assoc "SimplOverallGrade" (dfn/* (ds "OverallQual") (ds "OverallCond"))) ;; Simplified overall quality of the exterior (assoc "SimplExterGrade" (dfn/* (ds "ExterQual") (ds "ExterCond"))) ;; Simplified overall pool score (assoc "SimplPoolScore" (dfn/* (ds "PoolArea") (ds "PoolQC"))) ;; Simplified overall garage score (assoc "SimplGarageScore" (dfn/* (ds "GarageArea") (ds "GarageQual"))) ;; Simplified overall fireplace score (assoc "SimplFireplaceScore" (dfn/* (ds "Fireplaces") (ds "FireplaceQu"))) ;; Simplified overall kitchen score (assoc "SimplKitchenScore" (dfn/* (ds "KitchenAbvGr") (ds "KitchenQual"))) ;; Total number of bathrooms (assoc "TotalBath" (dfn/+ (ds "BsmtFullBath") (dfn/* 0.5 (ds "BsmtHalfBath")) (ds "FullBath") (dfn/* 0.5 (ds "HalfBath")))) ;; Total SF for house (incl. basement) (assoc "AllSF" (dfn/+ (ds "GrLivArea") (ds "TotalBsmtSF"))) ;; Total SF for 1st + 2nd floors (assoc "AllFlrsSF" (dfn/+ (ds "1stFlrSF") (ds "2ndFlrSF"))) ;; Total SF for porch (assoc "AllPorchSF" (dfn/+ (ds "OpenPorchSF") (ds "EnclosedPorch") (ds "3SsnPorch") (ds "ScreenPorch"))))) (def ames-top-columns ["SalePrice" "OverallQual" "AllSF" "AllFlrsSF" "GrLivArea" "GarageCars" "ExterQual" "TotalBath" "KitchenQual" "GarageArea" "ExterGrade"]) (defn full-ames-pt-2 [dataset] ;;Drop SalePrice column of course. (->> (rest ames-top-columns) (reduce (fn [dataset colname] (ds/bind-> dataset ds (assoc (str colname "-s2") (dfn/pow (ds colname) 2)) (assoc (str colname "-s3") (dfn/pow (ds colname) 3)) (assoc (str colname "-sqrt") (dfn/sqrt (ds colname))))) dataset))) (defn full-ames-pt-3 [dataset] (let [feature-ds (cf/difference dataset (cf/target dataset)) numeric-feature-ds (cf/difference feature-ds (cf/categorical feature-ds)) skew-fixed (ds/update-columnwise numeric-feature-ds skew-column-filter dfn/log1p) std-scale-fit (ds-math/fit-std-scale skew-fixed)] (merge dataset (ds-math/transform-std-scale skew-fixed std-scale-fit)))) (deftest full-ames-pipeline-test (let [dataset (full-ames-pt-1 src-ds)] (is (= ames-top-columns (->> (get (ds-math/correlation-table dataset :colname-seq ["SalePrice"]) "SalePrice") (take 11) (mapv first)))) (let [[n-cols n-rows] (dtype/shape src-ds) [n-new-cols n-new-rows] (-> (ds/filter-column src-ds "GrLivArea" #(< % 4000)) dtype/shape) num-over-the-line (->> (ds/column src-ds "GrLivArea") (dtype/->reader) (filter #(>= (int %) 4000)) count)] ;;Ensure our test isn't pointless. (is (not= 0 num-over-the-line)) (is (= n-new-rows (- n-rows num-over-the-line)))) (let [new-ds (assoc src-ds "SimplOverallQual" (dtype/emap {1 1 2 1 3 1 4 2 5 2 6 2 7 3 8 3 9 3 10 3} :int64 (src-ds "OverallQual")))] (is (= #{1 2 3} (->> (ds/column new-ds "SimplOverallQual") (ds-col/unique) (map int) set)))) (let [dataset (-> src-ds full-ames-pt-1 full-ames-pt-2) skewed-set (set (ds/column-names (skew-column-filter dataset)))] ;;This count seems rather high...a diff against the python stuff would be wise. (is (= 64 (count skewed-set))) (is (= 45 (ds/column-count (cf/categorical dataset)))) ;;Sale price cannot be in the set as it was explicitly removed. (is (not (contains? skewed-set "SalePrice")))))) (deftest ^:travis-broken full-ames-pipeline-pca (let [dataset (-> src-ds full-ames-pt-1 full-ames-pt-2 full-ames-pt-3) numeric-ds (cf/difference (cf/numeric dataset) (cf/union (cf/categorical dataset) (cf/target dataset))) std-set (set (ds/column-names numeric-ds)) mean-var-seq (->> std-set (map (comp #(ds-col/stats % [:mean :variance]) (partial ds/column dataset))))] ;;Are means 0? (is (dfn/equals (mapv :mean mean-var-seq) (vec (repeat (count mean-var-seq) 0)) 0.001)) (let [cat-ds (cf/categorical dataset) pca-fit (ds-nean/fit-pca numeric-ds {:n-components 10}) pca-ds (ds-nean/transform-pca numeric-ds pca-fit)] (is (= 127 (ds/column-count dataset))) (is (= 45 (ds/column-count cat-ds))) (is (= 10 (count (ds/columns pca-ds))))))) (deftest tostring-regression (is (string? (.toString ^Object src-ds)))) (deftest desc-stats-and-correlation [] (let [stats-data (ds/descriptive-stats src-ds) corr-data (ds-math/correlation-table src-ds :colname-seq ["SalePrice"])] (is (= #{:min :n-missing :col-name :mean :datatype :skew :mode :standard-deviation :n-valid :max :first :last} (set (ds/column-names stats-data)))) (is (= 35 (->> corr-data first second count))))) (deftest nippyfreezethaw (let [ds src-ds data (ds/dataset->data ds) thawed (ds/data->dataset data)] (is (= (ds/row-count ds) (count (mapv #(into [] %) (ds/rowvecs thawed)))))))