diff --git a/.gitignore b/.gitignore index 4a1c483..1fedb5d 100644 --- a/.gitignore +++ b/.gitignore @@ -19,4 +19,5 @@ examples/.ipynb_checkpoints *.log .classpath .project -.settings \ No newline at end of file +.settings +resources/docs.md diff --git a/README.md b/README.md index 384385f..4900a38 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Dataframes in Clojure. Through [pandas](https://github.com/pandas-dev/pandas). O ## Disclaimer -This is very alpha, things will change fast, will break and the API is neither complete, nor settled. Since a few people have started playing with this there's a Clojars project available. Please give feedback if you're using this, every kind of contribution is appreciated (for more info check the [Contributing](#contributing) section). At the moment everything is mostly undocumented and untested, I'm currently adding them. +This is alpha, things will change fast, will break and the API is neither complete, nor settled. Since a few people have started playing with this there's a Clojars project available. Please give feedback if you're using this, every kind of contribution is appreciated (for more info check the [Contributing](#contributing) section). At the moment everything is mostly undocumented and untested, I'm currently adding them. [![Clojars Project](https://img.shields.io/clojars/v/panthera.svg)](https://clojars.org/panthera) @@ -20,60 +20,17 @@ This is very alpha, things will change fast, will break and the API is neither c **Panthera** uses the great [libpython-clj](https://github.com/cnuernber/libpython-clj) as a backend to access Python and get [pandas](https://github.com/pandas-dev/pandas) and [numpy](https://github.com/numpy/numpy) functionality. -### System level - -If you usually don't develop in Python then a system level install might be a good solution (though always discouraged), if this is your case then follow the subsequent steps. 
- -To get started you need python, pandas and numpy (the latter comes with the former) on your path. Usually a: - -```bash -sudo apt install libpython3.6-dev -pip3 install numpy pandas xlrd # the latter is for Excel files, if you don't care you can do without -``` - -### Environments - -If you want to have different Python environments, then getting **panthera** to work correctly is a bit more tricky. - -First create your new environment with at least python=3.6, numpy and pandas. (This was tested both on GNU/Linux and WSL with [conda](https://docs.conda.io/projects/conda/en/latest/), but there's no reason why it shouldn't work with other env management tools. On other systems, [Docker is your best bet](https://github.com/scicloj/docker-hub/tree/master/panthera)): - -```bash -conda create -n panthera python=3.6 numpy pandas -``` - -Then check the path to the newly created environment: - -```bash -conda activate panthera -which python -``` - -Now you just have to add to one of your profiles the path to the wanted python executable: - -```bash -{:dev {:resource-paths ["/home/user/miniconda3/envs/panthera"]}} -``` - -You can create different profiles with different paths according to what you need. Now if you want to make it possible to work with **panthera** without having to activate your environments you have 2 choices: - -- assign `PYTHONHOME` env variable to your environment - -```bash -PYTHONHOME="/home/user/miniconda3/envs/panthera" lein whatever -``` - -- assign `PYTHONHOME` env variable before requiring **panthera** - -```bash -(System/setProperty "PYTHONHOME" "/home/user/miniconda3/envs/panthera") -``` +### N.B.: check [libpython-clj](https://github.com/cnuernber/libpython-clj) repo on how to install and start a Clojure/Python session. 
### The actual code After this you can start playing around with **panthera** ```clojure -(require '[panthera.panthera :as pt]) +(require '[panthera.panthera :as pt] + '[libpython-clj.python :refer [initialize!]]) + +(initialize!) (-> (pt/read-csv "mycsv.csv") (pt/subset-cols "Col1" "Col2" "Col3") @@ -86,33 +43,7 @@ The above chain will read your csv file as a DataFrame, select only the given co ## Numpy -All of Numpy is wrapped and accessible through a single interface from `panthera.numpy`. - -```clojure -(require '[panthera.numpy :refer [npy doc]]) - -(npy :power {:args [[1 2 3] 3]}) -;=> [1 8 27] - -(npy :power) -; This arity returns the actual numpy object that can be passed around to other functions as an argument -``` - -To access functions inside submodules pass to `npy` a sequence of keys leading to the wanted function: - -```clojure -(npy [:linalg :svd] {:args [[1 2 3] [4 5 6]]}) -``` - -You can check the original docstring for every module and function with the `doc` helper - -```clojure -(doc :power) - -(doc [:linalg :eigh]) -``` - -To see what is available and how everything works check the [official docs](https://docs.scipy.org/doc/numpy/reference/) online. +All of Numpy is accessible through [libpython-clj](https://github.com/cnuernber/libpython-clj) interop; check the repo for more info. ## Contributing @@ -142,8 +73,8 @@ Panthera doesn't pretend to be a clever wordplay because it doesn't need to. Fir ## License -Copyright © 2019 Alan Marazzi +Copyright © 2020 Alan Marazzi This program and the accompanying materials are made available under the terms of the Eclipse Public License 2.0 which is available at -http://www.eclipse.org/legal/epl-2.0. +http://www.eclipse.org/legal/epl-2.0. 
\ No newline at end of file diff --git a/examples/panthera-intro.ipynb b/examples/panthera-intro.ipynb index 919c5bd..6f6a4b2 100644 --- a/examples/panthera-intro.ipynb +++ b/examples/panthera-intro.ipynb @@ -71,20 +71,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "nil" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "(require '[panthera.panthera :as pt])" ] @@ -3913,9 +3902,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Clojure (clojupyter-v0.2.2)", + "display_name": "Lein-Clojure", "language": "clojure", - "name": "clojupyter" + "name": "lein-clojure" }, "language_info": { "file_extension": ".clj", diff --git a/project.clj b/project.clj index 0a75a1d..67451db 100644 --- a/project.clj +++ b/project.clj @@ -1,11 +1,11 @@ -(defproject panthera "0.1-alpha.16" - :description "Data Frames in Clojure (with Pandas) + NumPy" - :url "https://github.com/alanmarazzi/panthera" - :scm {:name "git" :url "https://github.com/alanmarazzi/panthera"} - :license {:name "EPL-2.0" - :url "https://www.eclipse.org/legal/epl-2.0/"} - :dependencies [[cnuernber/libpython-clj "1.0"] - [org.clojure/core.memoize "0.7.2"]] - :profiles {:dev {:dependencies [[org.clojure/clojure "1.10.1"]] - :plugins [[refactor-nrepl "2.4.0"] - [cider/cider-nrepl "0.18.0"]]}}) +(defproject panthera "0.1-alpha.17" + :description "Data Frames in Clojure (with Pandas) + NumPy" + :url "https://github.com/alanmarazzi/panthera" + :scm {:name "git" :url "https://github.com/alanmarazzi/panthera"} + :license {:name "EPL-2.0" + :url "https://www.eclipse.org/legal/epl-2.0/"} + :dependencies [[cnuernber/libpython-clj "1.32"] + [org.clojure/core.memoize "0.7.2"]] + :profiles {:dev {:dependencies [[org.clojure/clojure "1.10.1"]] + :plugins [[lein-jupyter "0.1.16"]]}}) + diff --git a/src/panthera/numpy.clj b/src/panthera/numpy.clj index 
ead1931..d9fdbae 100644 --- a/src/panthera/numpy.clj +++ b/src/panthera/numpy.clj @@ -1,133 +1,133 @@ -(ns panthera.numpy - (:require - [libpython-clj.python :as py] - [panthera.pandas.utils :as u])) - -(defonce numpy (py/import-module "numpy")) - -(defn py-get-in - "A similar to `get-in` implementation for Python modules, - classes and functions." - [py-module v] - (let [mods (drop-last v)] - ((apply comp - (reverse - (map (fn [x] #(py/get-attr % x)) mods))) py-module))) - -(defn doc - "Use this to see modules and functions original docstrings. - - **Examples** - - ``` - (doc :power) - - (doc :linalg) - - (doc [:linalg :svd]) - ```" - [ks] - (if (seqable? ks) - (println - (py/get-attr - (py/get-attr - (py-get-in numpy ks) - (last ks)) - "__doc__")) - (println (py/get-attr (py/get-attr numpy ks) "__doc__")))) - -(defn module - [py-module] - (fn [x] - (fn - ([] - (if (seqable? x) - (let [ks (map u/memo-key-converter x)] - (py/get-attr (py-get-in py-module ks) (last ks))) - (py/get-attr py-module (u/memo-key-converter x)))) - ([attrs] - (if (seqable? x) - (let [ks (map u/memo-key-converter x)] - (py/call-attr-kw (py-get-in py-module ks) (last ks) - (vec (:args attrs)) - (u/keys->pyargs (dissoc attrs :args)))) - (py/call-attr-kw py-module (u/memo-key-converter x) - (vec (:args attrs)) - (u/keys->pyargs (dissoc attrs :args)))))))) - -(defn npy - "General method to access Numpy functions and attributes. - - By calling `(npy k)` you get either the value associated with that attribute - (such as `(npy :nan)`) or the native Python function associated with that key. - This is useful to pass functions around to other methods. - - By calling `(npy k {:args [my-args] :other-arg 2})` you're calling that method - with the given arguments. `:args` is a conveniency argument to pass positional - arguments to functions in the same order as you'd pass them to Numpy. - This is because many Numpy functions have native C implementations that - accept only positional arguments. 
- - For example `(npy :power {:args [[1 2] 2]})` will give back as a result - `[1 4]` because we square (second element of `:args`) all the elements in the - given `Iterable` (first element of `:args`) - - - If you need to access a function in a submodule just pass a sequence of keys - to `npy`, such as `(npy [:linalg :svd])`. The functioning of this is the same - as above, but you'll be acting on the `:svd` function inside the `:linalg` - submodule." - ([k] (((module numpy) k))) - ([k attrs] (((module numpy) k) attrs))) - - -(comment - "An example on how to wrap another Python library, in this case scikit-learn" - - ; sklearn architecture is very convoluted, modules aren't loaded by default - ; but only by explicit import. So we import everything as below - (py/run-simple-string "from sklearn import *") - (defonce sk (py/import-module "sklearn")) - - (defn sklearn - ([k] ((module sk) k)) - ([k args] (((module sk) k) args))) - - (def pokemon (pt/read-csv "resources/pokemon.csv")) - - (def split (sklearn [:model_selection :train_test_split] - {:args [(pt/subset-cols pokemon - "HP" "Attack" - "Defense" "Sp. Atk" - "Sp. 
Def" "Speed") - (pt/subset-cols pokemon "Legendary")] - :test_size 0.3})) - - (defn train-test - [split k] - ((k {:x-train first - :x-test second - :y-train #(% 2) - :y-test last}) split)) - - (def logistic (sklearn [:linear_model :LogisticRegression] - {:n_jobs -1 :solver "lbfgs"})) - - (defn fit - [model x y] - (py/call-attr model "fit" x y)) - - (def model (fit logistic (train-test split :x-train) - (train-test split :y-train))) - - (defn predict - [model x] - (py/call-attr model "predict" x)) - - (predict model (train-test split :x-test)) - - (defn score - [model x y] - (py/call-attr model "score" x y)) - - (score model (train-test split :x-test) (train-test split :y-test))) +(ns panthera.numpy + (:require + [libpython-clj.python :as py] + [panthera.pandas.utils :as u])) + +(defonce numpy (py/import-module "numpy")) + +(defn py-get-in + "A similar to `get-in` implementation for Python modules, + classes and functions." + [py-module v] + (let [mods (drop-last v)] + ((apply comp + (reverse + (map (fn [x] #(py/get-attr % x)) mods))) py-module))) + +(defn doc + "Use this to see modules and functions original docstrings. + + **Examples** + + ``` + (doc :power) + + (doc :linalg) + + (doc [:linalg :svd]) + ```" + [ks] + (if (seqable? ks) + (println + (py/get-attr + (py/get-attr + (py-get-in numpy ks) + (last ks)) + "__doc__")) + (println (py/get-attr (py/get-attr numpy ks) "__doc__")))) + +(defn module + [py-module] + (fn [x] + (fn + ([] + (if (seqable? x) + (let [ks (map u/memo-key-converter x)] + (py/get-attr (py-get-in py-module ks) (last ks))) + (py/get-attr py-module (u/memo-key-converter x)))) + ([attrs] + (if (seqable? 
x) + (let [ks (map u/memo-key-converter x)] + (py/call-attr-kw (py-get-in py-module ks) (last ks) + (vec (:args attrs)) + (u/keys->pyargs (dissoc attrs :args)))) + (py/call-attr-kw py-module (u/memo-key-converter x) + (vec (:args attrs)) + (u/keys->pyargs (dissoc attrs :args)))))))) + +(defn npy + "General method to access Numpy functions and attributes. + + By calling `(npy k)` you get either the value associated with that attribute + (such as `(npy :nan)`) or the native Python function associated with that key. + This is useful to pass functions around to other methods. + + By calling `(npy k {:args [my-args] :other-arg 2})` you're calling that method + with the given arguments. `:args` is a conveniency argument to pass positional + arguments to functions in the same order as you'd pass them to Numpy. + This is because many Numpy functions have native C implementations that + accept only positional arguments. + + For example `(npy :power {:args [[1 2] 2]})` will give back as a result + `[1 4]` because we square (second element of `:args`) all the elements in the + given `Iterable` (first element of `:args`) + + + If you need to access a function in a submodule just pass a sequence of keys + to `npy`, such as `(npy [:linalg :svd])`. The functioning of this is the same + as above, but you'll be acting on the `:svd` function inside the `:linalg` + submodule." + ([k] (((module numpy) k))) + ([k attrs] (((module numpy) k) attrs))) + + +(comment + "An example on how to wrap another Python library, in this case scikit-learn" + + ; sklearn architecture is very convoluted, modules aren't loaded by default + ; but only by explicit import. 
So we import everything as below + (py/run-simple-string "from sklearn import *") + (defonce sk (py/import-module "sklearn")) + + (defn sklearn + ([k] ((module sk) k)) + ([k args] (((module sk) k) args))) + + (def pokemon (pt/read-csv "resources/pokemon.csv")) + + (def split (sklearn [:model_selection :train_test_split] + {:args [(pt/subset-cols pokemon + "HP" "Attack" + "Defense" "Sp. Atk" + "Sp. Def" "Speed") + (pt/subset-cols pokemon "Legendary")] + :test_size 0.3})) + + (defn train-test + [split k] + ((k {:x-train first + :x-test second + :y-train #(% 2) + :y-test last}) split)) + + (def logistic (sklearn [:linear_model :LogisticRegression] + {:n_jobs -1 :solver "lbfgs"})) + + (defn fit + [model x y] + (py/call-attr model "fit" x y)) + + (def model (fit logistic (train-test split :x-train) + (train-test split :y-train))) + + (defn predict + [model x] + (py/call-attr model "predict" x)) + + (predict model (train-test split :x-test)) + + (defn score + [model x y] + (py/call-attr model "score" x y)) + + (score model (train-test split :x-test) (train-test split :y-test))) diff --git a/src/panthera/pandas/generics.clj b/src/panthera/pandas/generics.clj index 2833284..f9237ae 100644 --- a/src/panthera/pandas/generics.clj +++ b/src/panthera/pandas/generics.clj @@ -814,8 +814,8 @@ "value_counts" (dissoc attrs :clj))] (if (:clj attrs) - (zipmap - (map u/memo-columns-converter (vec (index v))) + (zipmap + (map u/memo-columns-converter (vec (index v))) (vec v)) v)) (recur (series seq-or-srs) [attrs]))) @@ -862,7 +862,7 @@ (to-csv \"mycsv.csv\" {:sep \";\" :index false}) ```" [df-or-srs filename & [attrs]] - (u/simple-kw-call df-or-srs "to_csv" attrs)) + (u/kw-call df-or-srs "to_csv" filename attrs)) (defn reset-index "Reset the index or part of it. 
This replaces the current index @@ -1195,3 +1195,16 @@ ```" [df-or-srs i j] (u/simple-kw-call df-or-srs "swaplevel" [] {"i" i "j" j})) + +(defn factorize + "Encode the values of `seq-or-srs` as integer codes plus their unique values (wraps pandas `factorize`)." + [seq-or-srs & [attrs]] + (u/kw-call u/pd "factorize" seq-or-srs attrs)) + +(defn rename + [df-or-srs & [attrs]] + (u/simple-kw-call df-or-srs "rename" attrs)) + +(defn to-excel + [df-or-srs filename & [attrs]] + (u/kw-call df-or-srs "to_excel" filename attrs)) diff --git a/src/panthera/pandas/math.clj b/src/panthera/pandas/math.clj index b6a05cd..a511878 100644 --- a/src/panthera/pandas/math.clj +++ b/src/panthera/pandas/math.clj @@ -1,229 +1,232 @@ -(ns panthera.pandas.math - (:refer-clojure - :exclude [mod]) - (:require - [libpython-clj.python :as py] - [panthera.pandas.utils :as u])) - -(defn- base-math - [k] - (fn [& args] - (reduce - #(py/call-attr - %1 - ({:+ "__add__" - :- "__sub__" - :* "__mul__" - :div "__div__" - :fld "__floordiv__" - :mod "__mod__" - :** "__pow__" - :< "__lt__" - :> "__gt__" - :<= "__le__" - :>= "__ge__" - :!= "__ne__" - := "__eq__" - :dot "__matmul__"} k) - %2) args))) - -(defn ops - [df-or-srs other op & [attrs]] - (u/kw-call - df-or-srs - ({:+ "__add__" - :- "__sub__" - :* "__mul__" - :div "__div__" - :fld "__floordiv__" - :mod "__mod__" - :** "__pow__" - :< "__lt__" - :> "__gt__" - :<= "__le__" - :>= "__ge__" - :!= "__ne__" - := "__eq__" - :dot "__matmul__"} op) - other - attrs)) - -(def add - (base-math :+)) - -(def sub - (base-math :-)) - -(def mul - (base-math :*)) - -(def div - (base-math :div)) - -(def floor-div - (base-math :fld)) - -(def mod - (base-math :mod)) - -(def pow - (base-math :**)) - -(def lt - (base-math :<)) - -(def gt - (base-math :>)) - -(def le - (base-math :<=)) - -(def ge - (base-math :>=)) - -(def eq - (base-math :=)) - -(def ne - (base-math :!=)) - -(def dot - (base-math :dot)) - -(defn abs - [df-or-srs] - (py/call-attr df-or-srs "abs")) - 
-(defn autocorr - [srs & [lag]] - (py/call-attr srs "autocorr" (or lag 1))) - -(defn between - ([srs left 
right] - (py/call-attr srs "between" left right)) - ([srs left right inclusive] - (py/call-attr srs "between" left right inclusive))) - -(defn clip - [df-or-srs & [attrs]] - (u/simple-kw-call df-or-srs "clip" attrs)) - -(defn corr - [df-or-srs & args] - (if (= :data-frame (u/pytype df-or-srs)) - (u/simple-kw-call df-or-srs "corr" (first args)) - (u/kw-call df-or-srs "corr" (first args) (second args)))) - -(defn cnt - [df-or-srs & [attrs]] - (u/simple-kw-call df-or-srs "count" attrs)) - -(defn cov - [df-or-srs & args] - (if (= :data-frame (u/pytype df-or-srs)) - (u/simple-kw-call df-or-srs "cov" (first args)) - (u/kw-call df-or-srs "cov" (first args) (second args)))) - -(defn- base-cumulative - [k] - (fn [df-or-srs & [attrs]] - (u/simple-kw-call df-or-srs - ({:max "cummax" - :min "cummin" - :prod "cumprod" - :sum "cumsum" - :diff "diff" - :cmpnd "compound"} k) - attrs))) - -(def cummax - (base-cumulative :max)) - -(def cummin - (base-cumulative :min)) - -(def cumprod - (base-cumulative :prod)) - -(def cumsum - (base-cumulative :sum)) - -(def diff - (base-cumulative :diff)) - -(def compound - (base-cumulative :cmpnd)) - -(defn describe - [df-or-srs & [attrs]] - (u/simple-kw-call df-or-srs "describe" attrs)) - -(defn- other-ops - [k] - (fn [df-or-srs & [attrs]] - (u/simple-kw-call df-or-srs - ({:sum "sum" - :kurt "kurtosis" - :mad "mad" - :max "max" - :min "min" - :mean "mean" - :median "median" - :mode "mode" - :pct "pct_change" - :quant "quantile" - :rank "rank" - :round "round" - :sem "sem" - :skew "skew" - :std "std" - :var "var"} k) - attrs))) - -(def sum - (other-ops :sum)) - -(def kurtosis - (other-ops :kurt)) - -(def mean-abs-dev - (other-ops :mad)) - -(def maximum - (other-ops :max)) - -(def minimum - (other-ops :min)) - -(def mean - (other-ops :mean)) - -(def median - (other-ops :median)) - -(def mode - (other-ops :mode)) - -(def pct-change - (other-ops :pct)) - -(def quantile - (other-ops :quant)) - -(def rank - (other-ops :rank)) - -(def round - (other-ops 
:round)) - -(def sem - (other-ops :sem)) - -(def skew - (other-ops :skew)) - -(def std - (other-ops :std)) - -(def var - (other-ops :var)) +(ns panthera.pandas.math + (:refer-clojure + :exclude [mod]) + (:require + [libpython-clj.python :as py] + [panthera.pandas.utils :as u])) + +(defn same? + "This works differently than `eq`: the latter checks equality + value by value, `same?` checks that the given `series`es or `data-frame`s contain + the same exact values. This works even with missing values." + [left right] + (py/call-attr left :equals right)) + +(defn- base-math + [k] + (fn [& args] + (reduce + #(py/call-attr + %1 + ({:+ "__add__" + :- "__sub__" + :* "__mul__" + :div "__div__" + :fld "__floordiv__" + :mod "__mod__" + :** "__pow__" + :< "__lt__" + :> "__gt__" + :<= "__le__" + :>= "__ge__" + :!= "__ne__" + := "__eq__" + :dot "__matmul__"} k) + %2) args))) + +(defn ops + [df-or-srs other op & [attrs]] + (u/kw-call + df-or-srs + ({:+ "__add__" + :- "__sub__" + :* "__mul__" + :div "__div__" + :fld "__floordiv__" + :mod "__mod__" + :** "__pow__" + :< "__lt__" + :> "__gt__" + :<= "__le__" + :>= "__ge__" + :!= "__ne__" + := "__eq__" + :dot "__matmul__"} op) + other + attrs)) + +(def add + (base-math :+)) + +(def sub + (base-math :-)) + +(def mul + (base-math :*)) + +(def div + (base-math :div)) + +(def floor-div + (base-math :fld)) + +(def mod + (base-math :mod)) + +(def pow + (base-math :**)) + +(def lt + (base-math :<)) + +(def gt + (base-math :>)) + +(def le + (base-math :<=)) + +(def ge + (base-math :>=)) + +(def eq + (base-math :=)) + +(def ne + (base-math :!=)) + +(def dot + (base-math :dot)) + +(defn abs + [df-or-srs] + (py/call-attr df-or-srs "abs")) + +(defn autocorr + [srs & [lag]] + (py/call-attr srs "autocorr" (or lag 1))) + +(defn between + ([srs left right] + (py/call-attr srs "between" left right)) + ([srs left right inclusive] + (py/call-attr srs "between" left right inclusive))) + +(defn clip + [df-or-srs & [attrs]] + (u/simple-kw-call df-or-srs 
"clip" attrs)) + +(defn corr + [df-or-srs & args] + (if (= :data-frame (u/pytype df-or-srs)) + (u/simple-kw-call df-or-srs "corr" (first args)) + (u/kw-call df-or-srs "corr" (first args) (second args)))) + +(defn cnt + [df-or-srs & [attrs]] + (u/simple-kw-call df-or-srs "count" attrs)) + +(defn cov + [df-or-srs & args] + (if (= :data-frame (u/pytype df-or-srs)) + (u/simple-kw-call df-or-srs "cov" (first args)) + (u/kw-call df-or-srs "cov" (first args) (second args)))) + +(defn- base-cumulative + [k] + (fn [df-or-srs & [attrs]] + (u/simple-kw-call df-or-srs + ({:max "cummax" + :min "cummin" + :prod "cumprod" + :sum "cumsum" + :diff "diff"} k) + attrs))) + +(def cummax + (base-cumulative :max)) + +(def cummin + (base-cumulative :min)) + +(def cumprod + (base-cumulative :prod)) + +(def cumsum + (base-cumulative :sum)) + +(def diff + (base-cumulative :diff)) + +(defn describe + [df-or-srs & [attrs]] + (u/simple-kw-call df-or-srs "describe" attrs)) + +(defn- other-ops + [k] + (fn [df-or-srs & [attrs]] + (u/simple-kw-call df-or-srs + ({:sum "sum" + :kurt "kurtosis" + :mad "mad" + :max "max" + :min "min" + :mean "mean" + :median "median" + :mode "mode" + :pct "pct_change" + :quant "quantile" + :rank "rank" + :round "round" + :sem "sem" + :skew "skew" + :std "std" + :var "var"} k) + attrs))) + +(def sum + (other-ops :sum)) + +(def kurtosis + (other-ops :kurt)) + +(def mean-abs-dev + (other-ops :mad)) + +(def maximum + (other-ops :max)) + +(def minimum + (other-ops :min)) + +(def mean + (other-ops :mean)) + +(def median + (other-ops :median)) + +(def mode + (other-ops :mode)) + +(def pct-change + (other-ops :pct)) + +(def quantile + (other-ops :quant)) + +(def rank + (other-ops :rank)) + +(def round + (other-ops :round)) + +(def sem + (other-ops :sem)) + +(def skew + (other-ops :skew)) + +(def std + (other-ops :std)) + +(def var + (other-ops :var)) diff --git a/src/panthera/pandas/reshape.clj b/src/panthera/pandas/reshape.clj index 04f22fd..1234015 100644 --- 
a/src/panthera/pandas/reshape.clj +++ b/src/panthera/pandas/reshape.clj @@ -1,83 +1,1010 @@ -(ns panthera.pandas.reshape - (:require - [libpython-clj.python :as py] - [panthera.pandas.utils :as u])) - -(defn crosstab - [df-or-srs & [attrs]] - (u/kw-call u/pd "crosstab" df-or-srs attrs)) - -(defn pivot - [df & [attrs]] - (u/simple-kw-call df "pivot" attrs)) - -(defn cut - [data-or-srs bins & [attrs]] - (py/call-attr-kw u/pd "cut" [data-or-srs bins] - (u/keys->pyargs attrs))) - -(defn qcut - [data-or-srs q & [attrs]] - (py/call-attr-kw u/pd "qcut" [data-or-srs q] - (u/keys->pyargs attrs))) - -(defn merge-ordered - [left right & [attrs]] - (py/call-attr-kw u/pd "merge_ordered" [left right] - (u/keys->pyargs attrs))) - -(defn merge-asof - [left right & [attrs]] - (py/call-attr-kw u/pd "merge_asof" [left right] - (u/keys->pyargs attrs))) - -(defn concatenate - [dfs-or-srss & [attrs]] - (u/kw-call u/pd "concat" dfs-or-srss attrs)) - -(defn factorize - [seq-or-srs & [attrs]] - (u/kw-call u/pd "factorize" seq-or-srs attrs)) - -(defn aggregate - [df-or-srs how & [attrs]] - (u/kw-call df-or-srs "agg" how attrs)) - -(defn remap - [df-or-srs mappings & [na-action]] - (py/call-attr df-or-srs "map" mappings (or na-action nil))) - -(defn groupby - [df-or-srs by & [attrs]] - (u/kw-call df-or-srs "groupby" by attrs)) - -(defn rolling - [df-or-srs window & [attrs]] - (u/kw-call df-or-srs "rolling" window attrs)) - -(defn ewm - [df-or-srs & [attrs]] - (u/simple-kw-call df-or-srs "ewm" attrs)) - -; remove :inplace as an attr -(defn dropna - [df-or-srs & [attrs]] - (u/simple-kw-call df-or-srs "dropna" attrs)) - -(defn melt - [df & [attrs]] - (u/simple-kw-call df "melt" attrs)) - -(defn assign - [df-or-srs cols] - (u/simple-kw-call df-or-srs "assign" - (u/keys->pyargs cols))) - -(defn unstack - [df-or-srs & [attrs]] - (u/simple-kw-call df-or-srs "unstack" attrs)) - -(defn transpose - "Transpose the given panthera object" - [df-or-srs] - (py/call-attr df-or-srs "transpose")) \ No newline 
at end of file +(ns panthera.pandas.reshape + (:refer-clojure + :exclude [drop]) + (:require + [libpython-clj.python :as py] + [panthera.pandas.utils :as u] + [panthera.pandas.generics :as g])) + +(defn crosstab + "Compute a cross tabulation of two (or more) factors. By default + computes a frequency table of the factors unless an array of values and an + aggregation function are passed. + + **Arguments** + + - `seq-or-srs` -> seqable, `series` + + **Attrs** + + - `:columns` -> Iterable, `series`, Iterable of Iter/srs: values to group by + - `:values` -> Iterable, `series`, Iterable of Iter/srs: values to group + according to factors, requires `:aggfunc` + - `:rownames` -> Iterable, `series`: the names of `seq-or-srs` + - `:colnames` -> Iterable, `series`: the names of `:columns` + - `:aggfunc` -> function, keyword, str: the aggregation function, requires + `:values`. It can be a panthera function (`sum`), a numpy function (`(npy :sum)`), + the name of a numpy function (`:mean` or \"mean\") or a Clojure function. In the + latter case be aware that you have to reduce over a map. 
+ - `:margins` -> bool, default `false`: add subtotals + - `:margins-name` -> str, default \"All\": name of the row/column holding totals + when `:margins` is true + - `:dropna` -> bool, default `true`: exclude columns with all missing values + - `:normalize` -> bool, {`:all` `:index` `:columns`}, {0 1}, default `false`: + normalize by dividing all values by the sum of values + + **Examples** + + ``` + (crosstab [[1 2 2]] {:columns [[:a :b :a]]}) + ;; col_0 a b + ;; row_0 + ;; 1 1 0 + ;; 2 1 1 + + (crosstab [[1 2 2]] {:columns [[:a :b :a]] + :rownames [:myrows] + :colnames [:mycols]}) + ;; mycols a b + ;; myrows + ;; 1 1 0 + ;; 2 1 1 + + (crosstab [[1 2 2]] {:columns [[:a :b :b]] + :values [10 20 30] + :aggfunc :mean}) + ;; col_0 a b + ;; row_0 + ;; 1 10.0 NaN + ;; 2 NaN 25.0 + + (crosstab [[1 2 2]] {:columns [[:a :b :a]] + :margins true}) + ;; col_0 a b All + ;; row_0 + ;; 1 1 0 1 + ;; 2 1 1 2 + ;; All 2 1 3 + ``` + " + [seq-or-srs & [{:keys [columns values rownames colnames aggfunc + margins margins-name dropna normalize] + :as attrs}]] + (u/kw-call u/pd "crosstab" seq-or-srs attrs)) + +(defn pivot + "Returns a reshaped `data-frame`: basically changes it from long format to wide. + + **Arguments** + + - `df` -> `data-frame` + + **Attrs** + + - `:index` -> str, keyword, default `nil`: the column to use as the new index. + When `nil` uses the current one + - `:columns` -> str, keyword: columns to use for the new `data-frame` + - `:values` -> str, keyword, Iterable, default `nil`: columns to use to populate + values. 
If `nil` all remaining columns will be used + + **Examples** + + ``` + (def df (data-frame {:foo [:one :one :one :two :two :two] + :bar [:a :b :c :a :b :c] + :baz [1 2 3 4 5 6] + :zoo [:x :y :z :q :w :t]})) + + (pivot df {:columns :bar :index :foo}) + ;; baz zoo + ;; bar a b c a b c + ;; foo + ;; one 1 2 3 x y z + ;; two 4 5 6 q w t + + (pivot df {:index :foo :columns :bar :values [:baz :zoo]}) + ;; baz zoo + ;; bar a b c a b c + ;; foo + ;; one 1 2 3 x y z + ;; two 4 5 6 q w t + ``` + " + [df & [{:keys [index columns values] + :as attrs}]] + (u/simple-kw-call df "pivot" attrs)) + +(defn cut + "Bin the given values into categories. + + Use this when you want to go from continuous values to ordered categories. For + example, you could go from age to age ranges. + + N.B.: `cut` converts your values to a [`Categorical`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Categorical.html#pandas.Categorical) type. This + means that you can choose whether you want a label back or just the new value. + + **Arguments** + + - `seq-or-srs` -> seqable or `series` + - `bins` -> int, Iterable, `series`: how to bin the data. If int defines the number + of equal-width bins, otherwise values are treated as bins edges + + **Attrs** + + - `:right` -> bool, default `true`: include the rightmost edge? + - `:labels` -> Iterable, bool: if Iterable, specifies the labels for the bins, + if false it doesn't return the labels, only the values (**N.B.: the suggestion + is to work with `{:labels false}` as much as possible, especially if you have to + convert things to Clojure at some point**) + - `:retbins` -> bool, default `false`: return bins? + - `:precision` -> int, default 3: the precision of the bins labels + - `:include-lowest` -> bool, default `false`: should the first interval be left-inclusive? 
+ - `:duplicates` -> {`:raise`, `:drop`, `nil`}: if bin edges are not unique, + raise error or drop non-uniques + + **Examples** + + ``` + (def s (series [1 7 5 4 6 3])) + + (cut s 3) + ;; 0 (0.994, 3.0] + ;; 1 (5.0, 7.0] + ;; 2 (3.0, 5.0] + ;; 3 (3.0, 5.0] + ;; 4 (5.0, 7.0] + ;; 5 (0.994, 3.0] + ;; dtype: category + ;; Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] < (5.0, 7.0]] + + (cut s [3 5 7]) + ;; 0 NaN + ;; 1 (5.0, 7.0] + ;; 2 (3.0, 5.0] + ;; 3 (3.0, 5.0] + ;; 4 (5.0, 7.0] + ;; 5 NaN + ;; dtype: category + ;; Categories (2, interval[int64]): [(3, 5] < (5, 7]] + + (cut s 3 {:labels false}) + ;; 0 0 + ;; 1 2 + ;; 2 1 + ;; 3 1 + ;; 4 2 + ;; 5 0 + ;; dtype: int64 + ``` + " + [seq-or-srs bins & [{:keys [right labels retbins precision + include-lowest duplicates] + :as attrs}]] + (py/call-attr-kw u/pd "cut" [seq-or-srs bins] + (u/keys->pyargs attrs))) + +(defn qcut + "Bin values into quantiles. + + The same as `cut`, but categories are quantiles. + + **Arguments** + + - `seq-or-srs` -> seqable or `series` + - `q` -> int, Iterable: either number of quantiles or Iterable of quantiles + + **Attrs** + + - `:labels` -> Iterable, bool: if Iterable, specifies the labels for the bins, + if false it doesn't return the labels, only the values (**N.B.: the suggestion + is to work with `{:labels false}` as much as possible, especially if you have to + convert things to Clojure at some point**) + - `:retbins` -> bool, default `false`: return bins? 
+ - `:precision` -> int, default 3: the precision of the bins labels + - `:duplicates` -> {`:raise`, `:drop`, `nil`}: if bin edges are not unique, + raise error or drop non-uniques + + **Examples** + + ``` + (qcut (range 5) 4) + ;; [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]] + ;; Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] < (2.0, 3.0] < (3.0, 4.0]] + + (qcut (range 5) 3 {:labels [:low :medium :high]}) + ;; [low, low, medium, high, high] + ;; Categories (3, object): [low < medium < high] + + (qcut (range 5) 3 {:labels false}) + ;; [0 0 1 2 2] + ``` + " + [seq-or-srs q & [{:keys [labels retbins precision duplicates] + :as attrs}]] + (py/call-attr-kw u/pd "qcut" [seq-or-srs q] + (u/keys->pyargs attrs))) + +(defn merge-ordered + "Merge two `data-frames` together, with facilities to deal with ordered data. + + **Arguments** + + - `left` -> `data-frame` + - `right` -> `data-frame` + + **Attrs** + + - `:on` -> str, keyword, Iterable: column names to be joined on. 
They must be the + same in both `left` and `right` + - `:left-on` -> str, keyword, Iterable, `series`: columns to join on the `left`, + use this if you have different columns names + - `:right-on` -> str, keyword, Iterable, `series`: columns to join on the `right`, + use this if you have different columns names + - `:left-by` -> str, keyword, Iterable, `series`: groupby `left` on the given + columns and then join piece by piece + - `:right-by` -> str, keyword, Iterable, `series`: groupby `right` on the given + columns and then join piece by piece + - `:fill-method` -> {`:ffill` `nil`}, default `nil`: forward fill missing data + - `:suffixes` -> Iterable, default [`:_x` `:_y`]: the suffixes to add to overlapping + column names + - `:how` -> {`:left` `:right` `:outer` `:inner`}, default `:outer`: kind of join + + **Examples** + ``` + (def A + (data-frame + {:key [:a :c :e :a] + :lvalue [1 2 3 1] + :group [:a :a :a :b]})) + + (def B + (data-frame + {:key [:b :c :d] + :rvalue [1 2 3]})) + + (merge-ordered A B) + ;; key lvalue group rvalue + ;; 0 a 1.0 a NaN + ;; 1 a 1.0 b NaN + ;; 2 b NaN NaN 1.0 + ;; 3 c 2.0 a 2.0 + ;; 4 d NaN NaN 3.0 + ;; 5 e 3.0 a NaN + + (merge-ordered A B {:fill-method :ffill}) + ;; key lvalue group rvalue + ;; 0 a 1 a NaN + ;; 1 a 1 b NaN + ;; 2 b 1 b 1.0 + ;; 3 c 2 a 2.0 + ;; 4 d 2 a 3.0 + ;; 5 e 3 a 3.0 + + (merge-ordered A B {:fill-method :ffill :left-by \"group\"}) + ;; key lvalue group rvalue + ;; 0 a 1 a NaN + ;; 1 b 1 a 1.0 + ;; 2 c 2 a 2.0 + ;; 3 d 2 a 3.0 + ;; 4 e 3 a 3.0 + ;; 5 a 1 b NaN + ;; 6 b 1 b 1.0 + ;; 7 c 1 b 2.0 + ;; 8 d 1 b 3.0 + + (merge-ordered A B {:left-on :lvalue :right-on :rvalue}) + ;; key_x lvalue group key_y rvalue + ;; 0 a 1 a b 1 + ;; 1 a 1 b b 1 + ;; 2 c 2 a c 2 + ;; 3 e 3 a d 3 + ``` + " + [left right & [{:keys [on left-on right-on left-by right-by + fill-method suffixes how] + :as attrs}]] + (py/call-attr-kw u/pd "merge_ordered" [left right] + (u/keys->pyargs attrs))) + +(defn merge-asof + "Similar to a left 
join, but merges on nearest key rather than equal. + + **Arguments** + + - `left` -> `data-frame`: sorted by key + - `right` -> `data-frame`: sorted by key + + **Attrs** + + - `:on` -> str, keyword: column name to join on. Must be in both `data-frames` and + it must be ordered and numeric (dates, int, etc) + - `:left-on` -> str, keyword: column name to join in left `data-frame`. The + requirements are the same as for `:on` + - `:right-on` -> str, keyword: column name to join in right `data-frame`. The + requirements are the same as for `:on` + - `:left-index` -> bool: index of left `data-frame` is the join key? + - `:right-index` -> bool: index of right `data-frame` is the join key? + - `:by` -> str, keyword, Iterable, `series`: match these columns before merging + - `:left-by` -> str, keyword, Iterable, `series`: as `:by` but only for left `data-frame` + - `:right-by` -> str, keyword, Iterable, `series`: as `:by` but only for right `data-frame` + - `:suffixes` -> Iterable: suffix to add to overlapping column names, must + have length 2 and the first one is `left` and second one is `right` + - `:tolerance` -> depends on key: the tolerance for merging + - `:allow-exact-matches` -> bool, default `true`: allow matching with same `:on` value? 
+ - `:direction` -> {`:backward` `:forward` `:nearest`}, default `:backward`: search for + prior, subsequent or closest matches + + **Examples** + + ``` + (def trades + (data-frame + {:time (->datetime [\"2016-05-25 13:30:00.023\" + \"2016-05-25 13:30:00.038\" + \"2016-05-25 13:30:00.048\" + \"2016-05-25 13:30:00.048\"]) + :ticker [:MSFT :MSFT :GOOG :AAPL] + :price [51.95 51.95 720.77 98.00] + :quantity [75 155 100 100]})) + + (def quotes + (data-frame + {:time (->datetime [\"2016-05-25 13:30:00.023\" + \"2016-05-25 13:30:00.023\" + \"2016-05-25 13:30:00.030\" + \"2016-05-25 13:30:00.048\" + \"2016-05-25 13:30:00.049\"]) + :ticker [:GOOG :MSFT :MSFT :GOOG :AAPL] + :bid [720.5 51.95 51.97 720.5 97.99] + :ask [720.93 51.96 51.98 720.93 98.01]})) + + (merge-asof trades quotes {:on :time}) + ;; time ticker_x price quantity ticker_y bid ask + ;; 0 2016-05-25 13:30:00.023 MSFT 51.95 75 MSFT 51.95 51.96 + ;; 1 2016-05-25 13:30:00.038 MSFT 51.95 155 MSFT 51.97 51.98 + ;; 2 2016-05-25 13:30:00.048 GOOG 720.77 100 GOOG 720.50 720.93 + ;; 3 2016-05-25 13:30:00.048 AAPL 98.00 100 GOOG 720.50 720.93 + + (merge-asof trades quotes {:on :time :allow-exact-matches false}) + ;; time ticker_x price quantity ticker_y bid ask + ;; 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN NaN + ;; 1 2016-05-25 13:30:00.038 MSFT 51.95 155 MSFT 51.97 51.98 + ;; 2 2016-05-25 13:30:00.048 GOOG 720.77 100 MSFT 51.97 51.98 + ;; 3 2016-05-25 13:30:00.048 AAPL 98.00 100 MSFT 51.97 51.98 + + (merge-asof trades quotes {:on :time :direction :forward}) + ;; time ticker_x price quantity ticker_y bid ask + ;; 0 2016-05-25 13:30:00.023 MSFT 51.95 75 GOOG 720.5 720.93 + ;; 1 2016-05-25 13:30:00.038 MSFT 51.95 155 GOOG 720.5 720.93 + ;; 2 2016-05-25 13:30:00.048 GOOG 720.77 100 GOOG 720.5 720.93 + ;; 3 2016-05-25 13:30:00.048 AAPL 98.00 100 GOOG 720.5 720.93 + + (merge-asof trades quotes {:on :time :by :ticker}) + ;; time ticker price quantity bid ask + ;; 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 + ;; 
1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 + ;; 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93 + ;; 3 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN + ``` + " + [left right & [{:keys [on left-on right-on left-index right-index by + left-by right-by suffixes tolerance + allow-exact-matches direction] + :as attrs}]] + (py/call-attr-kw u/pd "merge_asof" [left right] + (u/keys->pyargs attrs))) + +(defn concatenate + "Append `series`es and/or `data-frame`s along a wanted axis. + + **Arguments** + + - `dfs-or-srss` -> Iterable: a collection of multiple `series`/`data-frame` + + **Attrs** + + - `:axis` -> int, default 0: 0 = rows, 1 = columns + - `:join` -> {`:inner` `:outer`}, default `:outer`: the kind of join on other `:axis` + - `:ignore-index` -> bool, default `false`: whether to consider the index along + the wanted `:axis` + - `:keys` -> Iterable, default `nil`: this lets you build a hierarchical index + using the passed `:keys` as the outermost levels + - `:levels` -> Iterable, default `nil`: unique values for building a multi index + - `:names` -> Iterable, default `nil`: names of the levels in the hierarchical index + - `:verify-integrity` -> bool, default `false`: does the new `:axis` + contain duplicates? 
(P.S.: expensive operation) + - `:sort` -> bool, default `true`: sort the other `:axis` when `:join` is `:outer` + - `:copy` -> bool, default `true`: if `false` avoid copying when unnecessary + + **Examples** + + ``` + (concatenate [(series (range 3)) (series (range 3))]) + ;; 0 0 + ;; 1 1 + ;; 2 2 + ;; 0 0 + ;; 1 1 + ;; 2 2 + ;; dtype: int64 + + (concatenate [(series (range 3)) (series (range 3))] {:axis 1}) + ;; 0 1 + ;; 0 0 0 + ;; 1 1 1 + ;; 2 2 2 + + (concatenate [(data-frame {:a [1 2 3] :b [4 5 6]}) + (data-frame {:a [2 2 2] :b [3 3 3]})]) + ;; a b + ;; 0 1 4 + ;; 1 2 5 + ;; 2 3 6 + ;; 0 2 3 + ;; 1 2 3 + ;; 2 2 3 + + (concatenate [(data-frame {:a [1 2 3] :b [4 5 6]}) + (data-frame {:a [2 2 2] :b [3 3 3]})] + {:ignore-index true}) + ;; a b + ;; 0 1 4 + ;; 1 2 5 + ;; 2 3 6 + ;; 3 2 3 + ;; 4 2 3 + ;; 5 2 3 + ``` + " + [dfs-or-srss & [{:keys [axis join ignore-index keys levels + names verify-integrity sort copy] + :as attrs}]] + (u/kw-call u/pd "concat" dfs-or-srss attrs)) + +(defn aggregate + "Aggregate data using one or more functions over a given axis. + + This is very similar to `reduce`, but works on `data-frames` as well. + + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + - `how` -> keyword, str, function, Iterable: how to aggregate data. This accepts + either panthera functions strings/keywords, a list of the previous and/or user + defined functions. Check examples for more info. 
+ + **Attrs** + + - `:axis` -> {0 `:index` 1 `:columns`}, default 0: 0 = apply function along + cols; 1 = apply function along rows + - `fn-args` -> if the provided collapsing function needs arguments, just list + them freely (see examples) + + **Examples** + + ``` + (def a (data-frame + [[1, 2, 3] + [4, 5, 6] + [7, 8, 9] + [##NaN, ##NaN, ##NaN]] + {:columns [:A :B :C]})) + + (aggregate (series [1 2 3]) :sum) + ;; 6 + + (aggregate a [:sum :min]) + ;; A B C + ;; sum 12.0 15.0 18.0 + ;; min 1.0 2.0 3.0 + + ; if `how` needs arguments, you can pass them as `attrs` + (aggregate (series [1 2 3]) :cov {:other (series [4 5 6])}) + ;; 1.0 + + (aggregate (series [1 2 3]) inc) + ;; 0 2 + ;; 1 3 + ;; 2 4 + ;; dtype: int64 + ``` + " + [df-or-srs how & [{:keys [axis fn-args] :as attrs}]] + (u/kw-call df-or-srs "agg" how attrs)) + +(defn remap + "Remap values in a series. + + This is the same as using `map` on a sequence while using a map as the mapped + function: `(map {:a 1 :b 2} [:a :b]) => (1 2)` + + **Arguments** + + - `srs` -> `series` + - `mappings` -> map, function: the mapping correspondence + - `na-action` -> {`nil` `:ignore`}, default `nil`: `:ignore` doesn't pass missing + values to the `mappings` + + **Examples** + + ``` + (remap (series [:a :b :c]) {:a 1 :b 2 :c 3}) + ;; 0 1 + ;; 1 2 + ;; 2 3 + ;; dtype: int64 + + (remap (series [:a :b ##NaN]) #(str \"This is \" %)) + ;; 0 This is a + ;; 1 This is b + ;; 2 This is NaN + ;; dtype: object + + (remap (series [:a :b ##NaN]) #(str \"This is \" %) :ignore) + ;; 0 This is a + ;; 1 This is b + ;; 2 NaN + ;; dtype: object + ``` + " + [srs mappings & [na-action]] + (py/call-attr srs "map" mappings (or na-action nil))) + +(defn groupby + "Group `data-frame` or `series` by a given variable. + + Note that `groupby` does nothing by itself, this must be followed by another + operation like aggregation. 
+ + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + - `by` -> str, keyword, Iterable, map, function: it can be a column, a list of + columns, a function used to group the index, a collection of values to use as + grouping variable + + **Attrs** + + - `:axis` -> {0 `:index` 1 `:columns`}: split along columns or rows + - `:level` -> int, str, keyword, Iterable: if multiple index, group by this + or these + - `:as-index` -> bool, default `true`: when `false` the result is basically + like the output of a SQL group by + - `:sort` -> bool, default `true`: if `false` you get a performance improvement + - `:group-keys` -> bool, default `true`: add group keys to index when afterwards + you call `apply` + - `:squeeze` -> bool, default `false`: reduce dimensionality of the output if possible + - `:observed` -> bool, default `false`: this only applies to Categoricals: + if `true`, only show observed values for categorical groupers, + if `false`, show all values for categorical groupers + + **Examples** + + ``` + (def a (data-frame {:animal [:falcon :falcon :parrot :parrot] + :max-speed [380 370 24 26]})) + + (-> a (r/groupby :animal) m/mean) + ;; max-speed + ;; animal + ;; falcon 375 + ;; parrot 25 + + (-> a (r/groupby :animal {:as-index false}) m/mean) + ;; animal max-speed + ;; 0 falcon 375 + ;; 1 parrot 25 + ``` + " + [df-or-srs by & [{:keys [axis level as-index sort group-keys + squeeze observed] :as attrs}]] + (u/kw-call df-or-srs "groupby" by attrs)) + +(defn rolling + "Rolling window calculations + + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + - `window` -> int, str, keyword: the size of the window. If str or keyword then + this is considered as a time offset (e.g. :2s = 2 seconds, :30D = 30 days; + check this for more options https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases) + + **Attrs** + + - `:min-periods` -> int: minimum number of observations to have a value. 
For + times the default is 1, otherwise the default is `window` + - `:center` -> bool, default `false`: if `false` the result is set at the right + edge of the window, otherwise it gets centered + - `:win-type` -> str, keyword: refer to https://docs.scipy.org/doc/scipy/reference/signal.windows.html#module-scipy.signal.windows + - `:on`-> str, keyword: column to use for the rolling window, only in case this + is not the index + - `:axis` -> {0 `:index` 1 `:columns`}: split along columns or rows + - `:closed` -> {`:right` `:left` `:both` `:neither`}: where to make the interval + close + + **Examples** + ``` + (def a (data-frame {:b [0 1 2 3 4]} + {:index + (panthera.pandas.conversion/->datetime + (series + [\"20130101 09:00:00\" + \"20130101 09:00:02\" + \"20130101 09:00:03\" + \"20130101 09:00:05\" + \"20130101 09:00:06\"]))})) + + (sum (rolling a 2)) + ;; b + ;; 2013-01-01 09:00:00 NaN + ;; 2013-01-01 09:00:02 1.0 + ;; 2013-01-01 09:00:03 3.0 + ;; 2013-01-01 09:00:05 5.0 + ;; 2013-01-01 09:00:06 7.0 + + (sum (rolling a :2s)) + ;; b + ;; 2013-01-01 09:00:00 0.0 + ;; 2013-01-01 09:00:02 1.0 + ;; 2013-01-01 09:00:03 3.0 + ;; 2013-01-01 09:00:05 3.0 + ;; 2013-01-01 09:00:06 7.0 + + (sum (rolling a 2 {:win-type :triang})) + ;; b + ;; 2013-01-01 09:00:00 NaN + ;; 2013-01-01 09:00:02 0.5 + ;; 2013-01-01 09:00:03 1.5 + ;; 2013-01-01 09:00:05 2.5 + ;; 2013-01-01 09:00:06 3.5 + + (sum (rolling a 2 {:min-periods 1})) + ;; b + ;; 2013-01-01 09:00:00 0.0 + ;; 2013-01-01 09:00:02 1.0 + ;; 2013-01-01 09:00:03 3.0 + ;; 2013-01-01 09:00:05 5.0 + ;; 2013-01-01 09:00:06 7.0 + ``` + " + [df-or-srs window & [{:keys [min-periods center win-type on axis closed] + :as attrs}]] + (u/kw-call df-or-srs "rolling" window attrs)) + +(defn ewm + "Exponentially weighted functions. 
+ + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + + **Attrs** + + - `:com` -> numeric: decay in terms of center of mass + - `:span` -> numeric: decay in terms of span + - `:halflife` -> numeric: decay in terms of half-life + - `:alpha` -> numeric: smoothing factor + - `:min-periods` -> int, default 0: minimum number of observations + - `:adjust` -> bool, default `true`: divide by decaying adjustment factor + in beginning periods to account for imbalance in relative weightings + - `:ignore-na` -> bool, default `false`: ignore missing values + - `:axis` -> {0 `:index` 1 `:columns`}: use columns or rows + + **Examples** + + ``` + (def a (g/data-frame {:b [0 1 2 ##NaN 4]})) + + (-> a (ewm {:com 0.5}) mean) + ;; b + ;; 0 0.000000 + ;; 1 0.750000 + ;; 2 1.615385 + ;; 3 1.615385 + ;; 4 3.670213 + + (-> a (ewm {:span 3}) mean) + ;; b + ;; 0 0.000000 + ;; 1 0.666667 + ;; 2 1.428571 + ;; 3 1.428571 + ;; 4 3.217391 + + (-> a (ewm {:com 0.5 :ignore-na true}) mean) + ;; b + ;; 0 0.000000 + ;; 1 0.750000 + ;; 2 1.615385 + ;; 3 1.615385 + ;; 4 3.225000 + ``` + " + [df-or-srs & [{:keys [com span halflife min-periods adjust ignore-na axis] + :as attrs}]] + (u/simple-kw-call df-or-srs "ewm" attrs)) + +(defn drop + "Drop requested rows or columns. + + Remove rows or columns by specifying label names and corresponding axis, + or by specifying directly index or column names. When using a multi-index, + labels on different levels can be removed by specifying the level. 
+ + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + - `labels` -> keyword, str, numeric, Iterable: index or labels to drop + + **Attrs** + + - `:axis` -> int, default 0: 0 = rows, 1 = columns + - `:level` -> numeric, keyword, str: level to drop from multi index + - `:errors` -> {`:ignore` `:raise`}, default `:raise`: ignore or raise errors + + **Examples** + + ``` + (require-python '[numpy :as np]) + (def df + (data-frame + (np/reshape (np/arange 12) [3 4]) + {:columns [:A :B :C :D]})) + + (drop df [:B :C] {:axis 1}) + ;; A D + ;; 0 0 3 + ;; 1 4 7 + ;; 2 8 11 + + (drop df [0 1]) + ;; A B C D + ;; 2 8 9 10 11 + ``` + " + [df-or-srs labels & [{:keys [axis level errors] :as attrs}]] + (u/kw-call df-or-srs "drop" labels attrs)) + +(defn drop-rows + "A shorthand for `(drop df [0 2] {:axis 0})` + + See [[drop]] docs for more info" + [df rows & [{:keys [level errors] :as attrs}]] + (drop df rows (merge attrs {:axis 0}))) + +(defn drop-cols + "A shorthand for `(drop df [:A :C] {:axis 1})` + + See [[drop]] docs for more info" + [df cols & [{:keys [level errors] :as attrs}]] + (drop df cols (merge attrs {:axis 1}))) + +(defn dropna + "Drop missing values. + + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + + **Attrs** + + - `:axis` -> int, default 0: 0 = rows, 1 = columns + - `:how` -> {`:any` `:all`}, default `:any`: drop when there are `:any` missing + values, or `:all` missing values + - `:thresh` -> numeric: keep only rows/columns with at least `:thresh` non-missing values + - `:subset` -> Iterable: the subset to consider on opposite axis; e.g. 
if + you drop rows `:subset` are the columns to consider for dropping + + **Examples** + + ``` + (def df + (data-frame {:name [:Alfred :Batman :Robin] + :toy [nil :Batmobile :Whip] + :born [nil :1940-04-25 nil]})) + + (dropna df) + ;; name toy born + ;; 1 Batman Batmobile 1940-04-25 + ``` + " + [df-or-srs & [{:keys [axis how thresh subset] + :as attrs}]] + (u/simple-kw-call df-or-srs "dropna" attrs)) + +(defn melt + "Unpivot a `data-frame` from wide format to long format. + + Basically reshape the `data-frame` to have one row per observation and one + column per variable + + **Arguments** + + - `df` -> `data-frame` + + **Attrs** + + - `:id-vars` -> Iterable: columns to use as identifiers + - `:value-vars` -> Iterable: columns to melt (unpivot), if not specified uses + all the columns not in `:id-vars` + - `:var-name` -> keyword, str, default `:variable`: name for the variable column + - `:value-name` -> keyword, str, default `:value`: name for the value column + - `:col-level` -> numeric, str: the level to use for melting + + **Examples** + + ``` + (def df + (transpose + (data-frame [[:a :b :c] [1 3 5] [2 4 6]] + {:columns [0 1 2] + :index [:A :B :C]}))) + + (melt df) + ;; variable value + ;; 0 A a + ;; 1 A b + ;; 2 A c + ;; 3 B 1 + ;; 4 B 3 + ;; 5 B 5 + ;; 6 C 2 + ;; 7 C 4 + ;; 8 C 6 + + (melt df {:id-vars [:A] :value-vars [:B]}) + ;; A variable value + ;; 0 a B 1 + ;; 1 b B 3 + ;; 2 c B 5 + + (melt df {:id-vars [:A] :value-vars [:B :C]}) + ;; A variable value + ;; 0 a B 1 + ;; 1 b B 3 + ;; 2 c B 5 + ;; 3 a C 2 + ;; 4 b C 4 + ;; 5 c C 6 + ``` + " + [df & [{:keys [id-vars value-vars var-name + value-name col-level] :as attrs}]] + (u/simple-kw-call df "melt" attrs)) + +(defn assign + "Assign new columns to `df-or-srs` + + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + - `cols` -> map: either a map `{:col-name value}`, or a map `{:col-name fn}` + + **Examples** + + ``` + (def df + (transpose + (data-frame [[:a :b :c] [1 3 5] [2 4 6]] + {:columns [0 1 2] + 
:index [:A :B :C]}))) + + (assign df {:D 3}) + ;; A B C D + ;; 0 a 1 2 3 + ;; 1 b 3 4 3 + ;; 2 c 5 6 3 + + (assign df {:D [1 2 3]}) + ;; A B C D + ;; 0 a 1 2 1 + ;; 1 b 3 4 2 + ;; 2 c 5 6 3 + + (assign df {:D #(-> (subset-cols % :C) (mul 2))}) + ;; A B C D + ;; 0 a 1 2 4 + ;; 1 b 3 4 8 + ;; 2 c 5 6 12 + ``` + " + [df-or-srs cols] + (py/call-attr-kw df-or-srs "assign" [] cols)) + +(defn stack + "Stack the prescribed level(s) from columns to index. + + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + + **Attrs** + + - `:level` -> numeric, keyword, str, default -1: level to stack + - `:dropna` -> bool, default true: drop rows with missing values if generated + + **Examples** + + ``` + (def df + (data-frame [[0 1] [2 3]] + {:index [:cat :dog] + :columns [:weight :height]})) + + (stack df) + ;; cat weight 0 + ;; height 1 + ;; dog weight 2 + ;; height 3 + ;; dtype: int64 + ``` + " + [df-or-srs & [{:keys [level dropna] :as attrs}]] + (u/simple-kw-call df-or-srs "stack" attrs)) + +(defn unstack + "Pivot a level of the (necessarily hierarchical) index labels, + returning a DataFrame having a new level of column labels whose inner-most + level consists of the pivoted index labels. 
+ + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + + **Attrs** + + - `:level` -> numeric, keyword, str, default -1: level to unstack + - `:fill-value` -> any: replace missing values produced by `unstack` with this + + **Examples** + + ``` + (def s + (stack + (data-frame [[1 2] [3 4]] + {:index [:one :two] + :columns [:a :b]}))) + + (unstack s) + ;; a b + ;; one 1 2 + ;; two 3 4 + + (unstack s {:level 0}) + ;; one two + ;; a 1 3 + ;; b 2 4 + + (unstack (unstack s {:level 0})) + ;; one a 1 + ;; b 2 + ;; two a 3 + ;; b 4 + ;; dtype: int64 + ``` + " + [df-or-srs & [{:keys [level fill_value] :as attrs}]] + (u/simple-kw-call df-or-srs "unstack" attrs)) + +(defn transpose + "Transpose the given panthera object + + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + + **Examples** + + ``` + (def df (data-frame [[1 2 3] [4 5 6] [7 8 9]])) + + (transpose df) + ;; 0 1 2 + ;; 0 1 4 7 + ;; 1 2 5 8 + ;; 2 3 6 9 + ``` + " + [df-or-srs] + (py/get-attr df-or-srs "T")) diff --git a/src/panthera/pandas/utils.clj b/src/panthera/pandas/utils.clj index c35e932..9e32256 100644 --- a/src/panthera/pandas/utils.clj +++ b/src/panthera/pandas/utils.clj @@ -1,208 +1,305 @@ -(ns panthera.pandas.utils - (:require - [libpython-clj.python :as py] - [camel-snake-kebab.core :as csk] - [camel-snake-kebab.extras :as cske] - [clojure.core.memoize :as m])) - -(py/initialize!) - -(defonce builtins (py/import-module "builtins")) -(defonce pd (py/import-module "pandas")) - -(defn slice - "Returns a Python slice. This is what you'd get by doing something like - `1:10` and it is similar to `(range 1 10)`, but works with everything - not only numbers, so `(slice \"a\" \"f\")` would mean - [\"a\" \"b\" \"c\" \"d\" \"e\" \"f\"]. Use this for subsetting arrays, - serieses and data-frames. 
- - Example: - - ``` - (slice) ; the empty slice, it means every index - - (slice 5) ; every index up to 5 - - (slice 3 5) ; every index from 3 to 5 - - (slice \"2019-10-11\" \"2019-12-3\") ; works with dates as well - - (slice \"a\" \"d\") ; works with strings - - (slice 1 10 2) ; every 2 values between 1 and 10 - ```" - ([] - (py/call-attr builtins "slice" nil)) - ([start] - (py/call-attr builtins "slice" start)) - ([start stop] - (py/call-attr builtins "slice" start stop)) - ([start stop incr] - (py/call-attr builtins "slice" start stop incr))) - -(defn pytype - "Return the Python type of the given objects - - Examples: - - ``` - (pytype obj) - - (pytype my-df my-srs this) - ```" - ([] nil) - ([obj] - (py/python-type obj)) - ([obj & objs] - (map pytype (concat (vector obj) objs)))) - -(def memo-key-converter - "Convert regular Clojure kebab-case keys to idiomatic - Python snake_case strings. - - Example: - - ``` - (memo-key-converter :a-key) ; \"a_key\" - ```" - (m/fifo csk/->snake_case_string {} :fifo/threshold 512)) - -(def memo-columns-converter - "Converts Python strings to idiomatic Clojure keys. - - Examples: - - ``` - (memo-columns-converter \"a_name\") ; :a-name - - (memo-columns-converter \"ALL_CAPS\") ; :all-caps - ```" - (m/fifo - #(if (number? %) - % - (csk/->kebab-case-keyword %)) {} :fifo/threshold 512)) - -(defn vec->pylist - "Converts an iterable Clojure data structure to a Python list - - Example: - - ``` - (vec->pylist my-df) - ```" - [v] - (py/->py-list v)) - -(defn nested-vector? - "Check if the given argument is a nested vector or not. - - Example: - - ``` - (nested-vector? [[1 2] [3 4]]) - ```" - [v] - (some vector? v)) - -(defn nested-slice? - "Check if the given value contains at least one `:slice`. - - Example: - - ``` - (nested-slice? [(slice 3 5) (slice)]) - ```" - [v] - (some #(identical? :slice (pytype %)) v)) - -(defn vals->pylist - "Takes some values and dispatches them to the right conversion to a Python - data structure. 
- - Examples: - - ``` - (vals->pylist [1 2 3]) - - (vals->pylist [[1 2] [3 4]]) - - (vals->pylist [(slice 1 5) (slice)]) - ```" - [obj] - (cond - (not (coll? obj)) obj - (map? obj) obj - (nested-vector? obj) (to-array-2d obj) - (vector? obj) (if (nested-slice? obj) - obj - (py/->py-list obj)) - :else obj)) - -(defn keys->pyargs - "Takes a map as an argument and converts keys to Python strings - and values to the proper data structure. - - Examples: - - ``` - (keys->pyargs {:a 1 :a-key [1 2 3] \"c\" (slice)}) - ```" - [m] - (let [nm (reduce-kv - (fn [m k v] - (assoc m k (vals->pylist v))) - {} m)] - (cske/transform-keys memo-key-converter nm))) - -(defn series? - "Check if the given argument is a series" - [obj] - (identical? :series (pytype obj))) - -(defn data-frame? - "Check if the given argument is a data-frame" - [obj] - (identical? :data-frame (pytype obj))) - -(defn ->clj - "Convert the given panthera data-frame or series to a Clojure vector of maps. - The idea is to have a common, simple and fast access point to conversion of - the main data structures between languages. - - - series: a series gets converted to a vector of maps with only one key and - one value. If the series has a name that becomes the key of the maps, - otherwise `->clj` falls back to the `:unnamed` key. - - data-frame: a data-frame is converted to a vector of maps with names - of the columns as keys and values as the corresponding row/column value. - - Examples: - - ``` - (->clj my-srs) - - (->clj my-df) - ```" - [df-or-srs] - (if (series? 
df-or-srs) - (let [nm (memo-columns-converter - (or (py/get-attr df-or-srs "name") - :unnamed))] - (into [] (map #(assoc {} nm %)) - (vec df-or-srs))) - (let [ks (map memo-columns-converter - (py/get-attr df-or-srs "columns"))] - (into [] (map #(zipmap ks %)) - (py/get-attr df-or-srs "values"))))) - -(defn simple-kw-call - "Helper for a cleaner access to `call-attr-kw` from `libpython-clj`" - [df kw & [attrs]] - (py/call-attr-kw df kw [] - (keys->pyargs attrs))) - -(defn kw-call - "Helper for a cleaner access to `call-attr-kw` from `libpython-clj`" - [df kw pos & [attrs]] - (py/call-attr-kw df kw [(vals->pylist pos)] - (keys->pyargs attrs))) +(ns panthera.pandas.utils + (:require + [libpython-clj.python :as py] + [libpython-clj.require :refer [require-python]] + [camel-snake-kebab.extras :as cske] + [clojure.core.memoize :as m])) + +(defonce builtins (py/import-module "builtins")) + +(defonce pd (py/import-module "pandas")) + +(defn slice + "Returns a Python slice. This is what you'd get by doing something like + `1:10` and it is similar to `(range 1 10)`, but works with everything + not only numbers, so `(slice \"a\" \"f\")` would mean + [\"a\" \"b\" \"c\" \"d\" \"e\" \"f\"]. Use this for subsetting arrays, + serieses and data-frames. 
+ + Example: + + ``` + (slice) ; the empty slice, it means every index + + (slice 5) ; every index up to 5 + + (slice 3 5) ; every index from 3 to 5 + + (slice \"2019-10-11\" \"2019-12-3\") ; works with dates as well + + (slice \"a\" \"d\") ; works with strings + + (slice 1 10 2) ; every 2 values between 1 and 10 + ```" + ([] + (py/call-attr builtins "slice" nil)) + ([start] + (py/call-attr builtins "slice" start)) + ([start stop] + (py/call-attr builtins "slice" start stop)) + ([start stop incr] + (py/call-attr builtins "slice" start stop incr))) + +(defn pytype + "Return the Python type of the given objects + + Examples: + + ``` + (pytype obj) + + (pytype my-df my-srs this) + ```" + ([] nil) + ([obj] + (py/python-type obj)) + ([obj & objs] + (map pytype (concat (vector obj) objs)))) + +(def pystr->cljk + (comp + keyword + #(clojure.string/replace % #"_" "-") + #(clojure.string/replace % #" " "-"))) + +(def cljk->pystr + (comp + #(clojure.string/replace % #"-" "_") + name)) + +(def memo-key-converter + "Convert regular Clojure kebab-case keys to idiomatic + Python snake_case strings. + + Example: + + ``` + (memo-key-converter :a-key) ; \"a_key\" + ```" + (m/fifo #(if (keyword? %) (cljk->pystr %) %) {} :fifo/threshold 512)) + +(def memo-columns-converter + "Converts Python strings to idiomatic Clojure keys. + + Examples: + + ``` + (memo-columns-converter \"a_name\") ; :a-name + + (memo-columns-converter \"ALL_CAPS\") ; :ALL-CAPS + ```" + (m/fifo + #(cond + (number? %) % + (string? %) (pystr->cljk %) + (nil? %) nil + :else (mapv pystr->cljk %)) {} :fifo/threshold 512)) + +(defn vec->pylist + "Converts an iterable Clojure data structure to a Python list + + Example: + + ``` + (vec->pylist my-df) + ```" + [v] + (py/->py-list v)) + +(defn nested-vector? + "Check if the given argument is a nested vector or not. + + Example: + + ``` + (nested-vector? [[1 2] [3 4]]) + ```" + [v] + (some vector? v)) + +(defn nested-slice? 
+ "Check if the given value contains at least one `:slice`. + + Example: + + ``` + (nested-slice? [(slice 3 5) (slice)]) + ```" + [v] + (some #(identical? :slice (pytype %)) v)) + +(defn vals->pylist + "Takes some values and dispatches them to the right conversion to a Python + data structure. + + Examples: + + ``` + (vals->pylist [1 2 3]) + + (vals->pylist [[1 2] [3 4]]) + + (vals->pylist [(slice 1 5) (slice)]) + ```" + [obj] + (cond + (not (coll? obj)) obj + (map? obj) obj + (nested-vector? obj) (to-array-2d obj) + (vector? obj) (if (nested-slice? obj) + obj + (py/->py-list obj)) + :else obj)) + +(defn keys->pyargs + "Takes a map as an argument and converts keys to Python strings + and values to the proper data structure. + + Examples: + + ``` + (keys->pyargs {:a 1 :a-key [1 2 3] \"c\" (slice)}) + ```" + [m] + (let [nm (reduce-kv + (fn [m k v] + (assoc m k (vals->pylist v))) + {} m)] + (cske/transform-keys memo-key-converter nm))) + +(defn series? + "Check if the given argument is a series" + [obj] + (identical? :series (pytype obj))) + +(defn data-frame? + "Check if the given argument is a data-frame" + [obj] + (identical? :data-frame (pytype obj))) + +(defrecord DATASET [id cols data shape]) + +(defn pr-lazy-dataset + [data] + (let [cnt (first (:shape data))] + (if (> cnt 4) + (conj (vec (take 5 (:data data))) '...) 
+ (vec (:data data))))) + +(defmethod print-method DATASET [v ^java.io.Writer w] + (let [id (:id v) + cols (:cols v) + shape (:shape v) + data (pr-lazy-dataset v)] + (clojure.pprint/pprint {:id id :cols cols :data data}))) + +(defmethod print-dup DATASET [v ^java.io.Writer w] + (let [id (:id v) + cols (:cols v) + shape (:shape v) + data (pr-lazy-dataset v)] + (clojure.pprint/pprint {:shape (vec shape) :id id :cols cols :data data}))) + +(defmethod clojure.pprint/simple-dispatch DATASET [v] + (let [id (:id v) + cols (:cols v) + shape (:shape v) + data (pr-lazy-dataset v)] + (clojure.pprint/pprint {:shape (vec shape) :id id :cols cols :data data}))) + +(defmulti to-clj + (fn [obj] (identical? :series (py/python-type obj)))) + +(defmethod to-clj false + [obj] + (let [cnt (py/get-attr obj "shape")] + (->DATASET + (py/get-attr obj "index") + (py/get-attr obj "columns") + (lazy-seq (py/get-attr obj "values")) + cnt))) + +(defmethod to-clj true + [obj] + (let [cnt (py/get-attr obj "shape")] + (->DATASET + (py/get-attr obj "index") + (or (py/get-attr obj "name") "unnamed") + (lazy-seq (py/get-attr obj "values")) + cnt))) + +(defmulti kwrds? + (fn [obj keywords?] (boolean keywords?))) + +(defmethod kwrds? true + [obj keywords?] + (if (series? obj) + (let [nm (memo-columns-converter + (or (py/get-attr obj "name") + "unnamed"))] + (into [] (map #(assoc {} nm %)) + (vec obj))) + (let [ks (map memo-columns-converter + (py/get-attr obj "columns"))] + (into [] (map #(zipmap ks %)) + (py/get-attr obj "values"))))) + +(defmethod kwrds? false + [obj keywords?] + (if (series? obj) + (let [nm (or (py/get-attr obj "name") + "unnamed")] + (into [] (map #(assoc {} nm %)) + (vec obj))) + (let [ks (py/get-attr obj "columns")] + (into [] (map #(zipmap ks %)) + (py/get-attr obj "values"))))) + +(defn ->clj + "Convert the given panthera data-frame or series to a Clojure vector of maps. 
+ The idea is to have a common, simple and fast access point to conversion of + the main data structures between languages. + + - `series`: a `series` gets converted to a vector of maps with only one key and + one value. If the series has a name that becomes the key of the maps, + otherwise `->clj` falls back to the `:unnamed` key. + - `data-frame`: a `data-frame` is converted to a vector of maps with names + of the columns as keys and values as the corresponding row/column value. + + With the default method you might incur a data loss: the index doesn't get + converted and in case you're using a hierarchical index you get only one level + out of it. To keep everything in one place you have to make `full?` true, in + this way you get back a map with keys `{:id :cols :data}`. + + **Arguments** + + - `df-or-srs` -> `data-frame` or `series` + - `:full?` -> whether to use the full conversion, default false + - `:keywords?` -> whether to convert column names to keywords, default true + + N.B.: `:full?` usage excludes `:keywords?` + + **Examples** + + ``` + (->clj my-srs) + + (->clj my-df) + ``` + " + [df-or-srs & {:keys [full? keywords?] :or {keywords? true}}] + (if full? + (to-clj df-or-srs) + (kwrds? 
df-or-srs keywords?))) + +(defn simple-kw-call + "Helper for a cleaner access to `call-attr-kw` from `libpython-clj`" + [df kw & [attrs]] + (py/call-attr-kw df kw [] + (keys->pyargs attrs))) + +(defn kw-call + "Helper for a cleaner access to `call-attr-kw` from `libpython-clj`" + [df kw pos & [attrs]] + (py/call-attr-kw df kw [(vals->pylist pos)] + (keys->pyargs attrs))) diff --git a/src/panthera/panthera.clj b/src/panthera/panthera.clj index a379a66..1fbb13f 100644 --- a/src/panthera/panthera.clj +++ b/src/panthera/panthera.clj @@ -1,140 +1,146 @@ -(ns panthera.panthera - (:refer-clojure - :exclude [mod any?]) - (:require - [tech.parallel.utils :refer [export-symbols]] - [panthera.pandas.generics] - [panthera.pandas.math] - [panthera.pandas.utils] - [panthera.pandas.conversion] - [panthera.pandas.reshape])) - -(export-symbols - panthera.pandas.generics - n-rows - one-hot - hasnans? - swap-level - cross-section - n-unique - n-smallest - any? - subset-cols - n-largest - names - read-csv - select-rows - unique - filter-rows - dtype - value-counts - index - series - all? - read-excel - set-index - to-csv - data-frame - subset-rows - decreasing? - n-cols - head - increasing? - memory-usage - values - tail - reset-index - unique? - not-na? - shape - fill-na - nbytes - ftype) - -(export-symbols - panthera.pandas.math - dot - ne - quantile - kurtosis - lt - std - le - add - sum - diff - ge - cumprod - clip - cumsum - eq - compound - mean - corr - sub - mod - pow - skew - rank - maximum - mode - between - pct-change - cummin - cnt - cummax - ops - autocorr - cov - div - round - mul - sem - var - abs - median - gt - minimum - describe - mean-abs-dev - floor-div) - -(export-symbols - panthera.pandas.utils - pytype - slice - ->clj - series? - data-frame?) 
- -(export-symbols - panthera.pandas.conversion - ->timedelta - date-range - astype - ->numeric - timedelta-range - infer-time-freq - ->datetime - interval-range) - -(export-symbols - panthera.pandas.reshape - pivot - aggregate - crosstab - cut - rolling - concatenate - remap - factorize - qcut - merge-ordered - dropna - merge-asof - ewm - groupby - melt - assign - unstack - transpose) +(ns panthera.panthera + (:refer-clojure + :exclude [mod any? drop]) + (:require + [tech.parallel.utils :refer [export-symbols]] + [panthera.pandas.generics] + [panthera.pandas.math] + [panthera.pandas.utils] + [panthera.pandas.conversion] + [panthera.pandas.reshape])) + +(export-symbols + panthera.pandas.generics + n-rows + one-hot + hasnans? + swap-level + cross-section + n-unique + n-smallest + any? + subset-cols + n-largest + names + read-csv + select-rows + unique + filter-rows + dtype + value-counts + index + series + all? + read-excel + factorize + to-excel + set-index + to-csv + data-frame + subset-rows + rename + decreasing? + n-cols + head + increasing? + memory-usage + values + tail + reset-index + unique? + not-na? + shape + fill-na + nbytes + ftype) + +(export-symbols + panthera.pandas.math + dot + ne + quantile + kurtosis + lt + std + le + add + sum + diff + same? + ge + cumprod + clip + cumsum + eq + mean + corr + sub + mod + pow + skew + rank + maximum + mode + between + pct-change + cummin + cnt + cummax + ops + autocorr + cov + div + round + mul + sem + var + abs + median + gt + minimum + describe + mean-abs-dev + floor-div) + +(export-symbols + panthera.pandas.utils + pytype + slice + ->clj + series? + data-frame?) 
+ +(export-symbols + panthera.pandas.conversion + ->timedelta + date-range + astype + ->numeric + timedelta-range + infer-time-freq + ->datetime + interval-range) + +(export-symbols + panthera.pandas.reshape + pivot + aggregate + crosstab + cut + rolling + unstack + concatenate + remap + transpose + drop-cols + qcut + drop + merge-ordered + drop-rows + dropna + merge-asof + assign + ewm + groupby + stack + melt) diff --git a/test/panthera/config.clj b/test/panthera/config.clj new file mode 100644 index 0000000..1c833f6 --- /dev/null +++ b/test/panthera/config.clj @@ -0,0 +1,8 @@ +(ns panthera.config + (:require + [libpython-clj.python :as py])) + +(defn start-python! + [f] + (py/initialize!) + (f)) diff --git a/test/panthera/generics_test.clj b/test/panthera/generics_test.clj index e4750ea..b18b695 100644 --- a/test/panthera/generics_test.clj +++ b/test/panthera/generics_test.clj @@ -1,388 +1,386 @@ -(ns panthera.generics-test - (:require - [clojure.test :refer :all] - [libpython-clj.python :as py] - [panthera.pandas.generics :as g] - [panthera.pandas.utils :as u :reload true] - [panthera.pandas.math :as m])) - -(deftest series - (are [i m] - (u/series? (g/series i m)) - [] {} - [] {:name :test} - [1 2 3] {} - 1 {} - ["1" "2"] {} - ["1" "2"] {:dtype :float32}) - (are [i m o] - (= (vec (g/series i m)) o) - [] {} [] - [] {:name :test} [] - [1 2 3] {} [1 2 3] - [:a :b] {} ["a" "b"] - ["a" "b"] {} ["a" "b"] - [1 2] {:dtype :str} ["1" "2"] - ["1" "2"] {:dtype :float32} [1.0 2.0])) - -(deftest data-frame - (are [i m] - (u/data-frame? 
(g/data-frame i m)) - [{:a 1 :b 2}] {} - (to-array-2d [[1 2] [3 4]]) {} - (to-array-2d [[1 2] [3 4]]) {:columns [:a :b]} - (to-array-2d [[1 2] [3 4]]) {:dtype :int8}) - (are [i m o] - (= (u/->clj (g/data-frame i m)) o) - [] {} [] - [] {:columns [:a :b]} [] - [{:a 1 :b 2} {:a 1 :b 2}] {} [{:a 1 :b 2} {:a 1 :b 2}] - [{:a "1" :b 2} {:a "3" :b 2}] {} [{:a "1" :b 2} {:a "3" :b 2}] - - [{:a "1" :b 2} {:a "3" :b 2}] - {:dtype :float32} - [{:a 1.0 :b 2.0} {:a 3.0 :b 2.0}] - - [{:a "1" :b 2} {:a "3" :b 2}] - {:dtype :str} - [{:a "1" :b "2"} {:a "3" :b "2"}] - - (to-array-2d [[1 2] [3 4]]) {} [{0 1 1 2} {0 3 1 4}] - (to-array-2d [[1 2] [3 4]]) - {:columns [:a :b]} [{:a 1 :b 2} {:a 3 :b 4}])) - -(deftest one-hot - (are [i m o] - (= (u/->clj (g/one-hot (g/series i) m)) o) - [] {} [] - ["a" "b"] {} [{:a 1 - :b 0} - {:a 0 - :b 1}] - ["a" "b"] {:prefix "pre"} [{:pre-a 1 - :pre-b 0} - {:pre-a 0 - :pre-b 1}]) - (are [i m o] - (= (u/->clj (g/one-hot (g/data-frame i) - {:columns m})) o) - - [{:a 1 :b "c"} {:a 2 :b "d"}] - [:b] - [{:a 1 - :b-c 1 - :b-d 0} - {:a 2 - :b-c 0 - :b-d 1}] - - [{:a 1 :b "c" :c 1} {:a 2 :b "d" :c 2}] - [:b :c] - [{:a 1 - :b-c 1 - :b-d 0 - :c-1 1 - :c-2 0} - {:a 2 - :b-c 0 - :b-d 1 - :c-1 0 - :c-2 1}])) - -(deftest unique - (are [i o] - (= (vec (g/unique i)) o) - [] [] - [1 1] [1] - [:a :b :a] ["a" "b"] - [1 -1 1] [1 -1])) - -(deftest index - (are [i o] - (= (vec (g/index i)) o) - (g/series []) [] - (g/series [1 2 3]) [0 1 2] - (g/series [1 2] {:index [100 1000]}) [100 1000])) - -(deftest values - (are [i o] - (= (vec (g/values i)) o) - (g/series []) [] - (g/series [1 2 3]) [1 2 3]) - (is (= (mapv vec (g/values (g/data-frame (to-array-2d [[1 2] [3 4]])))) - [[1 2] [3 4]]))) - -(deftest shape - (are [i o] - (= (vec (g/shape i)) o) - (g/series []) [0] - (g/series [1 2 3]) [3] - (g/data-frame (to-array-2d [[1 2] [3 4]])) [2 2])) - -(deftest hasnans? - (are [i o] - (= (g/hasnans? 
i) o) - (g/series []) false - (g/series [nil]) true - (g/series [1 2 nil]) true)) - -(deftest subset-rows - (are [s o] - (= (u/->clj (apply g/subset-rows - (g/data-frame (->> (range 1 11) - (partition 2) - to-array-2d)) s)) o) - [] (u/->clj (g/data-frame (->> (range 1 11) - (partition 2) - to-array-2d))) - [1] [{0 1 1 2}] - [1 3] [{0 3 1 4} {0 5 1 6}] - [1 3 2] [{0 3 1 4}])) - -(deftest cross-section - (are [k o] - (= (vec - (g/cross-section - (g/series (range 5) - {:index [:a :b :b :c :a]}) k)) - o) - :a [0 4] - :b [1 2])) - -(deftest head - (are [n o] - (= (u/->clj - (g/head - (g/data-frame - (flatten - (repeat 5 [{:a 1 :b 2} - {:a 2 :b 3}]))) n)) - o) - nil (drop-last (flatten - (repeat 3 [{:a 1 :b 2} - {:a 2 :b 3}]))) - 1 [{:a 1 :b 2}] - 8 (flatten - (repeat 4 [{:a 1 :b 2} - {:a 2 :b 3}])))) - -(deftest subset-cols - (are [i cols o] - (= (u/->clj - (apply - g/subset-cols - (g/data-frame i) - cols)) - o) - [{:a 1}] [:a] [{:a 1}] - [{:a 1 :b 2 :c 3}] [:a :c] [{:a 1 :c 3}] - (repeat 5 {:a 1 :b 2}) [:b] (repeat 5 {:b 2}) - [{:wEiR__.D 1 :b 2}] [:wEiR__.D] [{:w-ei-r-.-d 1}])) - -(deftest n-largest - (are [m o] - (= (vec - (g/n-largest - (g/series (range 20)) m)) - o) - {:n 5} (range 19 14 -1) - {:n 3} [19 18 17] - {:n 8} (range 19 11 -1))) - -(deftest n-smallest - (are [m o] - (= (vec - (g/n-smallest - (g/series (range 20)) m)) - o) - {:n 5} (range 5) - {:n 3} (range 3) - {:n 8} (range 8))) - -(deftest n-unique - (are [i o] - (= (g/n-unique - (g/series i)) - o) - (range 10) 10 - [1 1 2] 2 - [11 nil 3] 2)) - -(deftest unique? - (are [i o] - (= (g/unique? i) o) - [] true - [1 2 3] true - [1 1] false - [-1 1] true - [1 nil] true - ["a" "b"] true - (g/series [1 1]) false)) - -(deftest increasing? - (are [i o] - (= (g/increasing? i) o) - [] true - [1 5 9] true - [1 nil 3] false - [1 1 1 1] true - [3 2 1] false)) - -(deftest decreasing? - (are [i o] - (= (g/decreasing? 
i) o) - [] true - [9 7 1] true - [3 nil 1] false - [3 3 3] true - [1 2 3] false)) - -(deftest value-counts - (are [i m o] - (= (g/value-counts i (merge {:clj true} m)) o) - [] {} {} - [1 1 2] {} {1 2 2 1} - [:a :a :b :c] {} {:a 2 :b 1 :c 1} - (repeat 50 :a) {} {:a 50} - [:a :a :b :c] {:normalize true} {:a 0.5 :b 0.25 :c 0.25} - ;(range 20) {:bins 4} {:a 0.5 :b 0.25 :c 0.25} Intervals are not handled - )) - -(deftest reset-index - (are [i m o] - (= (u/->clj (g/reset-index (g/series i) m)) o) - (range 3) {} [{:index 0 0 0} - {:index 1 0 1} - {:index 2 0 2}] - (range 3) {:drop true} [{:unnamed 0} - {:unnamed 1} - {:unnamed 2}] - (range 3) {:name "col"} [{:index 0 :col 0} - {:index 1 :col 1} - {:index 2 :col 2}])) - -(deftest names - (are [i o] - (= (g/names i) o) - (g/series [1 2]) nil - (g/series [1 2] {:name "name"}) "name" - (g/series [1 2] {:name :my-name}) "my-name") - (are [i o] - (= (vec (g/names (g/data-frame i))) o) - [{:a 1 :b 2}] ["a" "b"] - [{"a name" 1 :c 2}] ["a name" "c"] - [{123 1 1/5 3}] [123.0 0.2])) - -(deftest filter-rows - (are [i b o] - (= (u/->clj - (g/filter-rows i b)) o) - (g/series (range 10)) #(m/gt % 5) [{:unnamed 6} - {:unnamed 7} - {:unnamed 8} - {:unnamed 9}] - (g/series (range 4)) [false true false true] [{:unnamed 1} - {:unnamed 3}] - - (g/data-frame [{:a 1 :b 2} - {:a 3 :b 4}]) - #(-> % - (g/subset-cols :a) - (m/lt 3) - g/values) - [{:a 1 :b 2}] - - (g/data-frame [{:a 1 :b 2} - {:a 3 :b 4} - {:a 4 :b 5}]) - [true false false] - [{:a 1 :b 2}])) - -(deftest tail - (are [i n o] - (= (u/->clj - (g/tail i n)) - o) - (g/series (range 20)) nil [{:unnamed 15} - {:unnamed 16} - {:unnamed 17} - {:unnamed 18} - {:unnamed 19}] - (g/series (range 20)) 2 [{:unnamed 18} {:unnamed 19}] - (g/data-frame (repeat 10 {:a 1 :b 2})) nil (repeat 5 {:a 1 :b 2}) - (g/data-frame (repeat 10 {:a 1 :b 2})) 2 (repeat 2 {:a 1 :b 2}))) - -(deftest fill-na - (are [v m o] - (= (vec - (g/fill-na (g/series [1 nil 2 nil]) v m)) o) - 3 {} [1.0 3.0 2.0 3.0] - "a" {} [1.0 
"a" 2.0 "a"] - nil {:method :ffill} [1.0 1.0 2.0 2.0])) - -(deftest select-rows - (are [i id l h o] - (= (u/->clj - (g/select-rows - (g/data-frame i (or {:index l} {})) - id h)) - o) - (to-array-2d (partition 2 (range 20))) - [] - nil - nil - [] - - (to-array-2d (partition 2 (range 20))) - [0 3] - nil - nil - [{0 0 1 1} {0 6 1 7}] - - (to-array-2d (partition 2 (range 10))) - [0 3] - [:a :b :c :d :e] - nil - [{0 0 1 1} {0 6 1 7}] - - (to-array-2d (partition 2 (range 10))) - [0 3] - nil - :loc - [{0 0 1 1} {0 6 1 7}] - - (to-array-2d (partition 2 (range 10))) - [:a :d] - [:a :b :c :d :e] - :loc - [{0 0 1 1} {0 6 1 7}] - - (to-array-2d (partition 2 (range 10))) - (u/slice 3) - nil - nil - [{0 0 1 1} {0 2 1 3} {0 4 1 5}] - - (to-array-2d (partition 4 (range 20))) - [(u/slice 2) (u/slice 1)] - nil - :loc - [{0 0 1 1} {0 4 1 5} {0 8 1 9}])) - -(deftest set-index - (are [idx m oid ov] - (and (= (vec - (g/index - (g/set-index - (g/data-frame [{:a 1 :b 2 :c 3} {:a 2 :b 3 :c 4}]) - idx m))) - oid) - (= (u/->clj - (g/set-index - (g/data-frame [{:a 1 :b 2 :c 3} {:a 2 :b 3 :c 4}]) - idx m)) - ov)) - [:a] {} [1 2] [{:b 2 :c 3} {:b 3 :c 4}] - [:a :b] {} [[1 2] [2 3]] [{:c 3} {:c 4}] - [:a] {:drop false} [1 2] [{:a 1 :b 2 :c 3} {:a 2 :b 3 :c 4}] - [:a] {:append true} [[0 1] [1 2]] [{:b 2 :c 3} {:b 3 :c 4}])) +(ns panthera.generics-test + (:require + [clojure.test :refer :all] + [libpython-clj.python :as py] + [panthera.pandas.generics :as g :reload true] + [panthera.pandas.utils :as u :reload true] + [panthera.pandas.math :as m])) + +(deftest series + (are [i m] + (u/series? (g/series i m)) + [] {} + [] {:name :test} + [1 2 3] {} + 1 {} + ["1" "2"] {} + ["1" "2"] {:dtype :float32}) + (are [i m o] + (= (vec (g/series i m)) o) + [] {} [] + [] {:name :test} [] + [1 2 3] {} [1 2 3] + [:a :b] {} ["a" "b"] + ["a" "b"] {} ["a" "b"] + [1 2] {:dtype :str} ["1" "2"] + ["1" "2"] {:dtype :float32} [1.0 2.0])) + +(deftest data-frame + (are [i m] + (u/data-frame? 
(g/data-frame i m)) + [{:a 1 :b 2}] {} + (to-array-2d [[1 2] [3 4]]) {} + (to-array-2d [[1 2] [3 4]]) {:columns [:a :b]} + (to-array-2d [[1 2] [3 4]]) {:dtype :int8}) + (are [i m o] + (= (u/->clj (g/data-frame i m)) o) + [] {} [] + [] {:columns [:a :b]} [] + [{:a 1 :b 2} {:a 1 :b 2}] {} [{:a 1 :b 2} {:a 1 :b 2}] + [{:a "1" :b 2} {:a "3" :b 2}] {} [{:a "1" :b 2} {:a "3" :b 2}] + + [{:a "1" :b 2} {:a "3" :b 2}] + {:dtype :float32} + [{:a 1.0 :b 2.0} {:a 3.0 :b 2.0}] + + [{:a "1" :b 2} {:a "3" :b 2}] + {:dtype :str} + [{:a "1" :b "2"} {:a "3" :b "2"}] + + (to-array-2d [[1 2] [3 4]]) {} [{0 1 1 2} {0 3 1 4}] + (to-array-2d [[1 2] [3 4]]) + {:columns [:a :b]} [{:a 1 :b 2} {:a 3 :b 4}])) + +(deftest one-hot + (are [i m o] + (= (u/->clj (g/one-hot (g/series i) m)) o) + [] {} [] + ["a" "b"] {} [{:a 1 + :b 0} + {:a 0 + :b 1}] + ["a" "b"] {:prefix "pre"} [{:pre-a 1 + :pre-b 0} + {:pre-a 0 + :pre-b 1}]) + (are [i m o] + (= (u/->clj (g/one-hot (g/data-frame i) + {:columns m})) o) + + [{:a 1 :b "c"} {:a 2 :b "d"}] + [:b] + [{:a 1 + :b-c 1 + :b-d 0} + {:a 2 + :b-c 0 + :b-d 1}] + + [{:a 1 :b "c" :c 1} {:a 2 :b "d" :c 2}] + [:b :c] + [{:a 1 + :b-c 1 + :b-d 0 + :c-1 1 + :c-2 0} + {:a 2 + :b-c 0 + :b-d 1 + :c-1 0 + :c-2 1}])) + +(deftest unique + (are [i o] + (= (vec (g/unique i)) o) + [] [] + [1 1] [1] + [:a :b :a] ["a" "b"] + [1 -1 1] [1 -1])) + +(deftest index + (are [i o] + (= (vec (g/index i)) o) + (g/series []) [] + (g/series [1 2 3]) [0 1 2] + (g/series [1 2] {:index [100 1000]}) [100 1000])) + +(deftest values + (are [i o] + (= (vec (g/values i)) o) + (g/series []) [] + (g/series [1 2 3]) [1 2 3]) + (is (= (mapv vec (g/values (g/data-frame (to-array-2d [[1 2] [3 4]])))) + [[1 2] [3 4]]))) + +(deftest shape + (are [i o] + (= (vec (g/shape i)) o) + (g/series []) [0] + (g/series [1 2 3]) [3] + (g/data-frame (to-array-2d [[1 2] [3 4]])) [2 2])) + +(deftest hasnans? + (are [i o] + (= (g/hasnans? 
i) o) + (g/series []) false + (g/series [nil]) true + (g/series [1 2 nil]) true)) + +(deftest subset-rows + (are [s o] + (= (u/->clj (apply g/subset-rows + (g/data-frame (->> (range 1 11) + (partition 2) + to-array-2d)) s)) o) + [] (u/->clj (g/data-frame (->> (range 1 11) + (partition 2) + to-array-2d))) + [1] [{0 1 1 2}] + [1 3] [{0 3 1 4} {0 5 1 6}] + [1 3 2] [{0 3 1 4}])) + +(deftest cross-section + (are [k o] + (= (vec + (g/cross-section + (g/series (range 5) + {:index [:a :b :b :c :a]}) k)) + o) + :a [0 4] + :b [1 2])) + +(deftest head + (are [n o] + (= (u/->clj + (g/head + (g/data-frame + (vec + (flatten + (repeat 5 [{:a 1 :b 2} + {:a 2 :b 3}])))) n)) + o) + nil (drop-last (flatten + (repeat 3 [{:a 1 :b 2} + {:a 2 :b 3}]))) + 1 [{:a 1 :b 2}] + 8 (flatten + (repeat 4 [{:a 1 :b 2} + {:a 2 :b 3}])))) + +(deftest subset-cols + (are [i cols o] + (= (u/->clj + (apply + g/subset-cols + (g/data-frame i) + cols)) + o) + [{:a 1}] [:a] [{:a 1}] + [{:a 1 :b 2 :c 3}] [:a :c] [{:a 1 :c 3}] + (vec (repeat 5 {:a 1 :b 2})) [:b] (vec (repeat 5 {:b 2})))) + +(deftest n-largest + (are [m o] + (= (vec + (g/n-largest + (g/series (range 20)) m)) + o) + {:n 5} (range 19 14 -1) + {:n 3} [19 18 17] + {:n 8} (range 19 11 -1))) + +(deftest n-smallest + (are [m o] + (= (vec + (g/n-smallest + (g/series (range 20)) m)) + o) + {:n 5} (range 5) + {:n 3} (range 3) + {:n 8} (range 8))) + +(deftest n-unique + (are [i o] + (= (g/n-unique + (g/series i)) + o) + (range 10) 10 + [1 1 2] 2 + [11 nil 3] 2)) + +(deftest unique? + (are [i o] + (= (g/unique? i) o) + [] true + [1 2 3] true + [1 1] false + [-1 1] true + [1 nil] true + ["a" "b"] true + (g/series [1 1]) false)) + +(deftest increasing? + (are [i o] + (= (g/increasing? i) o) + [] true + [1 5 9] true + [1 nil 3] false + [1 1 1 1] true + [3 2 1] false)) + +(deftest decreasing? + (are [i o] + (= (g/decreasing? 
i) o) + [] true + [9 7 1] true + [3 nil 1] false + [3 3 3] true + [1 2 3] false)) + +(deftest value-counts + (are [i m o] + (= (g/value-counts i (merge {:clj true} m)) o) + [] {} {} + [1 1 2] {} {1 2 2 1} + [:a :a :b :c] {} {:a 2 :b 1 :c 1} + (repeat 50 :a) {} {:a 50} + [:a :a :b :c] {:normalize true} {:a 0.5 :b 0.25 :c 0.25})) + +(deftest reset-index + (are [i m o] + (= (u/->clj (g/reset-index (g/series i) m)) o) + (range 3) {} [{:index 0 0 0} + {:index 1 0 1} + {:index 2 0 2}] + (range 3) {:drop true} [{:unnamed 0} + {:unnamed 1} + {:unnamed 2}] + (range 3) {:name "col"} [{:index 0 :col 0} + {:index 1 :col 1} + {:index 2 :col 2}])) + +(deftest names + (are [i o] + (= (g/names i) o) + (g/series [1 2]) nil + (g/series [1 2] {:name "name"}) "name" + (g/series [1 2] {:name :my-name}) "my-name") + (are [i o] + (= (vec (g/names (g/data-frame i))) o) + [{:a 1 :b 2}] ["a" "b"] + [{"a name" 1 :c 2}] ["a name" "c"] + [{123 1 1/5 3}] [123.0 0.2])) + +(deftest filter-rows + (are [i b o] + (= (u/->clj + (g/filter-rows i b)) o) + (g/series (range 10)) #(m/gt % 5) [{:unnamed 6} + {:unnamed 7} + {:unnamed 8} + {:unnamed 9}] + (g/series (range 4)) [false true false true] [{:unnamed 1} + {:unnamed 3}] + + (g/data-frame [{:a 1 :b 2} + {:a 3 :b 4}]) + #(-> % + (g/subset-cols :a) + (m/lt 3) + g/values) + [{:a 1 :b 2}] + + (g/data-frame [{:a 1 :b 2} + {:a 3 :b 4} + {:a 4 :b 5}]) + [true false false] + [{:a 1 :b 2}])) + +(deftest tail + (are [i n o] + (= (u/->clj + (g/tail i n)) + o) + (g/series (range 20)) nil [{:unnamed 15} + {:unnamed 16} + {:unnamed 17} + {:unnamed 18} + {:unnamed 19}] + (g/series (range 20)) 2 [{:unnamed 18} {:unnamed 19}] + (g/data-frame (vec (repeat 10 {:a 1 :b 2}))) nil (repeat 5 {:a 1 :b 2}) + (g/data-frame (vec (repeat 10 {:a 1 :b 2}))) 2 (repeat 2 {:a 1 :b 2}))) + +(deftest fill-na + (are [v m o] + (= (vec + (g/fill-na (g/series [1 nil 2 nil]) v m)) o) + 3 {} [1.0 3.0 2.0 3.0] + "a" {} [1.0 "a" 2.0 "a"] + nil {:method :ffill} [1.0 1.0 2.0 2.0])) + +(deftest 
select-rows + (are [i id l h o] + (= (u/->clj + (g/select-rows + (g/data-frame i (or {:index l} {})) + id h)) + o) + (to-array-2d (partition 2 (range 20))) + [] + nil + nil + [] + + (to-array-2d (partition 2 (range 20))) + [0 3] + nil + nil + [{0 0 1 1} {0 6 1 7}] + + (to-array-2d (partition 2 (range 10))) + [0 3] + [:a :b :c :d :e] + nil + [{0 0 1 1} {0 6 1 7}] + + (to-array-2d (partition 2 (range 10))) + [0 3] + nil + :loc + [{0 0 1 1} {0 6 1 7}] + + (to-array-2d (partition 2 (range 10))) + [:a :d] + [:a :b :c :d :e] + :loc + [{0 0 1 1} {0 6 1 7}] + + (to-array-2d (partition 2 (range 10))) + (u/slice 3) + nil + nil + [{0 0 1 1} {0 2 1 3} {0 4 1 5}] + + (to-array-2d (partition 4 (range 20))) + [(u/slice 2) (u/slice 1)] + nil + :loc + [{0 0 1 1} {0 4 1 5} {0 8 1 9}])) + +(deftest set-index + (are [idx m oid ov] + (and (= (vec + (g/index + (g/set-index + (g/data-frame [{:a 1 :b 2 :c 3} {:a 2 :b 3 :c 4}]) + idx m))) + oid) + (= (u/->clj + (g/set-index + (g/data-frame [{:a 1 :b 2 :c 3} {:a 2 :b 3 :c 4}]) + idx m)) + ov)) + [:a] {} [1 2] [{:b 2 :c 3} {:b 3 :c 4}] + [:a :b] {} [[1 2] [2 3]] [{:c 3} {:c 4}] + [:a] {:drop false} [1 2] [{:a 1 :b 2 :c 3} {:a 2 :b 3 :c 4}] + [:a] {:append true} [[0 1] [1 2]] [{:b 2 :c 3} {:b 3 :c 4}])) diff --git a/test/panthera/math_test.clj b/test/panthera/math_test.clj index 1231723..0fa407e 100644 --- a/test/panthera/math_test.clj +++ b/test/panthera/math_test.clj @@ -114,6 +114,4 @@ :prod (repeat 10 0) :sum [0 1 3 6 10 15 21 28 36 45]) (is (= (drop 1 (vec ((#'m/base-cumulative :diff) (g/series (range 10))))) - (repeat 9 1.0))) - (is (= ((#'m/base-cumulative :cmpnd) (g/series (range 10))) - 3628799))) + (repeat 9 1.0)))) diff --git a/test/panthera/reshape_test.clj b/test/panthera/reshape_test.clj new file mode 100644 index 0000000..ecb4072 --- /dev/null +++ b/test/panthera/reshape_test.clj @@ -0,0 +1,492 @@ +(ns panthera.reshape-test + (:refer-clojure + :exclude [drop]) + (:require + [clojure.test :refer :all] + [libpython-clj.python :as 
py] + [libpython-clj.require :refer [require-python]] + [panthera.pandas.utils :as u :reload true] + [panthera.pandas.generics :as g] + [panthera.pandas.reshape :as r :reload true] + [panthera.pandas.math :as m :reload true] + [panthera.pandas.conversion :as c])) + +(require-python '[numpy :as np]) + +(defn filter-nan + [d] + (into [] (comp (mapcat vals) (filter (complement #(.isNaN %)))) d)) + +(deftest crosstab + (are [r c o] + (= (u/->clj (r/crosstab r {:columns c})) o) + [[]] [[]] [] + [[1 2 2]] [[:a :b :a]] [{:a 1 :b 0} {:a 1 :b 1}] + (g/series [1 2 3]) [[:a :b :a]] [{:a 1 :b 0} {:a 0 :b 1} {:a 1 :b 0}]) + (are [r d o] + (= (filter-nan (u/->clj (r/crosstab r d))) o) + [[1 2 2]] {:columns [[:a :b :b]] + :values [10 20 30] + :aggfunc :mean} [10.0 25.0]) + (is (= (u/->clj + (r/crosstab [[1 2 2]] {:columns [[:a :b :a]] :margins true})) + [{:a 1 :b 0 :All 1} + {:a 1 :b 1 :All 2} + {:a 2 :b 1 :All 3}]))) + +(deftest pivot + (are [d o] + (= (u/->clj (r/pivot (g/data-frame {:foo [:one :one :one :two :two :two] + :bar [:a :b :c :a :b :c] + :baz [1 2 3 4 5 6] + :zoo [:x :y :z :q :w :t]}) + d)) o) + + {:columns :bar :index :foo} [{[:baz :a] 1, + [:baz :b] 2, + [:baz :c] 3, + [:zoo :a] "x", + [:zoo :b] "y", + [:zoo :c] "z"} + {[:baz :a] 4, + [:baz :b] 5, + [:baz :c] 6, + [:zoo :a] "q", + [:zoo :b] "w", + [:zoo :c] "t"}] + + {:index :foo :columns :bar :values [:baz :zoo]} [{[:baz :a] 1, + [:baz :b] 2, + [:baz :c] 3, + [:zoo :a] "x", + [:zoo :b] "y", + [:zoo :c] "z"} + {[:baz :a] 4, + [:baz :b] 5, + [:baz :c] 6, + [:zoo :a] "q", + [:zoo :b] "w", + [:zoo :c] "t"}])) + +(deftest cut + (is + (->> (u/->clj (r/cut (g/series [1 7 5 4 6 3]) 3)) + first + vals + first + (m/eq (u/simple-kw-call u/pd "Interval" {:left 0.994 :right 3.0})))) + (are [b d o] + (= (u/->clj (r/cut (g/series [1 7 5 4 6 3]) b d)) o) + 3 {:labels false} [{:unnamed 0} {:unnamed 2} {:unnamed 1} + {:unnamed 1} {:unnamed 2} {:unnamed 0}] + + 3 {:labels [:a :b :c]} [{:unnamed "a"} {:unnamed "c"} {:unnamed "b"} + 
{:unnamed "b"} {:unnamed "c"} {:unnamed "a"}] + + [0 3 5 7] {:labels false} [{:unnamed 0} {:unnamed 2} {:unnamed 1} + {:unnamed 1} {:unnamed 2} {:unnamed 0}])) + +(deftest qcut + (is + (->> (u/->clj (r/qcut (g/series (range 5)) 4)) + first + vals + first + (m/eq (u/simple-kw-call u/pd "Interval" {:left -0.001 :right 1.0})))) + (are [b d o] + (= (u/->clj (r/qcut (g/series (range 5)) b d)) o) + 3 {:labels false} [{:unnamed 0} {:unnamed 0} + {:unnamed 1} {:unnamed 2} + {:unnamed 2}] + + 3 {:labels [:low :medium :high]} [{:unnamed "low"} + {:unnamed "low"} + {:unnamed "medium"} + {:unnamed "high"} + {:unnamed "high"}])) + +(deftest merge-ordered + (let [a (g/data-frame + {:key [:a :c :e :a] + :lvalue [1 2 3 1] + :group [:a :a :a :b]}) + b (g/data-frame + {:key [:b :c :d] + :rvalue [1 2 3]})] + (are [d o] + (m/same? (r/merge-ordered a b d) (g/data-frame o)) + {} [{:key "a", :lvalue 1.0, :group "a", :rvalue ##NaN} + {:key "a", :lvalue 1.0, :group "b", :rvalue ##NaN} + {:key "b", :lvalue ##NaN, :group ##NaN, :rvalue 1.0} + {:key "c", :lvalue 2.0, :group "a", :rvalue 2.0} + {:key "d", :lvalue ##NaN, :group ##NaN, :rvalue 3.0} + {:key "e", :lvalue 3.0, :group "a", :rvalue ##NaN}]))) + +(deftest merge-asof + (let [trades (g/data-frame + {:time (c/->datetime ["2016-05-25 13:30:00.023" + "2016-05-25 13:30:00.038" + "2016-05-25 13:30:00.048" + "2016-05-25 13:30:00.048"]) + :ticker [:MSFT :MSFT :GOOG :AAPL] + :price [51.95 51.95 720.77 98.00] + :quantity [75 155 100 100]}) + quotes (g/data-frame + {:time (c/->datetime ["2016-05-25 13:30:00.023" + "2016-05-25 13:30:00.023" + "2016-05-25 13:30:00.030" + "2016-05-25 13:30:00.048" + "2016-05-25 13:30:00.049"]) + :ticker [:GOOG :MSFT :MSFT :GOOG :AAPL] + :bid [720.5 51.95 51.97 720.5 97.99] + :ask [720.93 51.96 51.98 720.93 98.01]})] + (are [d o] + (m/same? 
(r/merge-asof trades quotes d) (g/data-frame o)) + {:on :time + :suffixes [:-x :-y]} [{:time (c/->datetime "2016-05-25 13:30:00.023000"), + :ticker-x "MSFT", + :price 51.95, + :quantity 75, + :ticker-y "MSFT", + :bid 51.95, + :ask 51.96} + {:time (c/->datetime "2016-05-25 13:30:00.038000"), + :ticker-x "MSFT", + :price 51.95, + :quantity 155, + :ticker-y "MSFT", + :bid 51.97, + :ask 51.98} + {:time (c/->datetime "2016-05-25 13:30:00.048000"), + :ticker-x "GOOG", + :price 720.77, + :quantity 100, + :ticker-y "GOOG", + :bid 720.5, + :ask 720.93} + {:time (c/->datetime "2016-05-25 13:30:00.048000"), + :ticker-x "AAPL", + :price 98.0, + :quantity 100, + :ticker-y "GOOG", + :bid 720.5, + :ask 720.93}] + + {:on :time + :allow-exact-matches false + :suffixes [:-x :-y]} [{:time (c/->datetime "2016-05-25 13:30:00.023000"), + :ticker-x "MSFT", + :price 51.95, + :quantity 75, + :ticker-y ##NaN, + :bid ##NaN, + :ask ##NaN} + {:time (c/->datetime "2016-05-25 13:30:00.038000"), + :ticker-x "MSFT", + :price 51.95, + :quantity 155, + :ticker-y "MSFT", + :bid 51.97, + :ask 51.98} + {:time (c/->datetime "2016-05-25 13:30:00.048000"), + :ticker-x "GOOG", + :price 720.77, + :quantity 100, + :ticker-y "MSFT", + :bid 51.97, + :ask 51.98} + {:time (c/->datetime "2016-05-25 13:30:00.048000"), + :ticker-x "AAPL", + :price 98.0, + :quantity 100, + :ticker-y "MSFT", + :bid 51.97, + :ask 51.98}] + + {:on :time + :direction :forward + :suffixes [:-x :-y]} [{:time (c/->datetime "2016-05-25 13:30:00.023000"), + :ticker-x "MSFT", + :price 51.95, + :quantity 75, + :ticker-y "GOOG", + :bid 720.5, + :ask 720.93} + {:time (c/->datetime "2016-05-25 13:30:00.038000"), + :ticker-x "MSFT", + :price 51.95, + :quantity 155, + :ticker-y "GOOG", + :bid 720.5, + :ask 720.93} + {:time (c/->datetime "2016-05-25 13:30:00.048000"), + :ticker-x "GOOG", + :price 720.77, + :quantity 100, + :ticker-y "GOOG", + :bid 720.5, + :ask 720.93} + {:time (c/->datetime "2016-05-25 13:30:00.048000"), + :ticker-x "AAPL", + 
:price 98.0, + :quantity 100, + :ticker-y "GOOG", + :bid 720.5, + :ask 720.93}] + {:on :time + :by :ticker + :suffixes [:-x :-y]} [{:time (c/->datetime "2016-05-25 13:30:00.023000"), + :ticker "MSFT", + :price 51.95, + :quantity 75, + :bid 51.95, + :ask 51.96} + {:time (c/->datetime "2016-05-25 13:30:00.038000"), + :ticker "MSFT", + :price 51.95, + :quantity 155, + :bid 51.97, + :ask 51.98} + {:time (c/->datetime "2016-05-25 13:30:00.048000"), + :ticker "GOOG", + :price 720.77, + :quantity 100, + :bid 720.5, + :ask 720.93} + {:time (c/->datetime "2016-05-25 13:30:00.048000"), + :ticker "AAPL", + :price 98.0, + :quantity 100, + :bid ##NaN, + :ask ##NaN}]))) + +(deftest concatenate + (are [d o do] + (m/same? + (r/concatenate [(g/data-frame {:a [1 2 3] + :b [4 5 6]}) + (g/data-frame {:a [2 2 2] + :b [3 3 3]})] d) + (g/data-frame o do)) + + {} [{:a 1, :b 4} {:a 2, :b 5} {:a 3, :b 6} + {:a 2, :b 3} {:a 2, :b 3} {:a 2, :b 3}] {:index [0 1 2 0 1 2]} + + {:axis 1} [[1 4 2 3] [2 5 2 3] [3 6 2 3]] {:columns [:a :b :a :b]} + + {:axis 1 + :ignore-index true} [{0 1, 1 4, 2 2, 3 3} + {0 2, 1 5, 2 2, 3 3} + {0 3, 1 6, 2 2, 3 3}] {})) + +(deftest aggregate + (are [v d o] + (m/same? + (r/aggregate (g/data-frame [[1, 2, 3], + [4, 5, 6], + [7, 8, 9], + [##NaN, ##NaN, ##NaN]] + {:columns [:A :B :C]}) v d) + o) + + :sum {} (g/series [12.0 15 18] {:index [:A :B :C]}) + + [:sum :min] {} (g/data-frame + {:A [12.0 1] :B [15.0 2] :C [18.0 3]} + {:index [:sum :min]}) + + :sum {:axis 1} (g/series [6.0 15 24 0]))) + +(deftest remap + (are [in mpgs ign o] + (m/same? + (r/remap + (g/series in) + mpgs ign) + o) + [:a :b :c] {:a 1 :b 2 :c 3} nil (g/series [1 2 3]) + [:a :b ##NaN] #(str "Test " %) :ignore (g/series ["Test a" "Test b" ##NaN]))) + +(deftest groupby + (are [d f o] + (m/same? 
+ (-> (g/data-frame {:animal [:falcon :falcon :parrot :parrot] + :max-speed [380 370 24 26]}) + (r/groupby :animal d) + f) + o) + + {} m/mean (g/data-frame {:max-speed [375 25]} + {:index (g/series [:falcon :parrot] {:name :animal})}) + + {:as-index false} m/mean (g/data-frame [{:animal "falcon" :max-speed 375} + {:animal "parrot" :max-speed 25}]) + + {} m/std (g/data-frame [{:max-speed 7.0710678118654755} + {:max-speed 1.4142135623730951}] + {:index (g/series [:falcon :parrot] {:name :animal})}))) + +(deftest rolling + (are [w d o] + (m/same? + (-> (g/data-frame {:b [0 1 2 3 4]} + {:index + (panthera.pandas.conversion/->datetime + (g/series + ["20130101 09:00:00" + "20130101 09:00:02" + "20130101 09:00:03" + "20130101 09:00:05" + "20130101 09:00:06"]))}) + (r/rolling w d) + m/sum) + (g/data-frame o + {:index + (panthera.pandas.conversion/->datetime + (g/series + ["20130101 09:00:00" + "20130101 09:00:02" + "20130101 09:00:03" + "20130101 09:00:05" + "20130101 09:00:06"]))})) + 2 {} [{:b ##NaN} {:b 1.0} {:b 3.0} {:b 5.0} {:b 7.0}] + :2s {} [{:b 0.0} {:b 1.0} {:b 3.0} {:b 3.0} {:b 7.0}] + 2 {:win-type :triang} [{:b ##NaN} {:b 0.5} {:b 1.5} {:b 2.5} {:b 3.5}] + 2 {:min-periods 1} [{:b 0.0} {:b 1.0} {:b 3.0} {:b 5.0} {:b 7.0}])) + +(deftest ewm + (are [d o] + (m/same? + (-> (g/data-frame {:b [0 1 2 ##NaN 4]}) + (r/ewm d) + m/mean) + (g/data-frame o)) + + {:com 0.5} [{:b 0.0} + {:b 0.7499999999999999} + {:b 1.6153846153846152} + {:b 1.6153846153846152} + {:b 3.670212765957447}] + + {:span 2} [{:b 0.0} + {:b 0.7499999999999999} + {:b 1.6153846153846152} + {:b 1.6153846153846152} + {:b 3.670212765957447}] + + {:com 0.5 :ignore-na true} [{:b 0.0} + {:b 0.7499999999999999} + {:b 1.6153846153846152} + {:b 1.6153846153846152} + {:b 3.2249999999999996}])) + +(deftest dropna + (are [s o d] + (m/same? + (-> (g/series s) + r/dropna) + (g/series o d)) + [] [] {} + [1 nil 2] [1.0 2.0] {:index [0 2]}) + + (are [att out opt] + (m/same? 
+ (-> (g/data-frame {:name ["Alfred" "Batman" "Robin"] + :toy [nil "Batmobile" "Whip"] + :born [nil "1940-04-25" nil]}) + (r/dropna att)) + (g/data-frame out opt)) + {} [{:name "Batman", :toy "Batmobile", :born "1940-04-25"}] {:index [1]} + {:axis 1} [{:name "Alfred"} {:name "Batman"} {:name "Robin"}] {} + {:how :all} [{:name "Alfred", :toy nil, :born nil} + {:name "Batman", :toy "Batmobile", :born "1940-04-25"} + {:name "Robin", :toy "Whip", :born nil}] {} + {:thresh 2} [{:name "Batman", :toy "Batmobile", :born "1940-04-25"} + {:name "Robin", :toy "Whip", :born nil}] {:index [1 2]} + {:subset [:toy]} [{:name "Batman", :toy "Batmobile", :born "1940-04-25"} + {:name "Robin", :toy "Whip", :born nil}] {:index [1 2]})) + +(deftest drop + (are [l d o df] + (m/same? + (r/drop + (g/data-frame + (py/$a (np/arange 12) np/reshape [3 4]) + {:columns [:A :B :C :D]}) l d) + (g/data-frame o df)) + [:B :C] {:axis 1} [{:A 0 :D 3} {:A 4 :D 7} {:A 8 :D 11}] {} + [0 1] {} [{"A" 8 "B" 9 "C" 10 "D" 11}] {:index [2]})) + +(deftest melt + (are [d o df] + (m/same? + (r/melt + (r/transpose + (g/data-frame [[:a :b :c] [1 3 5] [2 4 6]] + {:columns [0 1 2] + :index [:A :B :C]})) + d) + (g/data-frame o df)) + + {} [{:variable "A", :value "a"} + {:variable "A", :value "b"} + {:variable "A", :value "c"} + {:variable "B", :value 1} + {:variable "B", :value 3} + {:variable "B", :value 5} + {:variable "C", :value 2} + {:variable "C", :value 4} + {:variable "C", :value 6}] {} + + {:id-vars [:A] + :value-vars [:B]} [{:A "a", :variable "B", :value 1} + {:A "b", :variable "B", :value 3} + {:A "c", :variable "B", :value 5}] {:dtype np/object})) + +(deftest assign + (are [i o d] + (m/same? 
+ (-> (g/data-frame [[:a 1 2] [:b 3 4] [:c 5 6]] + {:columns [:A :B :C]}) + (r/assign i)) + (g/data-frame o d)) + + {:D 3} [{:A "a", :B 1, :C 2, :D 3} + {:A "b", :B 3, :C 4, :D 3} + {:A "c", :B 5, :C 6, :D 3}] {} + + {:D [1 2 3]} [{:A "a", :B 1, :C 2, :D 1} + {:A "b", :B 3, :C 4, :D 2} + {:A "c", :B 5, :C 6, :D 3}] {} + + {:D #(-> (g/subset-cols % :C) + (m/mul 2))} [{:A "a", :B 1, :C 2, :D 4} + {:A "b", :B 3, :C 4, :D 8} + {:A "c", :B 5, :C 6, :D 12}] {})) + +(deftest stack + (is (m/same? + (r/stack + (g/data-frame [[0 1] [2 3]] + {:index [:cat :dog] + :columns [:weight :height]})) + (g/series [0 1 2 3] + {:index [[:cat :cat :dog :dog] + [:weight :height :weight :height]]})))) + +(deftest unstack + (are [d o df] + (m/same? + (r/unstack + (r/stack + (g/data-frame [[1 2] [3 4]] + {:index [:one :two] + :columns [:a :b]})) d) + (g/data-frame o df)) + {} [{:a 1 :b 2} {:a 3 :b 4}] {:index [:one :two]} + {:level 0} [{:one 1, :two 3} {:one 2, :two 4}] {:index [:a :b]})) + +(deftest transpose + (is (m/same? + (r/transpose (g/data-frame [[1 2 3] [4 5 6] [7 8 9]])) + (g/data-frame [[1 4 7] [2 5 8] [3 6 9]])))) diff --git a/test/panthera/utils_test.clj b/test/panthera/utils_test.clj index 4e6ed6d..f016a87 100644 --- a/test/panthera/utils_test.clj +++ b/test/panthera/utils_test.clj @@ -1,59 +1,72 @@ -(ns panthera.utils-test - (:require - [clojure.test :refer :all] - [libpython-clj.python :as py] - [panthera.pandas.utils :as u])) - -(deftest pytype - (are [t d] - (identical? t (u/pytype d)) - :list (py/->py-list []) - :list (py/->py-list [-1]) - :list (py/->py-list [1 2 3]) - :list (py/->py-list [[1 2] [3 4]]) - :tuple (py/->py-tuple []) - :tuple (py/->py-tuple [0]) - :tuple (py/->py-tuple [1 2 3]) - :tuple (py/->py-tuple [[1 2] [3 4]]) - :dict (py/->py-dict {}) - :dict (py/->py-dict {:a 1 :b "2" :c [1 2 3]}) - :dict (py/->py-dict {"a" 1}))) - -(deftest slice - (are [d] - (identical? 
:slice (u/pytype (apply u/slice d))) - [] - [nil] - [1] - [1 2] - [1 2 3] - [3 7 2]) - (are [s res] - (= (py/->jvm - (py/get-item - (py/->py-list (range 4)) s)) res) - (u/slice) (vec (range 4)) - (u/slice 2) [0 1] - (u/slice 1 3) [1 2] - (u/slice -1) [0 1 2] - (u/slice 0 5 2) [0 2])) - -(deftest keys->pyargs - (are [i o] - (= (u/keys->pyargs i) o) - {} {} - {:a 1} {"a" 1} - {:a 1 :b 2} {"a" 1 "b" 2} - {:a-k 1} {"a_k" 1} - {(keyword "with spaces") 1} {"with_spaces" 1})) - -(deftest ->clj - (is (= (u/->clj - (py/call-attr u/pd "DataFrame" [{:a 1 :b 2} {:a 3 :b 4}])) - [{:a 1 :b 2} {:a 3 :b 4}])) - (is (= (u/->clj - (py/call-attr u/pd "Series" [1 2 3])) - [{:unnamed 1} {:unnamed 2} {:unnamed 3}])) - (is (= (u/->clj - (py/call-attr-kw u/pd "Series" [[1 2 3]] {"name" "test"})) - [{:test 1} {:test 2} {:test 3}]))) \ No newline at end of file +(ns panthera.utils-test + (:require + [clojure.test :refer :all] + [panthera.config :refer [start-python!]] + [libpython-clj.python :as py] + [panthera.pandas.utils :as u])) + +(use-fixtures :once start-python!) + +(deftest pytype + (are [t d] + (identical? t (u/pytype d)) + :list (py/->py-list []) + :list (py/->py-list [-1]) + :list (py/->py-list [1 2 3]) + :list (py/->py-list [[1 2] [3 4]]) + :tuple (py/->py-tuple []) + :tuple (py/->py-tuple [0]) + :tuple (py/->py-tuple [1 2 3]) + :tuple (py/->py-tuple [[1 2] [3 4]]) + :dict (py/->py-dict {}) + :dict (py/->py-dict {:a 1 :b "2" :c [1 2 3]}) + :dict (py/->py-dict {"a" 1}))) + +(deftest slice + (are [d] + (identical? 
:slice (u/pytype (apply u/slice d))) + [] + [nil] + [1] + [1 2] + [1 2 3] + [3 7 2]) + (are [s res] + (= (py/->jvm + (py/get-item + (py/->py-list (range 4)) s)) res) + (u/slice) (vec (range 4)) + (u/slice 2) [0 1] + (u/slice 1 3) [1 2] + (u/slice -1) [0 1 2] + (u/slice 0 5 2) [0 2])) + +(deftest keys->pyargs + (are [i o] + (= (u/keys->pyargs i) o) + {} {} + {:a 1} {"a" 1} + {:a 1 :b 2} {"a" 1 "b" 2} + {:a-k 1} {"a_k" 1})) + +(deftest memo-columns-converter + (are [i o] + (= (u/memo-columns-converter i) o) + 1 1 + nil nil + "a" :a + "col_1" :col-1 + ["multi" "col"] [:multi :col] + "ALL_CAPS" :ALL-CAPS + "WeIrD_caPs" :WeIrD-caPs)) + +(deftest ->clj + (is (= (u/->clj + (py/call-attr u/pd "DataFrame" [{:a 1 :b 2} {:a 3 :b 4}])) + [{:a 1 :b 2} {:a 3 :b 4}])) + (is (= (u/->clj + (py/call-attr u/pd "Series" [1 2 3])) + [{:unnamed 1} {:unnamed 2} {:unnamed 3}])) + (is (= (u/->clj + (py/call-attr-kw u/pd "Series" [[1 2 3]] {"name" "test"})) + [{:test 1} {:test 2} {:test 3}])))