From 164e887ac802008f31d9304a0f4626c7bd2b4328 Mon Sep 17 00:00:00 2001 From: Alan Marazzi Date: Wed, 27 Nov 2019 18:12:15 +0100 Subject: [PATCH 01/12] Add crosstab doc and start its test --- project.clj | 5 +-- src/panthera/pandas/reshape.clj | 66 +++++++++++++++++++++++++++++++-- test/panthera/reshape_test.clj | 10 +++++ 3 files changed, 75 insertions(+), 6 deletions(-) create mode 100644 test/panthera/reshape_test.clj diff --git a/project.clj b/project.clj index 0a75a1d..b48a063 100644 --- a/project.clj +++ b/project.clj @@ -4,8 +4,7 @@ :scm {:name "git" :url "https://github.com/alanmarazzi/panthera"} :license {:name "EPL-2.0" :url "https://www.eclipse.org/legal/epl-2.0/"} - :dependencies [[cnuernber/libpython-clj "1.0"] + :dependencies [[cnuernber/libpython-clj "1.13"] [org.clojure/core.memoize "0.7.2"]] :profiles {:dev {:dependencies [[org.clojure/clojure "1.10.1"]] - :plugins [[refactor-nrepl "2.4.0"] - [cider/cider-nrepl "0.18.0"]]}}) + :resource-paths ["/home/alan/miniconda3/envs/panthera"]}}) diff --git a/src/panthera/pandas/reshape.clj b/src/panthera/pandas/reshape.clj index 04f22fd..1b084b4 100644 --- a/src/panthera/pandas/reshape.clj +++ b/src/panthera/pandas/reshape.clj @@ -4,8 +4,68 @@ [panthera.pandas.utils :as u])) (defn crosstab - [df-or-srs & [attrs]] - (u/kw-call u/pd "crosstab" df-or-srs attrs)) + "Compute a cross tabulation of two (or more) factors. By default + computes a frequency table of the factors unless an array of values and an + aggregation function are passed. + + **Arguments** + + - `seq-or-srs` -> seqable, series + + **Attrs** + + - `:columns` -> Iterable, series, Iterable of Iter/srs: values to group by + - `:values` -> Iterable, series, Iterable of Iter/srs: values to group + according to factors, requires `:aggfunc` + - `:rownames` -> Iterable, series: the names of `seq-or-srs` + - `:colnames` -> Iterable, series: the names of `:columns` + - `:aggfunc` -> function, keyword, str: the aggregation function, requires + `:values`. 
It can be a panthera function (pt/sum), a numpy function (npy :sum), + the name of a numpy function (:mean or \"mean\") or a Clojure function. In the + latter case be aware that you have to reduce over a map. + - `:margins` -> bool, default false: add subtotals + - `:margins_name`: str, default \"All\": name of the row/column holding totals + when `:margins` true + - `:dropna` -> bool, default true: exclude columns with all missing values + - `:normalize` -> bool, {\"all\" \"index\" \"columns\"}, {0 1}, default false: + normalize by dividing all values by the sum of values + + **Examples** + + ``` + (crosstab [[1 2 2]] {:columns [[:a :b :a]]}) + ;; col_0 a b + ;; row_0 + ;; 1 1 0 + ;; 2 1 1 + + (crosstab [[1 2 2]] {:columns [[:a :b :a]] + :rownames [:myrows] + :colnames [:mycols]}) + ;; mycols a b + ;; myrows + ;; 1 1 0 + ;; 2 1 1 + + (crosstab [[1 2 2]] {:columns [[:a :b :b]] + :values [10 20 30] + :aggfunc :mean}) + ;; col_0 a b + ;; row_0 + ;; 1 10.0 NaN + ;; 2 NaN 25.0 + + (crosstab [[1 2 2]] {:columns [[:a :b :a]] + :margins true}) + ;; col_0 a b All + ;; row_0 + ;; 1 1 0 1 + ;; 2 1 1 2 + ;; All 2 1 3 + ``` + " + [seq-or-srs & [attrs]] + (u/kw-call u/pd "crosstab" seq-or-srs attrs)) (defn pivot [df & [attrs]] @@ -80,4 +140,4 @@ (defn transpose "Transpose the given panthera object" [df-or-srs] - (py/call-attr df-or-srs "transpose")) \ No newline at end of file + (py/call-attr df-or-srs "transpose")) diff --git a/test/panthera/reshape_test.clj b/test/panthera/reshape_test.clj new file mode 100644 index 0000000..9eb5e77 --- /dev/null +++ b/test/panthera/reshape_test.clj @@ -0,0 +1,10 @@ +(ns panthera.reshape-test + (:require + [clojure.test :refer :all] + [libpython-clj.python :as py] + [panthera.pandas.utils :as u] + [panthera.pandas.generics :as g])) + +(deftest crosstab + (are [r c o] + (= (u/->clj )))) From d48f6e571d366ce093dcb0d90016e2a343435bd8 Mon Sep 17 00:00:00 2001 From: Alan Marazzi Date: Fri, 6 Dec 2019 11:41:46 +0100 Subject: [PATCH 02/12] Add first 
reshape tests & docs --- project.clj | 2 +- src/panthera/numpy.clj | 2 +- src/panthera/pandas/generics.clj | 5 +- src/panthera/pandas/math.clj | 7 + src/panthera/pandas/reshape.clj | 244 +++++++++++++++++++++++++++++-- src/panthera/pandas/utils.clj | 10 +- src/panthera/panthera.clj | 241 +++++++++++++++--------------- test/panthera/generics_test.clj | 2 +- test/panthera/reshape_test.clj | 113 +++++++++++++- test/panthera/utils_test.clj | 11 +- 10 files changed, 489 insertions(+), 148 deletions(-) diff --git a/project.clj b/project.clj index b48a063..d30f614 100644 --- a/project.clj +++ b/project.clj @@ -4,7 +4,7 @@ :scm {:name "git" :url "https://github.com/alanmarazzi/panthera"} :license {:name "EPL-2.0" :url "https://www.eclipse.org/legal/epl-2.0/"} - :dependencies [[cnuernber/libpython-clj "1.13"] + :dependencies [[cnuernber/libpython-clj "1.15"] [org.clojure/core.memoize "0.7.2"]] :profiles {:dev {:dependencies [[org.clojure/clojure "1.10.1"]] :resource-paths ["/home/alan/miniconda3/envs/panthera"]}}) diff --git a/src/panthera/numpy.clj b/src/panthera/numpy.clj index ead1931..4113746 100644 --- a/src/panthera/numpy.clj +++ b/src/panthera/numpy.clj @@ -47,7 +47,7 @@ (py/get-attr py-module (u/memo-key-converter x)))) ([attrs] (if (seqable? x) - (let [ks (map u/memo-key-converter x)] + (let [ks (map u/memo-key-converter x)] (py/call-attr-kw (py-get-in py-module ks) (last ks) (vec (:args attrs)) (u/keys->pyargs (dissoc attrs :args)))) diff --git a/src/panthera/pandas/generics.clj b/src/panthera/pandas/generics.clj index 2833284..9ff601d 100644 --- a/src/panthera/pandas/generics.clj +++ b/src/panthera/pandas/generics.clj @@ -45,7 +45,10 @@ (series [\"1.3\" \"3.0\"] {:dtype :float32}) ```" [data & [attrs]] - (u/kw-call u/pd "Series" data attrs)) + (u/kw-call u/pd "Series" data attrs) + ;[& [attrs]] + ;((py/$. 
u/pd Series) attrs) + ) (defn data-frame "Creates a panthera data-frame, the underlying backend is a diff --git a/src/panthera/pandas/math.clj b/src/panthera/pandas/math.clj index b6a05cd..1d47f73 100644 --- a/src/panthera/pandas/math.clj +++ b/src/panthera/pandas/math.clj @@ -5,6 +5,13 @@ [libpython-clj.python :as py] [panthera.pandas.utils :as u])) +(defn same? + "This works differently than `eq`: the latter checks equality + value by value, `same?` checks that the given `series`es or `data-frame`s contain + the same exact values. This works even with missing values." + [left right] + (py/call-attr left :equals right)) + (defn- base-math [k] (fn [& args] diff --git a/src/panthera/pandas/reshape.clj b/src/panthera/pandas/reshape.clj index 1b084b4..84c2808 100644 --- a/src/panthera/pandas/reshape.clj +++ b/src/panthera/pandas/reshape.clj @@ -1,7 +1,10 @@ (ns panthera.pandas.reshape (:require - [libpython-clj.python :as py] - [panthera.pandas.utils :as u])) + [libpython-clj.python :as py] + [panthera.pandas.utils :as u] + [panthera.pandas.generics :as g] + + [libpython-clj.python.protocols :as p])) (defn crosstab "Compute a cross tabulation of two (or more) factors. By default @@ -10,24 +13,24 @@ **Arguments** - - `seq-or-srs` -> seqable, series + - `seq-or-srs` -> seqable, `series` **Attrs** - - `:columns` -> Iterable, series, Iterable of Iter/srs: values to group by - - `:values` -> Iterable, series, Iterable of Iter/srs: values to group + - `:columns` -> Iterable, `series`, Iterable of Iter/srs: values to group by + - `:values` -> Iterable, `series`, Iterable of Iter/srs: values to group according to factors, requires `:aggfunc` - - `:rownames` -> Iterable, series: the names of `seq-or-srs` - - `:colnames` -> Iterable, series: the names of `:columns` + - `:rownames` -> Iterable, `series`: the names of `seq-or-srs` + - `:colnames` -> Iterable, `series`: the names of `:columns` - `:aggfunc` -> function, keyword, str: the aggregation function, requires - `:values`. 
It can be a panthera function (pt/sum), a numpy function (npy :sum), - the name of a numpy function (:mean or \"mean\") or a Clojure function. In the + `:values`. It can be a panthera function (`sum`), a numpy function (`(npy :sum)`), + the name of a numpy function (`:mean` or \"mean\") or a Clojure function. In the latter case be aware that you have to reduce over a map. - - `:margins` -> bool, default false: add subtotals + - `:margins` -> bool, default `false`: add subtotals - `:margins_name`: str, default \"All\": name of the row/column holding totals when `:margins` true - - `:dropna` -> bool, default true: exclude columns with all missing values - - `:normalize` -> bool, {\"all\" \"index\" \"columns\"}, {0 1}, default false: + - `:dropna` -> bool, default `true`: exclude columns with all missing values + - `:normalize` -> bool, {`:all` `:index` `columns`}, {0 1}, default `false`: normalize by dividing all values by the sum of values **Examples** @@ -68,20 +71,229 @@ (u/kw-call u/pd "crosstab" seq-or-srs attrs)) (defn pivot + "Returns a stacked `data-frame`: basically changes it from long format to wide. + + **Arguments** + + - `df` -> `data-frame` + + **Attrs** + + - `:index` -> str, keyword, default `nil`: the column to use as the new index. + When `nil` uses the current one + - `:columns` -> str, keyword: columns to use for the new `data-frame` + - `:values` -> str, keyword, Iterable, default `nil`: columns to use to populate + values. 
If `nil` all remaining columns will be used + + **Examples** + + ``` + (def df (data-frame {:foo [:one :one :one :two :two :two] + :bar [:a :b :c :a :b :c] + :baz [1 2 3 4 5 6] + :zoo [:x :y :z :q :w :t]})) + + (pivot df {:columns :bar :index :foo}) + ;; baz zoo + ;; bar a b c a b c + ;; foo + ;; one 1 2 3 x y z + ;; two 4 5 6 q w t + + (pivot df {:index :foo :columns :bar :values [:baz :zoo]}) + ;; baz zoo + ;; bar a b c a b c + ;; foo + ;; one 1 2 3 x y z + ;; two 4 5 6 q w t + ``` + " [df & [attrs]] (u/simple-kw-call df "pivot" attrs)) (defn cut - [data-or-srs bins & [attrs]] - (py/call-attr-kw u/pd "cut" [data-or-srs bins] + "Bin the given values into categories. + + Use this when you want to go from continuous values to ordered categories. For + example, you could go from age to age ranges. + + N.B.: `cut` converts your values to a [`Categorical`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Categorical.html#pandas.Categorical) type. This + means that you can choose whether you want a label back or just the new value. + + **Arguments** + + - `seq-or-srs` -> seqable or `series` + - `bins` -> int, Iterable, `series`: how to bin the data. If int defines the number + of equal-width bins, otherwise values are treated as bins edges + + **Attrs** + + - `:right` -> bool, default `true`: include the rightmost edge? + - `:labels` -> Iterable, bool: if Iterable, specifies the labels for the bins, + if false it doesn't return the labels, only the values (**N.B.: the suggestion + is to work with `{:labels false}` as much as possible, especially if you have to + convert things to Clojure at some point**) + - `:retbins` -> bool, default `false`: return bins? + - `:precision` -> int, default 3: the precision of the bins labels + - `:include-lowest` -> bool, default `false`: should the first interval be left-inclusive? 
+ - `:duplicates` -> {`:raise`, `:drop`, `nil`}: if bin edges are not unique, + raise error or drop non-uniques + + **Examples** + + ``` + (def s (series [1 7 5 4 6 3])) + + (cut s 3) + ;; 0 (0.994, 3.0] + ;; 1 (5.0, 7.0] + ;; 2 (3.0, 5.0] + ;; 3 (3.0, 5.0] + ;; 4 (5.0, 7.0] + ;; 5 (0.994, 3.0] + ;; dtype: category + ;; Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] < (5.0, 7.0]] + + (cut s [3 5 7]) + ;; 0 NaN + ;; 1 (5.0, 7.0] + ;; 2 (3.0, 5.0] + ;; 3 (3.0, 5.0] + ;; 4 (5.0, 7.0] + ;; 5 NaN + ;; dtype: category + ;; Categories (2, interval[int64]): [(3, 5] < (5, 7]] + + (cut s 3 {:labels false}) + ;; 0 0 + ;; 1 2 + ;; 2 1 + ;; 3 1 + ;; 4 2 + ;; 5 0 + ;; dtype: int64 + ``` + " + [seq-or-srs bins & [attrs]] + (py/call-attr-kw u/pd "cut" [seq-or-srs bins] (u/keys->pyargs attrs))) (defn qcut - [data-or-srs q & [attrs]] - (py/call-attr-kw u/pd "qcut" [data-or-srs q] + "Bin values into quantiles. + + The same as `cut`, but categories are quantiles. + + **Arguments** + + - `seq-or-srs` -> seqable or `series` + - `q` -> int, Iterable: either number of quantiles or Iterable of quantiles + + **Attrs** + + - `:labels` -> Iterable, bool: if Iterable, specifies the labels for the bins, + if false it doesn't return the labels, only the values (**N.B.: the suggestion + is to work with `{:labels false}` as much as possible, especially if you have to + convert things to Clojure at some point**) + - `:retbins` -> bool, default `false`: return bins? 
+ - `:precision` -> int, default 3: the precision of the bins labels + - `:duplicates` -> {`:raise`, `:drop`, `nil`}: if bin edges are not unique, + raise error or drop non-uniques + + **Examples** + + ``` + (qcut (range 5) 4) + ;; [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]] + ;; Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] < (2.0, 3.0] < (3.0, 4.0]] + + (qcut (range 5) 3 {:labels [:low :medium :high]}) + ;; [low, low, medium, high, high] + ;; Categories (3, object): [low < medium < high] + + (qcut (range 5) 3 {:labels false}) + ;; [0 0 1 2 2] + ``` + " + [seq-or-srs q & [attrs]] + (py/call-attr-kw u/pd "qcut" [seq-or-srs q] (u/keys->pyargs attrs))) (defn merge-ordered + "Merge two `data-frame`s together, with facilities to deal with ordered data. + + **Arguments** + + - `left` -> `data-frame` + - `right` -> `data-frame` + + **Attrs** + + - `:on` -> str, keyword, Iterable: column names to be joined on. They must be the + same in both `left` and `right` + - `:left-on` -> str, keyword, Iterable, `series`: columns to join on the `left`, + use this if you have different columns names + - `:right-on` -> str, keyword, Iterable, `series`: columns to join on the `right`, + use this if you have different columns names + - `:left-by` -> str, keyword, Iterable, `series`: groupby `left` on the given + columns and then join piece by piece + - `:right-by` -> str, keyword, Iterable, `series`: groupby `right` on the given + columns and then join piece by piece + - `:fill-method` -> {`:ffill` `nil`}, default `nil`: forward fill missing data + - `:suffixes` -> Iterable, default [`:_x` `:_y`]: the suffixes to add to overlapping + column names + - `:how` -> {`:left` `:right` `:outer` `:inner`}, default `:outer`: kind of join + + **Examples** + ``` + (def A + (data-frame + {:key [:a :c :e :a] + :lvalue [1 2 3 1] + :group [:a :a :a :b]})) + + (def B + (data-frame + {:key [:b :c :d] + :rvalue [1 2 3]})) + + (merge-ordered A B) + ;; key lvalue group 
rvalue + ;; 0 a 1.0 a NaN + ;; 1 a 1.0 b NaN + ;; 2 b NaN NaN 1.0 + ;; 3 c 2.0 a 2.0 + ;; 4 d NaN NaN 3.0 + ;; 5 e 3.0 a NaN + + (merge-ordered A B {:fill-method :ffill}) + ;; key lvalue group rvalue + ;; 0 a 1 a NaN + ;; 1 a 1 b NaN + ;; 2 b 1 b 1.0 + ;; 3 c 2 a 2.0 + ;; 4 d 2 a 3.0 + ;; 5 e 3 a 3.0 + + (merge-ordered A B {:fill-method :ffill :left-by \"group\"}) + ;; key lvalue group rvalue + ;; 0 a 1 a NaN + ;; 1 b 1 a 1.0 + ;; 2 c 2 a 2.0 + ;; 3 d 2 a 3.0 + ;; 4 e 3 a 3.0 + ;; 5 a 1 b NaN + ;; 6 b 1 b 1.0 + ;; 7 c 1 b 2.0 + ;; 8 d 1 b 3.0 + + (merge-ordered A B {:left-on :lvalue :right-on :rvalue}) + ;; key_x lvalue group key_y rvalue + ;; 0 a 1 a b 1 + ;; 1 a 1 b b 1 + ;; 2 c 2 a c 2 + ;; 3 e 3 a d 3 + ``` + " [left right & [attrs]] (py/call-attr-kw u/pd "merge_ordered" [left right] (u/keys->pyargs attrs))) diff --git a/src/panthera/pandas/utils.clj b/src/panthera/pandas/utils.clj index c35e932..2cb7b2c 100644 --- a/src/panthera/pandas/utils.clj +++ b/src/panthera/pandas/utils.clj @@ -79,9 +79,11 @@ (memo-columns-converter \"ALL_CAPS\") ; :all-caps ```" (m/fifo - #(if (number? %) - % - (csk/->kebab-case-keyword %)) {} :fifo/threshold 512)) + #(cond + (number? %) % + (string? %) (csk/->kebab-case-keyword %) + (nil? %) nil + :else (mapv csk/->kebab-case-keyword %)) {} :fifo/threshold 512)) (defn vec->pylist "Converts an iterable Clojure data structure to a Python list @@ -187,7 +189,7 @@ (if (series? df-or-srs) (let [nm (memo-columns-converter (or (py/get-attr df-or-srs "name") - :unnamed))] + "unnamed"))] (into [] (map #(assoc {} nm %)) (vec df-or-srs))) (let [ks (map memo-columns-converter diff --git a/src/panthera/panthera.clj b/src/panthera/panthera.clj index a379a66..df25a53 100644 --- a/src/panthera/panthera.clj +++ b/src/panthera/panthera.clj @@ -10,131 +10,132 @@ [panthera.pandas.reshape])) (export-symbols - panthera.pandas.generics - n-rows - one-hot - hasnans? - swap-level - cross-section - n-unique - n-smallest - any? 
- subset-cols - n-largest - names - read-csv - select-rows - unique - filter-rows - dtype - value-counts - index - series - all? - read-excel - set-index - to-csv - data-frame - subset-rows - decreasing? - n-cols - head - increasing? - memory-usage - values - tail - reset-index - unique? - not-na? - shape - fill-na - nbytes - ftype) + panthera.pandas.generics + n-rows + one-hot + hasnans? + swap-level + cross-section + n-unique + n-smallest + any? + subset-cols + n-largest + names + read-csv + select-rows + unique + filter-rows + dtype + value-counts + index + series + all? + read-excel + set-index + to-csv + data-frame + subset-rows + decreasing? + n-cols + head + increasing? + memory-usage + values + tail + reset-index + unique? + not-na? + shape + fill-na + nbytes + ftype) (export-symbols - panthera.pandas.math - dot - ne - quantile - kurtosis - lt - std - le - add - sum - diff - ge - cumprod - clip - cumsum - eq - compound - mean - corr - sub - mod - pow - skew - rank - maximum - mode - between - pct-change - cummin - cnt - cummax - ops - autocorr - cov - div - round - mul - sem - var - abs - median - gt - minimum - describe - mean-abs-dev - floor-div) + panthera.pandas.math + same? + dot + ne + quantile + kurtosis + lt + std + le + add + sum + diff + ge + cumprod + clip + cumsum + eq + compound + mean + corr + sub + mod + pow + skew + rank + maximum + mode + between + pct-change + cummin + cnt + cummax + ops + autocorr + cov + div + round + mul + sem + var + abs + median + gt + minimum + describe + mean-abs-dev + floor-div) (export-symbols - panthera.pandas.utils - pytype - slice - ->clj - series? - data-frame?) + panthera.pandas.utils + pytype + slice + ->clj + series? + data-frame?) 
(export-symbols - panthera.pandas.conversion - ->timedelta - date-range - astype - ->numeric - timedelta-range - infer-time-freq - ->datetime - interval-range) + panthera.pandas.conversion + ->timedelta + date-range + astype + ->numeric + timedelta-range + infer-time-freq + ->datetime + interval-range) (export-symbols - panthera.pandas.reshape - pivot - aggregate - crosstab - cut - rolling - concatenate - remap - factorize - qcut - merge-ordered - dropna - merge-asof - ewm - groupby - melt - assign - unstack - transpose) + panthera.pandas.reshape + pivot + aggregate + crosstab + cut + rolling + concatenate + remap + factorize + qcut + merge-ordered + dropna + merge-asof + ewm + groupby + melt + assign + unstack + transpose) diff --git a/test/panthera/generics_test.clj b/test/panthera/generics_test.clj index e4750ea..e4efce0 100644 --- a/test/panthera/generics_test.clj +++ b/test/panthera/generics_test.clj @@ -3,7 +3,7 @@ [clojure.test :refer :all] [libpython-clj.python :as py] [panthera.pandas.generics :as g] - [panthera.pandas.utils :as u :reload true] + [panthera.pandas.utils :as u] [panthera.pandas.math :as m])) (deftest series diff --git a/test/panthera/reshape_test.clj b/test/panthera/reshape_test.clj index 9eb5e77..fc22313 100644 --- a/test/panthera/reshape_test.clj +++ b/test/panthera/reshape_test.clj @@ -2,9 +2,116 @@ (:require [clojure.test :refer :all] [libpython-clj.python :as py] - [panthera.pandas.utils :as u] - [panthera.pandas.generics :as g])) + [panthera.pandas.utils :as u :reload true] + [panthera.pandas.generics :as g] + [panthera.pandas.reshape :as r :reload true] + [panthera.pandas.math :as m :reload true])) + +(defn filter-nan + [d] + (into [] (comp (mapcat vals) (filter (complement #(.isNaN %)))) d)) (deftest crosstab (are [r c o] - (= (u/->clj )))) + (= (u/->clj (r/crosstab r {:columns c})) o) + [[]] [[]] [] + [[1 2 2]] [[:a :b :a]] [{:a 1 :b 0} {:a 1 :b 1}] + (g/series [1 2 3]) [[:a :b :a]] [{:a 1 :b 0} {:a 0 :b 1} {:a 1 :b 0}]) + (are [r d 
o] + (= (filter-nan (u/->clj (r/crosstab r d))) o) + [[1 2 2]] {:columns [[:a :b :b]] + :values [10 20 30] + :aggfunc :mean} [10.0 25.0]) + (is (= (u/->clj + (r/crosstab [[1 2 2]] {:columns [[:a :b :a]] :margins true})) + [{:a 1 :b 0 :all 1} + {:a 1 :b 1 :all 2} + {:a 2 :b 1 :all 3}]))) + +(deftest pivot + (are [d o] + (= (u/->clj (r/pivot (g/data-frame {:foo [:one :one :one :two :two :two] + :bar [:a :b :c :a :b :c] + :baz [1 2 3 4 5 6] + :zoo [:x :y :z :q :w :t]}) + d)) o) + + {:columns :bar :index :foo} [{[:baz :a] 1, + [:baz :b] 2, + [:baz :c] 3, + [:zoo :a] "x", + [:zoo :b] "y", + [:zoo :c] "z"} + {[:baz :a] 4, + [:baz :b] 5, + [:baz :c] 6, + [:zoo :a] "q", + [:zoo :b] "w", + [:zoo :c] "t"}] + + {:index :foo :columns :bar :values [:baz :zoo]} [{[:baz :a] 1, + [:baz :b] 2, + [:baz :c] 3, + [:zoo :a] "x", + [:zoo :b] "y", + [:zoo :c] "z"} + {[:baz :a] 4, + [:baz :b] 5, + [:baz :c] 6, + [:zoo :a] "q", + [:zoo :b] "w", + [:zoo :c] "t"}])) + +(deftest cut + (is + (->> (u/->clj (r/cut (g/series [1 7 5 4 6 3]) 3)) + first + vals + first + (m/eq (u/simple-kw-call u/pd "Interval" {:left 0.994 :right 3.0})))) + (are [b d o] + (= (u/->clj (r/cut (g/series [1 7 5 4 6 3]) b d)) o) + 3 {:labels false} [{:unnamed 0} {:unnamed 2} {:unnamed 1} + {:unnamed 1} {:unnamed 2} {:unnamed 0}] + + 3 {:labels [:a :b :c]} [{:unnamed "a"} {:unnamed "c"} {:unnamed "b"} + {:unnamed "b"} {:unnamed "c"} {:unnamed "a"}] + + [0 3 5 7] {:labels false} [{:unnamed 0} {:unnamed 2} {:unnamed 1} + {:unnamed 1} {:unnamed 2} {:unnamed 0}])) + +(deftest qcut + (is + (->> (u/->clj (r/cut (g/series (range 5)) 4)) + first + vals + first + (m/eq (u/simple-kw-call u/pd "Interval" {:left -0.004 :right 1.0})))) + (are [b d o] + (= (u/->clj (r/cut (g/series (range 5)) b d)) o) + 3 {:labels false} [{:unnamed 0} {:unnamed 0} + {:unnamed 1} {:unnamed 2} + {:unnamed 2}] + + 3 {:labels [:low :medium :high]} [{:unnamed "low"} + {:unnamed "low"} + {:unnamed "medium"} + {:unnamed "high"} + {:unnamed "high"}])) + 
+(deftest merge-ordered + (let [a (g/data-frame + {:key [:a :c :e :a] + :lvalue [1 2 3 1] + :group [:a :a :a :b]}) + b (g/data-frame + {:key [:b :c :d] + :rvalue [1 2 3]})] + (are [d o] + (m/same? (r/merge-ordered a b d) (g/data-frame o)) + {} [{:key "a", :lvalue 1.0, :group "a", :rvalue ##NaN} + {:key "a", :lvalue 1.0, :group "b", :rvalue ##NaN} + {:key "b", :lvalue ##NaN, :group ##NaN, :rvalue 1.0} + {:key "c", :lvalue 2.0, :group "a", :rvalue 2.0} + {:key "d", :lvalue ##NaN, :group ##NaN, :rvalue 3.0} + {:key "e", :lvalue 3.0, :group "a", :rvalue ##NaN}]))) diff --git a/test/panthera/utils_test.clj b/test/panthera/utils_test.clj index 4e6ed6d..ce818f9 100644 --- a/test/panthera/utils_test.clj +++ b/test/panthera/utils_test.clj @@ -47,6 +47,15 @@ {:a-k 1} {"a_k" 1} {(keyword "with spaces") 1} {"with_spaces" 1})) +(deftest memo-columns-converter + (are [i o] + (= (u/memo-columns-converter i) o) + 1 1 + nil nil + "a" :a + "col_1" :col-1 + ["multi" "col"] [:multi :col])) + (deftest ->clj (is (= (u/->clj (py/call-attr u/pd "DataFrame" [{:a 1 :b 2} {:a 3 :b 4}])) @@ -56,4 +65,4 @@ [{:unnamed 1} {:unnamed 2} {:unnamed 3}])) (is (= (u/->clj (py/call-attr-kw u/pd "Series" [[1 2 3]] {"name" "test"})) - [{:test 1} {:test 2} {:test 3}]))) \ No newline at end of file + [{:test 1} {:test 2} {:test 3}]))) From 7f70fcf14598f5b8aee0e4c2608dce365da8fd85 Mon Sep 17 00:00:00 2001 From: Alan Marazzi Date: Fri, 13 Dec 2019 10:26:34 +0100 Subject: [PATCH 03/12] Add some reshape tests --- project.clj | 5 +- src/panthera/pandas/reshape.clj | 170 +++++++++++++- src/panthera/pandas/utils.clj | 1 + test/panthera/reshape_test.clj | 392 ++++++++++++++++++++++---------- 4 files changed, 441 insertions(+), 127 deletions(-) diff --git a/project.clj b/project.clj index d30f614..62efb13 100644 --- a/project.clj +++ b/project.clj @@ -1,10 +1,11 @@ -(defproject panthera "0.1-alpha.16" +(defproject panthera "0.1-alpha.17" :description "Data Frames in Clojure (with Pandas) + NumPy" :url 
"https://github.com/alanmarazzi/panthera" :scm {:name "git" :url "https://github.com/alanmarazzi/panthera"} :license {:name "EPL-2.0" :url "https://www.eclipse.org/legal/epl-2.0/"} - :dependencies [[cnuernber/libpython-clj "1.15"] + :dependencies [[cnuernber/libpython-clj "1.20"] [org.clojure/core.memoize "0.7.2"]] :profiles {:dev {:dependencies [[org.clojure/clojure "1.10.1"]] :resource-paths ["/home/alan/miniconda3/envs/panthera"]}}) + diff --git a/src/panthera/pandas/reshape.clj b/src/panthera/pandas/reshape.clj index 84c2808..fbdd4da 100644 --- a/src/panthera/pandas/reshape.clj +++ b/src/panthera/pandas/reshape.clj @@ -27,7 +27,7 @@ the name of a numpy function (`:mean` or \"mean\") or a Clojure function. In the latter case be aware that you have to reduce over a map. - `:margins` -> bool, default `false`: add subtotals - - `:margins_name`: str, default \"All\": name of the row/column holding totals + - `:margins-name`: str, default \"All\": name of the row/column holding totals when `:margins` true - `:dropna` -> bool, default `true`: exclude columns with all missing values - `:normalize` -> bool, {`:all` `:index` `columns`}, {0 1}, default `false`: @@ -67,7 +67,9 @@ ;; All 2 1 3 ``` " - [seq-or-srs & [attrs]] + [seq-or-srs & [{:keys [columns values rownames colnames aggfunc + margins margins-name dropna normalize] + :as attrs}]] (u/kw-call u/pd "crosstab" seq-or-srs attrs)) (defn pivot @@ -108,7 +110,8 @@ ;; two 4 5 6 q w t ``` " - [df & [attrs]] + [df & [{:keys [index columns values] + :as attrs}]] (u/simple-kw-call df "pivot" attrs)) (defn cut @@ -174,7 +177,9 @@ ;; dtype: int64 ``` " - [seq-or-srs bins & [attrs]] + [seq-or-srs bins & [{:keys [right labels retbins precision + include-lowest duplicates] + :as attrs}]] (py/call-attr-kw u/pd "cut" [seq-or-srs bins] (u/keys->pyargs attrs))) @@ -214,7 +219,8 @@ ;; [0 0 1 2 2] ``` " - [seq-or-srs q & [attrs]] + [seq-or-srs q & [{:keys [labels retbins precision duplicates] + :as attrs}]] (py/call-attr-kw u/pd 
"qcut" [seq-or-srs q] (u/keys->pyargs attrs))) @@ -294,17 +300,165 @@ ;; 3 e 3 a d 3 ``` " - [left right & [attrs]] + [left right & [{:keys [on left-on right-on left-by right-by + fill-method suffixes how] + :as attrs}]] (py/call-attr-kw u/pd "merge_ordered" [left right] (u/keys->pyargs attrs))) (defn merge-asof - [left right & [attrs]] + "Similar to a left join, but merges on nearest key rather than equal. + + **Arguments** + + - `left` -> `data-frame`: sorted by key + - `right` -> `data-frame`: sorted by key + + **Attrs** + + - `:on` -> str, keyword: column name to join on. Must be in both `data-frames` and + it must be ordered and numeric (dates, int, etc) + - `:left-on` -> str, keyword: column name to join in left `data-frame`. The + requirements are the same as for `:on` + - `:right-on` -> str, keyword: column name to join in right `data-frame`. The + requirements are the same as for `:on` + - `:left-index` -> bool: index of left `data-frame` is the join key? + - `:right-index` -> bool: index of right `data-frame` is the join key? + - `:by` -> str, keyword, Iterable, `series`: match these columns before merging + - `:left-by` -> str, keyword, Iterable, `series`: as `:by` but only for left `data-frame` + - `:right-by` -> str, keyword, Iterable, `series`: as `:by` but only for right `data-frame` + - `:suffixes` -> Iterable: suffix to add to overlapping column names, must + have length 2 and the first one is `left` and second one is `right` + - `:tolerance` -> depends on key: the tolerance for merging + - `:allow-exact-matches` -> bool, default `true`: allow matching with same `:on` value? 
+ - `:direction` -> {`:backward` `:forward` `:nearest`}, default `:backward`: search for + prior, subsequent or closest matches + + **Examples** + + ``` + (def trades + (data-frame + {:time (->datetime [\"2016-05-25 13:30:00.023\" + \"2016-05-25 13:30:00.038\" + \"2016-05-25 13:30:00.048\" + \"2016-05-25 13:30:00.048\"]) + :ticker [:MSFT :MSFT :GOOG :AAPL] + :price [51.95 51.95 720.77 98.00] + :quantity [75 155 100 100]})) + + (def quotes + (data-frame + {:time (->datetime [\"2016-05-25 13:30:00.023\" + \"2016-05-25 13:30:00.023\" + \"2016-05-25 13:30:00.030\" + \"2016-05-25 13:30:00.048\" + \"2016-05-25 13:30:00.049\"]) + :ticker [:GOOG :MSFT :MSFT :GOOG :AAPL] + :bid [720.5 51.95 51.97 720.5 97.99] + :ask [720.93 51.96 51.98 720.93 98.01]})) + + (merge-asof trades quotes {:on :time}) + ;; time ticker_x price quantity ticker_y bid ask + ;; 0 2016-05-25 13:30:00.023 MSFT 51.95 75 MSFT 51.95 51.96 + ;; 1 2016-05-25 13:30:00.038 MSFT 51.95 155 MSFT 51.97 51.98 + ;; 2 2016-05-25 13:30:00.048 GOOG 720.77 100 GOOG 720.50 720.93 + ;; 3 2016-05-25 13:30:00.048 AAPL 98.00 100 GOOG 720.50 720.93 + + (merge-asof trades quotes {:on :time :allow-exact-matches false}) + ;; time ticker_x price quantity ticker_y bid ask + ;; 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN NaN + ;; 1 2016-05-25 13:30:00.038 MSFT 51.95 155 MSFT 51.97 51.98 + ;; 2 2016-05-25 13:30:00.048 GOOG 720.77 100 MSFT 51.97 51.98 + ;; 3 2016-05-25 13:30:00.048 AAPL 98.00 100 MSFT 51.97 51.98 + + (merge-asof trades quotes {:on :time :direction :forward}) + ;; time ticker_x price quantity ticker_y bid ask + ;; 0 2016-05-25 13:30:00.023 MSFT 51.95 75 GOOG 720.5 720.93 + ;; 1 2016-05-25 13:30:00.038 MSFT 51.95 155 GOOG 720.5 720.93 + ;; 2 2016-05-25 13:30:00.048 GOOG 720.77 100 GOOG 720.5 720.93 + ;; 3 2016-05-25 13:30:00.048 AAPL 98.00 100 GOOG 720.5 720.93 + + (merge-asof trades quotes {:on :time :by :ticker}) + ;; time ticker price quantity bid ask + ;; 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 + ;; 
1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 + ;; 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93 + ;; 3 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN + ``` + " + [left right & [{:keys [on left-on right-on left-index right-index by + left-by right-by suffixes tolerance + allow-exact-matches direction] + :as attrs}]] (py/call-attr-kw u/pd "merge_asof" [left right] (u/keys->pyargs attrs))) (defn concatenate - [dfs-or-srss & [attrs]] + "Append `series`es and/or `data-frame`s along a wanted axis. + + **Arguments** + + - `dfs-or-srss` -> Iterable: a collection of multiple `series`/`data-frame` + + **Attrs** + + - `:axis` -> int, default 0: 0 = rows, 1 = columns + - `:join` -> {`:inner` `:outer`}, default `:outer`: the kind of join on other `:axis` + - `:ignore-index` -> bool, default `false`: whether to consider the index along + the wanted `:axis` + - `:keys` -> Iterable, default `nil`: this lets you build a hierarchical index + using the passed `:keys` as the outermost levels + - `:levels` -> Iterable, default `nil`: unique values for building a multi index + - `:names` -> Iterable, default `nil`: names of the levels in the hierarchical index + - `:verify-integrity` -> bool, default `false`: does the new `:axis` + contain duplicates? 
(P.S.: expensive operation) + - `:sort` -> bool, default `true`: sort the other `:axis` when `:join` is `:outer` + - `:copy` -> bool, default `true`: if `false` avoid copying when unnecessary + + **Examples** + + ``` + (concatenate [(series (range 3)) (series (range 3))]) + ;; 0 0 + ;; 1 1 + ;; 2 2 + ;; 0 0 + ;; 1 1 + ;; 2 2 + ;; dtype: int64 + + (concatenate [(series (range 3)) (series (range 3))] {:axis 1}) + ;; 0 1 + ;; 0 0 0 + ;; 1 1 1 + ;; 2 2 2 + + (concatenate [(data-frame {:a [1 2 3] :b [4 5 6]}) + (data-frame {:a [2 2 2] :b [3 3 3]})]) + ;; a b + ;; 0 1 4 + ;; 1 2 5 + ;; 2 3 6 + ;; 0 2 3 + ;; 1 2 3 + ;; 2 2 3 + + (concatenate [(data-frame {:a [1 2 3] :b [4 5 6]}) + (data-frame {:a [2 2 2] :b [3 3 3]})] + {:ignore-index true}) + ;; a b + ;; 0 1 4 + ;; 1 2 5 + ;; 2 3 6 + ;; 3 2 3 + ;; 4 2 3 + ;; 5 2 3 + ``` + " + [dfs-or-srss & [{:keys [axis join ignore-index keys levels + names verify-integrity sort copy] + :as attrs}]] (u/kw-call u/pd "concat" dfs-or-srss attrs)) (defn factorize diff --git a/src/panthera/pandas/utils.clj b/src/panthera/pandas/utils.clj index 2cb7b2c..81033d7 100644 --- a/src/panthera/pandas/utils.clj +++ b/src/panthera/pandas/utils.clj @@ -1,6 +1,7 @@ (ns panthera.pandas.utils (:require [libpython-clj.python :as py] + [libpython-clj.require :refer [require-python]] [camel-snake-kebab.core :as csk] [camel-snake-kebab.extras :as cske] [clojure.core.memoize :as m])) diff --git a/test/panthera/reshape_test.clj b/test/panthera/reshape_test.clj index fc22313..3f187ed 100644 --- a/test/panthera/reshape_test.clj +++ b/test/panthera/reshape_test.clj @@ -1,117 +1,275 @@ -(ns panthera.reshape-test - (:require - [clojure.test :refer :all] - [libpython-clj.python :as py] - [panthera.pandas.utils :as u :reload true] - [panthera.pandas.generics :as g] - [panthera.pandas.reshape :as r :reload true] - [panthera.pandas.math :as m :reload true])) - -(defn filter-nan - [d] - (into [] (comp (mapcat vals) (filter (complement #(.isNaN %)))) d)) - -(deftest 
crosstab - (are [r c o] - (= (u/->clj (r/crosstab r {:columns c})) o) - [[]] [[]] [] - [[1 2 2]] [[:a :b :a]] [{:a 1 :b 0} {:a 1 :b 1}] - (g/series [1 2 3]) [[:a :b :a]] [{:a 1 :b 0} {:a 0 :b 1} {:a 1 :b 0}]) - (are [r d o] - (= (filter-nan (u/->clj (r/crosstab r d))) o) - [[1 2 2]] {:columns [[:a :b :b]] - :values [10 20 30] - :aggfunc :mean} [10.0 25.0]) - (is (= (u/->clj - (r/crosstab [[1 2 2]] {:columns [[:a :b :a]] :margins true})) - [{:a 1 :b 0 :all 1} - {:a 1 :b 1 :all 2} - {:a 2 :b 1 :all 3}]))) - -(deftest pivot - (are [d o] - (= (u/->clj (r/pivot (g/data-frame {:foo [:one :one :one :two :two :two] - :bar [:a :b :c :a :b :c] - :baz [1 2 3 4 5 6] - :zoo [:x :y :z :q :w :t]}) - d)) o) - - {:columns :bar :index :foo} [{[:baz :a] 1, - [:baz :b] 2, - [:baz :c] 3, - [:zoo :a] "x", - [:zoo :b] "y", - [:zoo :c] "z"} - {[:baz :a] 4, - [:baz :b] 5, - [:baz :c] 6, - [:zoo :a] "q", - [:zoo :b] "w", - [:zoo :c] "t"}] - - {:index :foo :columns :bar :values [:baz :zoo]} [{[:baz :a] 1, - [:baz :b] 2, - [:baz :c] 3, - [:zoo :a] "x", - [:zoo :b] "y", - [:zoo :c] "z"} - {[:baz :a] 4, - [:baz :b] 5, - [:baz :c] 6, - [:zoo :a] "q", - [:zoo :b] "w", - [:zoo :c] "t"}])) - -(deftest cut - (is - (->> (u/->clj (r/cut (g/series [1 7 5 4 6 3]) 3)) - first - vals - first - (m/eq (u/simple-kw-call u/pd "Interval" {:left 0.994 :right 3.0})))) - (are [b d o] - (= (u/->clj (r/cut (g/series [1 7 5 4 6 3]) b d)) o) - 3 {:labels false} [{:unnamed 0} {:unnamed 2} {:unnamed 1} - {:unnamed 1} {:unnamed 2} {:unnamed 0}] - - 3 {:labels [:a :b :c]} [{:unnamed "a"} {:unnamed "c"} {:unnamed "b"} - {:unnamed "b"} {:unnamed "c"} {:unnamed "a"}] - - [0 3 5 7] {:labels false} [{:unnamed 0} {:unnamed 2} {:unnamed 1} - {:unnamed 1} {:unnamed 2} {:unnamed 0}])) - -(deftest qcut - (is - (->> (u/->clj (r/cut (g/series (range 5)) 4)) - first - vals - first - (m/eq (u/simple-kw-call u/pd "Interval" {:left -0.004 :right 1.0})))) - (are [b d o] - (= (u/->clj (r/cut (g/series (range 5)) b d)) o) - 3 {:labels 
false} [{:unnamed 0} {:unnamed 0} - {:unnamed 1} {:unnamed 2} - {:unnamed 2}] - - 3 {:labels [:low :medium :high]} [{:unnamed "low"} - {:unnamed "low"} - {:unnamed "medium"} - {:unnamed "high"} - {:unnamed "high"}])) - -(deftest merge-ordered - (let [a (g/data-frame - {:key [:a :c :e :a] - :lvalue [1 2 3 1] - :group [:a :a :a :b]}) - b (g/data-frame - {:key [:b :c :d] - :rvalue [1 2 3]})] - (are [d o] - (m/same? (r/merge-ordered a b d) (g/data-frame o)) - {} [{:key "a", :lvalue 1.0, :group "a", :rvalue ##NaN} - {:key "a", :lvalue 1.0, :group "b", :rvalue ##NaN} - {:key "b", :lvalue ##NaN, :group ##NaN, :rvalue 1.0} - {:key "c", :lvalue 2.0, :group "a", :rvalue 2.0} - {:key "d", :lvalue ##NaN, :group ##NaN, :rvalue 3.0} - {:key "e", :lvalue 3.0, :group "a", :rvalue ##NaN}]))) +(ns panthera.reshape-test + (:require + [clojure.test :refer :all] + [libpython-clj.python :as py] + [panthera.pandas.utils :as u :reload true] + [panthera.pandas.generics :as g] + [panthera.pandas.reshape :as r :reload true] + [panthera.pandas.math :as m :reload true] + [panthera.pandas.conversion :as c])) + +(defn filter-nan + [d] + (into [] (comp (mapcat vals) (filter (complement #(.isNaN %)))) d)) + +(deftest crosstab + (are [r c o] + (= (u/->clj (r/crosstab r {:columns c})) o) + [[]] [[]] [] + [[1 2 2]] [[:a :b :a]] [{:a 1 :b 0} {:a 1 :b 1}] + (g/series [1 2 3]) [[:a :b :a]] [{:a 1 :b 0} {:a 0 :b 1} {:a 1 :b 0}]) + (are [r d o] + (= (filter-nan (u/->clj (r/crosstab r d))) o) + [[1 2 2]] {:columns [[:a :b :b]] + :values [10 20 30] + :aggfunc :mean} [10.0 25.0]) + (is (= (u/->clj + (r/crosstab [[1 2 2]] {:columns [[:a :b :a]] :margins true})) + [{:a 1 :b 0 :all 1} + {:a 1 :b 1 :all 2} + {:a 2 :b 1 :all 3}]))) + +(deftest pivot + (are [d o] + (= (u/->clj (r/pivot (g/data-frame {:foo [:one :one :one :two :two :two] + :bar [:a :b :c :a :b :c] + :baz [1 2 3 4 5 6] + :zoo [:x :y :z :q :w :t]}) + d)) o) + + {:columns :bar :index :foo} [{[:baz :a] 1, + [:baz :b] 2, + [:baz :c] 3, + [:zoo :a] "x", 
+ [:zoo :b] "y", + [:zoo :c] "z"} + {[:baz :a] 4, + [:baz :b] 5, + [:baz :c] 6, + [:zoo :a] "q", + [:zoo :b] "w", + [:zoo :c] "t"}] + + {:index :foo :columns :bar :values [:baz :zoo]} [{[:baz :a] 1, + [:baz :b] 2, + [:baz :c] 3, + [:zoo :a] "x", + [:zoo :b] "y", + [:zoo :c] "z"} + {[:baz :a] 4, + [:baz :b] 5, + [:baz :c] 6, + [:zoo :a] "q", + [:zoo :b] "w", + [:zoo :c] "t"}])) + +(deftest cut + (is + (->> (u/->clj (r/cut (g/series [1 7 5 4 6 3]) 3)) + first + vals + first + (m/eq (u/simple-kw-call u/pd "Interval" {:left 0.994 :right 3.0})))) + (are [b d o] + (= (u/->clj (r/cut (g/series [1 7 5 4 6 3]) b d)) o) + 3 {:labels false} [{:unnamed 0} {:unnamed 2} {:unnamed 1} + {:unnamed 1} {:unnamed 2} {:unnamed 0}] + + 3 {:labels [:a :b :c]} [{:unnamed "a"} {:unnamed "c"} {:unnamed "b"} + {:unnamed "b"} {:unnamed "c"} {:unnamed "a"}] + + [0 3 5 7] {:labels false} [{:unnamed 0} {:unnamed 2} {:unnamed 1} + {:unnamed 1} {:unnamed 2} {:unnamed 0}])) + +(deftest qcut + (is + (->> (u/->clj (r/cut (g/series (range 5)) 4)) + first + vals + first + (m/eq (u/simple-kw-call u/pd "Interval" {:left -0.004 :right 1.0})))) + (are [b d o] + (= (u/->clj (r/cut (g/series (range 5)) b d)) o) + 3 {:labels false} [{:unnamed 0} {:unnamed 0} + {:unnamed 1} {:unnamed 2} + {:unnamed 2}] + + 3 {:labels [:low :medium :high]} [{:unnamed "low"} + {:unnamed "low"} + {:unnamed "medium"} + {:unnamed "high"} + {:unnamed "high"}])) + +(deftest merge-ordered + (let [a (g/data-frame + {:key [:a :c :e :a] + :lvalue [1 2 3 1] + :group [:a :a :a :b]}) + b (g/data-frame + {:key [:b :c :d] + :rvalue [1 2 3]})] + (are [d o] + (m/same? 
(r/merge-ordered a b d) (g/data-frame o)) + {} [{:key "a", :lvalue 1.0, :group "a", :rvalue ##NaN} + {:key "a", :lvalue 1.0, :group "b", :rvalue ##NaN} + {:key "b", :lvalue ##NaN, :group ##NaN, :rvalue 1.0} + {:key "c", :lvalue 2.0, :group "a", :rvalue 2.0} + {:key "d", :lvalue ##NaN, :group ##NaN, :rvalue 3.0} + {:key "e", :lvalue 3.0, :group "a", :rvalue ##NaN}]))) + +(deftest merge-asof + (let [trades (g/data-frame + {:time (c/->datetime ["2016-05-25 13:30:00.023" + "2016-05-25 13:30:00.038" + "2016-05-25 13:30:00.048" + "2016-05-25 13:30:00.048"]) + :ticker [:MSFT :MSFT :GOOG :AAPL] + :price [51.95 51.95 720.77 98.00] + :quantity [75 155 100 100]}) + quotes (g/data-frame + {:time (c/->datetime ["2016-05-25 13:30:00.023" + "2016-05-25 13:30:00.023" + "2016-05-25 13:30:00.030" + "2016-05-25 13:30:00.048" + "2016-05-25 13:30:00.049"]) + :ticker [:GOOG :MSFT :MSFT :GOOG :AAPL] + :bid [720.5 51.95 51.97 720.5 97.99] + :ask [720.93 51.96 51.98 720.93 98.01]})] + (are [d o] + (m/same? (r/merge-asof trades quotes d) (g/data-frame o)) + {:on :time + :suffixes [:-x :-y]} [{:time (c/->datetime "2016-05-25 13:30:00.023000"), + :ticker-x "MSFT", + :price 51.95, + :quantity 75, + :ticker-y "MSFT", + :bid 51.95, + :ask 51.96} + {:time (c/->datetime "2016-05-25 13:30:00.038000"), + :ticker-x "MSFT", + :price 51.95, + :quantity 155, + :ticker-y "MSFT", + :bid 51.97, + :ask 51.98} + {:time (c/->datetime "2016-05-25 13:30:00.048000"), + :ticker-x "GOOG", + :price 720.77, + :quantity 100, + :ticker-y "GOOG", + :bid 720.5, + :ask 720.93} + {:time (c/->datetime "2016-05-25 13:30:00.048000"), + :ticker-x "AAPL", + :price 98.0, + :quantity 100, + :ticker-y "GOOG", + :bid 720.5, + :ask 720.93}] + + {:on :time + :allow-exact-matches false + :suffixes [:-x :-y]} [{:time (c/->datetime "2016-05-25 13:30:00.023000"), + :ticker-x "MSFT", + :price 51.95, + :quantity 75, + :ticker-y ##NaN, + :bid ##NaN, + :ask ##NaN} + {:time (c/->datetime "2016-05-25 13:30:00.038000"), + :ticker-x "MSFT", + 
:price 51.95, + :quantity 155, + :ticker-y "MSFT", + :bid 51.97, + :ask 51.98} + {:time (c/->datetime "2016-05-25 13:30:00.048000"), + :ticker-x "GOOG", + :price 720.77, + :quantity 100, + :ticker-y "MSFT", + :bid 51.97, + :ask 51.98} + {:time (c/->datetime "2016-05-25 13:30:00.048000"), + :ticker-x "AAPL", + :price 98.0, + :quantity 100, + :ticker-y "MSFT", + :bid 51.97, + :ask 51.98}] + + {:on :time + :direction :forward + :suffixes [:-x :-y]} [{:time (c/->datetime "2016-05-25 13:30:00.023000"), + :ticker-x "MSFT", + :price 51.95, + :quantity 75, + :ticker-y "GOOG", + :bid 720.5, + :ask 720.93} + {:time (c/->datetime "2016-05-25 13:30:00.038000"), + :ticker-x "MSFT", + :price 51.95, + :quantity 155, + :ticker-y "GOOG", + :bid 720.5, + :ask 720.93} + {:time (c/->datetime "2016-05-25 13:30:00.048000"), + :ticker-x "GOOG", + :price 720.77, + :quantity 100, + :ticker-y "GOOG", + :bid 720.5, + :ask 720.93} + {:time (c/->datetime "2016-05-25 13:30:00.048000"), + :ticker-x "AAPL", + :price 98.0, + :quantity 100, + :ticker-y "GOOG", + :bid 720.5, + :ask 720.93}] + {:on :time + :by :ticker + :suffixes [:-x :-y]} [{:time (c/->datetime "2016-05-25 13:30:00.023000"), + :ticker "MSFT", + :price 51.95, + :quantity 75, + :bid 51.95, + :ask 51.96} + {:time (c/->datetime "2016-05-25 13:30:00.038000"), + :ticker "MSFT", + :price 51.95, + :quantity 155, + :bid 51.97, + :ask 51.98} + {:time (c/->datetime "2016-05-25 13:30:00.048000"), + :ticker "GOOG", + :price 720.77, + :quantity 100, + :bid 720.5, + :ask 720.93} + {:time (c/->datetime "2016-05-25 13:30:00.048000"), + :ticker "AAPL", + :price 98.0, + :quantity 100, + :bid ##NaN, + :ask ##NaN}]))) + +(deftest concatenate + (are [d o] + (m/same? 
+ (r/concatenate [(g/data-frame {:a [1 2 3] + :b [4 5 6]}) + (g/data-frame {:a [2 2 2] + :b [3 3 3]})] d) + (g/data-frame o)) + + {} [{:a 1, :b 4} {:a 2, :b 5} {:a 3, :b 6} + {:a 2, :b 3} {:a 2, :b 3} {:a 2, :b 3}] + + {:axis 1} [{:a 2, :b 3} {:a 2, :b 3} {:a 2, :b 3}] + + {:axis 1 + :ignore-index true} [{0 1, 1 4, 2 2, 3 3} + {0 2, 1 5, 2 2, 3 3} + {0 3, 1 6, 2 2, 3 3}])) From e16d254d7d8048af0381cbf2e3285a77db517fb2 Mon Sep 17 00:00:00 2001 From: Alan Marazzi Date: Fri, 13 Dec 2019 16:43:03 +0100 Subject: [PATCH 04/12] Add to-clj method --- src/panthera/pandas/generics.clj | 5 +-- src/panthera/pandas/utils.clj | 62 +++++++++++++++++++++----------- test/panthera/reshape_test.clj | 16 ++++++--- 3 files changed, 54 insertions(+), 29 deletions(-) diff --git a/src/panthera/pandas/generics.clj b/src/panthera/pandas/generics.clj index 9ff601d..2833284 100644 --- a/src/panthera/pandas/generics.clj +++ b/src/panthera/pandas/generics.clj @@ -45,10 +45,7 @@ (series [\"1.3\" \"3.0\"] {:dtype :float32}) ```" [data & [attrs]] - (u/kw-call u/pd "Series" data attrs) - ;[& [attrs]] - ;((py/$. u/pd Series) attrs) - ) + (u/kw-call u/pd "Series" data attrs)) (defn data-frame "Creates a panthera data-frame, the underlying backend is a diff --git a/src/panthera/pandas/utils.clj b/src/panthera/pandas/utils.clj index 81033d7..014746a 100644 --- a/src/panthera/pandas/utils.clj +++ b/src/panthera/pandas/utils.clj @@ -8,7 +8,9 @@ (py/initialize!) 
-(defonce builtins (py/import-module "builtins")) +(require-python '[builtins :as bt]) + +;(defonce builtins (py/import-module "builtins")) (defonce pd (py/import-module "pandas")) (defn slice @@ -33,14 +35,17 @@ (slice 1 10 2) ; every 2 values between 1 and 10 ```" - ([] - (py/call-attr builtins "slice" nil)) - ([start] - (py/call-attr builtins "slice" start)) - ([start stop] - (py/call-attr builtins "slice" start stop)) - ([start stop incr] - (py/call-attr builtins "slice" start stop incr))) + [& args] + (apply bt/slice args) + (comment + ([] + (py/call-attr builtins "slice" nil)) + ([start] + (py/call-attr builtins "slice" start)) + ([start stop] + (py/call-attr builtins "slice" start stop)) + ([start stop incr] + (py/call-attr builtins "slice" start stop incr)))) (defn pytype "Return the Python type of the given objects @@ -168,12 +173,27 @@ [obj] (identical? :data-frame (pytype obj))) +(defmulti to-clj + (fn [obj] (series? obj))) + +(defmethod to-clj false + [obj] + {:id (py/get-attr obj "index") + :cols (py/get-attr obj "columns") + :data (lazy-seq (py/get-attr obj "values"))}) + +(defmethod to-clj true + [obj] + {:id (py/get-attr obj "index") + :cols (or (py/get-attr obj "name") "unnamed") + :data (lazy-seq (py/get-attr obj "values"))}) + (defn ->clj "Convert the given panthera data-frame or series to a Clojure vector of maps. The idea is to have a common, simple and fast access point to conversion of the main data structures between languages. - - series: a series gets converted to a vector of maps with only one key and + - `series`: a `series` gets converted to a vector of maps with only one key and one value. If the series has a name that becomes the key of the maps, otherwise `->clj` falls back to the `:unnamed` key. - data-frame: a data-frame is converted to a vector of maps with names @@ -186,17 +206,19 @@ (->clj my-df) ```" - [df-or-srs] - (if (series? 
df-or-srs) - (let [nm (memo-columns-converter - (or (py/get-attr df-or-srs "name") + [df-or-srs & [clj?]] + (if-not clj? + (to-clj df-or-srs) + (if (series? df-or-srs) + (let [nm (memo-columns-converter + (or (py/get-attr df-or-srs "name") "unnamed"))] - (into [] (map #(assoc {} nm %)) - (vec df-or-srs))) - (let [ks (map memo-columns-converter - (py/get-attr df-or-srs "columns"))] - (into [] (map #(zipmap ks %)) - (py/get-attr df-or-srs "values"))))) + (into [] (map #(assoc {} nm %)) + (vec df-or-srs))) + (let [ks (map memo-columns-converter + (py/get-attr df-or-srs "columns"))] + (into [] (map #(zipmap ks %)) + (py/get-attr df-or-srs "values")))))) (defn simple-kw-call "Helper for a cleaner access to `call-attr-kw` from `libpython-clj`" diff --git a/test/panthera/reshape_test.clj b/test/panthera/reshape_test.clj index 3f187ed..e939196 100644 --- a/test/panthera/reshape_test.clj +++ b/test/panthera/reshape_test.clj @@ -256,20 +256,26 @@ :ask ##NaN}]))) (deftest concatenate - (are [d o] + (are [d o do] (m/same? 
(r/concatenate [(g/data-frame {:a [1 2 3] :b [4 5 6]}) (g/data-frame {:a [2 2 2] :b [3 3 3]})] d) - (g/data-frame o)) + (g/data-frame o do)) {} [{:a 1, :b 4} {:a 2, :b 5} {:a 3, :b 6} - {:a 2, :b 3} {:a 2, :b 3} {:a 2, :b 3}] + {:a 2, :b 3} {:a 2, :b 3} {:a 2, :b 3}] {:index [0 1 2 0 1 2]} - {:axis 1} [{:a 2, :b 3} {:a 2, :b 3} {:a 2, :b 3}] + {:axis 1} [[1 4 2 3] [2 5 2 3] [3 6 2 3]] {:columns [:a :b :a :b]} {:axis 1 :ignore-index true} [{0 1, 1 4, 2 2, 3 3} {0 2, 1 5, 2 2, 3 3} - {0 3, 1 6, 2 2, 3 3}])) + {0 3, 1 6, 2 2, 3 3}] {})) + + + + + + From c32d8d3ff7068a44958b899e62f33f707507fd01 Mon Sep 17 00:00:00 2001 From: alanmarazzi Date: Mon, 16 Dec 2019 23:31:03 +0100 Subject: [PATCH 05/12] Some preliminary minor corrections after ->clj change --- project.clj | 5 ++--- src/panthera/pandas/utils.clj | 19 ++++++++----------- test/panthera/generics_test.clj | 16 ++++++++-------- 3 files changed, 18 insertions(+), 22 deletions(-) diff --git a/project.clj b/project.clj index 62efb13..1fc5d9b 100644 --- a/project.clj +++ b/project.clj @@ -4,8 +4,7 @@ :scm {:name "git" :url "https://github.com/alanmarazzi/panthera"} :license {:name "EPL-2.0" :url "https://www.eclipse.org/legal/epl-2.0/"} - :dependencies [[cnuernber/libpython-clj "1.20"] + :dependencies [[cnuernber/libpython-clj "1.27"] [org.clojure/core.memoize "0.7.2"]] - :profiles {:dev {:dependencies [[org.clojure/clojure "1.10.1"]] - :resource-paths ["/home/alan/miniconda3/envs/panthera"]}}) + :profiles {:dev {:dependencies [[org.clojure/clojure "1.10.1"]]}}) diff --git a/src/panthera/pandas/utils.clj b/src/panthera/pandas/utils.clj index 014746a..bae98cf 100644 --- a/src/panthera/pandas/utils.clj +++ b/src/panthera/pandas/utils.clj @@ -35,17 +35,14 @@ (slice 1 10 2) ; every 2 values between 1 and 10 ```" - [& args] - (apply bt/slice args) - (comment - ([] - (py/call-attr builtins "slice" nil)) - ([start] - (py/call-attr builtins "slice" start)) - ([start stop] - (py/call-attr builtins "slice" start stop)) - ([start 
stop incr] - (py/call-attr builtins "slice" start stop incr)))) + ([] + (bt/slice nil)) + ([start] + (bt/slice start)) + ([start stop] + (bt/slice start stop)) + ([start stop incr] + (bt/slice start stop incr))) (defn pytype "Return the Python type of the given objects diff --git a/test/panthera/generics_test.clj b/test/panthera/generics_test.clj index e4efce0..aff9409 100644 --- a/test/panthera/generics_test.clj +++ b/test/panthera/generics_test.clj @@ -2,8 +2,8 @@ (:require [clojure.test :refer :all] [libpython-clj.python :as py] - [panthera.pandas.generics :as g] - [panthera.pandas.utils :as u] + [panthera.pandas.generics :as g :reload true] + [panthera.pandas.utils :as u :reload true] [panthera.pandas.math :as m])) (deftest series @@ -33,7 +33,7 @@ (to-array-2d [[1 2] [3 4]]) {:columns [:a :b]} (to-array-2d [[1 2] [3 4]]) {:dtype :int8}) (are [i m o] - (= (u/->clj (g/data-frame i m)) o) + (= (u/->clj (g/data-frame i m) true) o) [] {} [] [] {:columns [:a :b]} [] [{:a 1 :b 2} {:a 1 :b 2}] {} [{:a 1 :b 2} {:a 1 :b 2}] @@ -53,7 +53,7 @@ (deftest one-hot (are [i m o] - (= (u/->clj (g/one-hot (g/series i) m)) o) + (= (u/->clj (g/one-hot (g/series i) m) true) o) [] {} [] ["a" "b"] {} [{:a 1 :b 0} @@ -65,7 +65,7 @@ :pre-b 1}]) (are [i m o] (= (u/->clj (g/one-hot (g/data-frame i) - {:columns m})) o) + {:columns m}) true) o) [{:a 1 :b "c"} {:a 2 :b "d"}] [:b] @@ -131,10 +131,10 @@ (= (u/->clj (apply g/subset-rows (g/data-frame (->> (range 1 11) (partition 2) - to-array-2d)) s)) o) + to-array-2d)) s) true) o) [] (u/->clj (g/data-frame (->> (range 1 11) (partition 2) - to-array-2d))) + to-array-2d)) true) [1] [{0 1 1 2}] [1 3] [{0 3 1 4} {0 5 1 6}] [1 3 2] [{0 3 1 4}])) @@ -156,7 +156,7 @@ (g/data-frame (flatten (repeat 5 [{:a 1 :b 2} - {:a 2 :b 3}]))) n)) + {:a 2 :b 3}]))) n) true) o) nil (drop-last (flatten (repeat 3 [{:a 1 :b 2} From d3d4ff9d9d3696ff9806c8998151a05cc6cf9a12 Mon Sep 17 00:00:00 2001 From: Alan Marazzi Date: Tue, 17 Dec 2019 17:52:30 +0100 Subject: [PATCH 
06/12] Start print rework for dataset --- project.clj | 1 + src/panthera/pandas/utils.clj | 42 +++++++++++++++++++++++++--------- src/panthera/script.clj | 43 +++++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 11 deletions(-) create mode 100644 src/panthera/script.clj diff --git a/project.clj b/project.clj index 1fc5d9b..2a279bb 100644 --- a/project.clj +++ b/project.clj @@ -6,5 +6,6 @@ :url "https://www.eclipse.org/legal/epl-2.0/"} :dependencies [[cnuernber/libpython-clj "1.27"] [org.clojure/core.memoize "0.7.2"]] + :main panthera.script :profiles {:dev {:dependencies [[org.clojure/clojure "1.10.1"]]}}) diff --git a/src/panthera/pandas/utils.clj b/src/panthera/pandas/utils.clj index bae98cf..c6dc06c 100644 --- a/src/panthera/pandas/utils.clj +++ b/src/panthera/pandas/utils.clj @@ -175,15 +175,24 @@ (defmethod to-clj false [obj] - {:id (py/get-attr obj "index") - :cols (py/get-attr obj "columns") - :data (lazy-seq (py/get-attr obj "values"))}) + {:id (py/get-attr obj "index") + :columns (py/get-attr obj "columns") + :data (lazy-seq (py/get-attr obj "values"))} + (comment (->DATASET + (py/get-attr obj "index") + (py/get-attr obj "columns") + (lazy-seq (py/get-attr obj "values"))))) (defmethod to-clj true [obj] - {:id (py/get-attr obj "index") - :cols (or (py/get-attr obj "name") "unnamed") - :data (lazy-seq (py/get-attr obj "values"))}) + {:id (py/get-attr obj "index") + :columns (or (py/get-attr obj "name") "unnamed") + :data (lazy-seq (py/get-attr obj "values"))} + (comment + (->DATASET + (py/get-attr obj "index") + (or (py/get-attr obj "name") "unnamed") + (lazy-seq (py/get-attr obj "values"))))) (defn ->clj "Convert the given panthera data-frame or series to a Clojure vector of maps. @@ -193,18 +202,29 @@ - `series`: a `series` gets converted to a vector of maps with only one key and one value. If the series has a name that becomes the key of the maps, otherwise `->clj` falls back to the `:unnamed` key. 
- - data-frame: a data-frame is converted to a vector of maps with names + - `data-frame`: a `data-frame` is converted to a vector of maps with names of the columns as keys and values as the corresponding row/column value. - Examples: + With the default method you might incur a data loss: the index doesn't get + converted and in case you're using a hierarchical index you get only one level + out of it. To keep everything in one place you have to make `full?` true, in + this way you get back a map with keys `{:id :cols :data}`. + + **Arguments** + + - `df-or-srs` -> `data-frame` or `series` + - `full?` -> whether to use the full conversion + + **Examples** ``` (->clj my-srs) (->clj my-df) - ```" - [df-or-srs & [clj?]] - (if-not clj? + ``` + " + [df-or-srs & [full?]] + (if full? (to-clj df-or-srs) (if (series? df-or-srs) (let [nm (memo-columns-converter diff --git a/src/panthera/script.clj b/src/panthera/script.clj new file mode 100644 index 0000000..355f7a4 --- /dev/null +++ b/src/panthera/script.clj @@ -0,0 +1,43 @@ +(ns panthera.script + (:require + [panthera.pandas.generics :as g] + [libpython-clj.python :as py])) + +(defrecord Data [data]) + +(defmethod print-method Data [v ^java.io.Writer w] + (pr (vec (take 5 (:data v))))) + +(defrecord DATASET [id cols data]) + +(defmethod print-method DATASET [v ^java.io.Writer w] + (let [id (:id v) + cols (:cols v) + data (conj (vec (take 5 (:data v))) \u2026)] + (clojure.pprint/pprint {:id id :cols cols :data data}))) + +(defmethod print-dup DATASET [v ^java.io.Writer w] + (let [id (:id v) + cols (:cols v) + data (conj (vec (take 5 (:data v))) "...")] + (clojure.pprint/pprint {:id id :cols cols :data data}))) + +(defmulti to-clj + (fn [obj] (identical? 
:series (py/python-type obj)))) + +(defmethod to-clj false + [obj] + (->DATASET + (py/get-attr obj "index") + (py/get-attr obj "columns") + (lazy-seq (py/get-attr obj "values")))) + +(defmethod to-clj true + [obj] + (->DATASET + (py/get-attr obj "index") + (or (py/get-attr obj "name") "unnamed") + (lazy-seq (py/get-attr obj "values")))) + +(defn -main [& args] + (println (to-clj (g/series (vec (range 20)))))) From ba0e1b6cd533a1fb8052298dd8ad76a1d0f6e6b0 Mon Sep 17 00:00:00 2001 From: Alan Marazzi Date: Wed, 18 Dec 2019 18:23:24 +0100 Subject: [PATCH 07/12] Add & correct some tests --- project.clj | 2 +- src/panthera/pandas/generics.clj | 9 ++- src/panthera/pandas/math.clj | 6 +- src/panthera/pandas/reshape.clj | 94 ++++++++++++++++++++++++++++---- src/panthera/pandas/utils.clj | 53 +++++++++++++----- src/panthera/script.clj | 19 ++++--- test/panthera/generics_test.clj | 22 ++++---- test/panthera/math_test.clj | 4 +- test/panthera/reshape_test.clj | 24 +++++++- 9 files changed, 179 insertions(+), 54 deletions(-) diff --git a/project.clj b/project.clj index 2a279bb..c6800bc 100644 --- a/project.clj +++ b/project.clj @@ -4,7 +4,7 @@ :scm {:name "git" :url "https://github.com/alanmarazzi/panthera"} :license {:name "EPL-2.0" :url "https://www.eclipse.org/legal/epl-2.0/"} - :dependencies [[cnuernber/libpython-clj "1.27"] + :dependencies [[cnuernber/libpython-clj "1.28"] [org.clojure/core.memoize "0.7.2"]] :main panthera.script :profiles {:dev {:dependencies [[org.clojure/clojure "1.10.1"]]}}) diff --git a/src/panthera/pandas/generics.clj b/src/panthera/pandas/generics.clj index 2833284..e35188e 100644 --- a/src/panthera/pandas/generics.clj +++ b/src/panthera/pandas/generics.clj @@ -814,8 +814,8 @@ "value_counts" (dissoc attrs :clj))] (if (:clj attrs) - (zipmap - (map u/memo-columns-converter (vec (index v))) + (zipmap + (map u/memo-columns-converter (vec (index v))) (vec v)) v)) (recur (series seq-or-srs) [attrs]))) @@ -1195,3 +1195,8 @@ ```" [df-or-srs i j] 
(u/simple-kw-call df-or-srs "swaplevel" [] {"i" i "j" j})) + +(defn factorize + "TODO" + [seq-or-srs & [attrs]] + (u/kw-call u/pd "factorize" seq-or-srs attrs)) diff --git a/src/panthera/pandas/math.clj b/src/panthera/pandas/math.clj index 1d47f73..942a5bb 100644 --- a/src/panthera/pandas/math.clj +++ b/src/panthera/pandas/math.clj @@ -139,8 +139,7 @@ :min "cummin" :prod "cumprod" :sum "cumsum" - :diff "diff" - :cmpnd "compound"} k) + :diff "diff"} k) attrs))) (def cummax @@ -158,9 +157,6 @@ (def diff (base-cumulative :diff)) -(def compound - (base-cumulative :cmpnd)) - (defn describe [df-or-srs & [attrs]] (u/simple-kw-call df-or-srs "describe" attrs)) diff --git a/src/panthera/pandas/reshape.clj b/src/panthera/pandas/reshape.clj index fbdd4da..0430da8 100644 --- a/src/panthera/pandas/reshape.clj +++ b/src/panthera/pandas/reshape.clj @@ -2,9 +2,7 @@ (:require [libpython-clj.python :as py] [panthera.pandas.utils :as u] - [panthera.pandas.generics :as g] - - [libpython-clj.python.protocols :as p])) + [panthera.pandas.generics :as g])) (defn crosstab "Compute a cross tabulation of two (or more) factors. By default @@ -461,19 +459,95 @@ :as attrs}]] (u/kw-call u/pd "concat" dfs-or-srss attrs)) -(defn factorize - [seq-or-srs & [attrs]] - (u/kw-call u/pd "factorize" seq-or-srs attrs)) - (defn aggregate - [df-or-srs how & [attrs]] + "Aggregate data using one or more functions over a given axis. + + This is very similar to `reduce`, but works on `data-frames` as well. + + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + - `how` -> keyword, str, function, Iterable: how to aggregate data. This accepts + either panthera functions strings/keywords, a list of the previous and/or user + defined functions. Check examples for more info. 
+ + **Attrs** + + - `:axis` -> {0 `:index` 1 `:columns`}, default 0: 0 = apply function along + cols; 1 = apply function along rows + + **Examples** + + ``` + (def a (data-frame + [[1, 2, 3] + [4, 5, 6] + [7, 8, 9] + [##NaN, ##NaN, ##NaN]] + {:columns [:A :B :C]})) + + (aggregate (series [1 2 3]) :sum) + ;; 6 + + (aggregate a [:sum :min]) + ;; A B C + ;; sum 12.0 15.0 18.0 + ;; min 1.0 2.0 3.0 + + ; if `how` needs arguments, you can pass them as `attrs` + (aggregate (series [1 2 3]) :cov {:other (series [4 5 6])}) + ;; 1.0 + + (aggregate (series [1 2 3]) inc) + ;; 0 2 + ;; 1 3 + ;; 2 4 + ;; dtype: int64 + ``` + " + [df-or-srs how & [{:keys [axis fn-args]} :as attrs]] (u/kw-call df-or-srs "agg" how attrs)) (defn remap - [df-or-srs mappings & [na-action]] - (py/call-attr df-or-srs "map" mappings (or na-action nil))) + "Remap values in a series. + + This is the same as using `map` on a sequence while using a map as the mapped + function: `(map {:a 1 :b 2} [:a :b]) => (1 2)` + + **Arguments** + + - `srs` -> `series` + - `mappings` -> map, function: the mapping correspondence + - `na-action` -> {`nil` `:ignore`}, default `nil`: `:ignore` doesn't pass missing + values to the `mappings` + + **Examples** + + ``` + (remap (series [:a :b :c]) {:a 1 :b 2 :c 3}) + ;; 0 1 + ;; 1 2 + ;; 2 3 + ;; dtype: int64 + + (remap (series [:a :b ##NaN]) #(str \"This is \" %)) + ;; 0 This is a + ;; 1 This is b + ;; 2 This is NaN + ;; dtype: object + + (remap (series [:a :b ##NaN]) #(str \"This is \" %) :ignore) + ;; 0 This is a + ;; 1 This is b + ;; 2 NaN + ;; dtype: object + ``` + " + [srs mappings & [na-action]] + (py/call-attr srs "map" mappings (or na-action nil))) (defn groupby + "" [df-or-srs by & [attrs]] (u/kw-call df-or-srs "groupby" by attrs)) diff --git a/src/panthera/pandas/utils.clj b/src/panthera/pandas/utils.clj index c6dc06c..3450b23 100644 --- a/src/panthera/pandas/utils.clj +++ b/src/panthera/pandas/utils.clj @@ -170,29 +170,56 @@ [obj] (identical? 
:data-frame (pytype obj))) +(defrecord DATASET [id cols data shape]) + +(defn pr-lazy-dataset + [data] + (let [cnt (first (:shape data))] + (if (> cnt 4) + (conj (vec (take 5 (:data data))) '...) + (vec (:data data))))) + +(defmethod print-method DATASET [v ^java.io.Writer w] + (let [id (:id v) + cols (:cols v) + shape (:shape v) + data (pr-lazy-dataset v)] + (clojure.pprint/pprint {:id id :cols cols :data data}))) + +(defmethod print-dup DATASET [v ^java.io.Writer w] + (let [id (:id v) + cols (:cols v) + shape (:shape v) + data (pr-lazy-dataset v)] + (clojure.pprint/pprint {:shape (vec shape) :id id :cols cols :data data}))) + +(defmethod clojure.pprint/simple-dispatch DATASET [v] + (let [id (:id v) + cols (:cols v) + shape (:shape v) + data (pr-lazy-dataset v)] + (clojure.pprint/pprint {:shape (vec shape) :id id :cols cols :data data}))) + (defmulti to-clj - (fn [obj] (series? obj))) + (fn [obj] (identical? :series (py/python-type obj)))) (defmethod to-clj false [obj] - {:id (py/get-attr obj "index") - :columns (py/get-attr obj "columns") - :data (lazy-seq (py/get-attr obj "values"))} - (comment (->DATASET - (py/get-attr obj "index") - (py/get-attr obj "columns") - (lazy-seq (py/get-attr obj "values"))))) + (let [cnt (py/get-attr obj "shape")] + (->DATASET + (py/get-attr obj "index") + (py/get-attr obj "columns") + (lazy-seq (py/get-attr obj "values")) + cnt))) (defmethod to-clj true [obj] - {:id (py/get-attr obj "index") - :columns (or (py/get-attr obj "name") "unnamed") - :data (lazy-seq (py/get-attr obj "values"))} - (comment + (let [cnt (py/get-attr obj "shape")] (->DATASET (py/get-attr obj "index") (or (py/get-attr obj "name") "unnamed") - (lazy-seq (py/get-attr obj "values"))))) + (lazy-seq (py/get-attr obj "values")) + cnt))) (defn ->clj "Convert the given panthera data-frame or series to a Clojure vector of maps. 
diff --git a/src/panthera/script.clj b/src/panthera/script.clj index 355f7a4..763062d 100644 --- a/src/panthera/script.clj +++ b/src/panthera/script.clj @@ -3,23 +3,28 @@ [panthera.pandas.generics :as g] [libpython-clj.python :as py])) -(defrecord Data [data]) - -(defmethod print-method Data [v ^java.io.Writer w] - (pr (vec (take 5 (:data v))))) - (defrecord DATASET [id cols data]) +(defn pr-lazy-dataset + [data] + (conj (vec (take 5 (:data data))) '...)) + (defmethod print-method DATASET [v ^java.io.Writer w] (let [id (:id v) cols (:cols v) - data (conj (vec (take 5 (:data v))) \u2026)] + data (pr-lazy-dataset v)] (clojure.pprint/pprint {:id id :cols cols :data data}))) (defmethod print-dup DATASET [v ^java.io.Writer w] (let [id (:id v) cols (:cols v) - data (conj (vec (take 5 (:data v))) "...")] + data (pr-lazy-dataset v)] + (clojure.pprint/pprint {:id id :cols cols :data data}))) + +(defmethod clojure.pprint/simple-dispatch DATASET [v] + (let [id (:id v) + cols (:cols v) + data (pr-lazy-dataset v)] (clojure.pprint/pprint {:id id :cols cols :data data}))) (defmulti to-clj diff --git a/test/panthera/generics_test.clj b/test/panthera/generics_test.clj index aff9409..1487138 100644 --- a/test/panthera/generics_test.clj +++ b/test/panthera/generics_test.clj @@ -33,7 +33,7 @@ (to-array-2d [[1 2] [3 4]]) {:columns [:a :b]} (to-array-2d [[1 2] [3 4]]) {:dtype :int8}) (are [i m o] - (= (u/->clj (g/data-frame i m) true) o) + (= (u/->clj (g/data-frame i m)) o) [] {} [] [] {:columns [:a :b]} [] [{:a 1 :b 2} {:a 1 :b 2}] {} [{:a 1 :b 2} {:a 1 :b 2}] @@ -53,7 +53,7 @@ (deftest one-hot (are [i m o] - (= (u/->clj (g/one-hot (g/series i) m) true) o) + (= (u/->clj (g/one-hot (g/series i) m)) o) [] {} [] ["a" "b"] {} [{:a 1 :b 0} @@ -65,7 +65,7 @@ :pre-b 1}]) (are [i m o] (= (u/->clj (g/one-hot (g/data-frame i) - {:columns m}) true) o) + {:columns m})) o) [{:a 1 :b "c"} {:a 2 :b "d"}] [:b] @@ -131,10 +131,10 @@ (= (u/->clj (apply g/subset-rows (g/data-frame (->> (range 1 11) 
(partition 2) - to-array-2d)) s) true) o) + to-array-2d)) s)) o) [] (u/->clj (g/data-frame (->> (range 1 11) (partition 2) - to-array-2d)) true) + to-array-2d))) [1] [{0 1 1 2}] [1 3] [{0 3 1 4} {0 5 1 6}] [1 3 2] [{0 3 1 4}])) @@ -156,7 +156,7 @@ (g/data-frame (flatten (repeat 5 [{:a 1 :b 2} - {:a 2 :b 3}]))) n) true) + {:a 2 :b 3}]))) n)) o) nil (drop-last (flatten (repeat 3 [{:a 1 :b 2} @@ -176,7 +176,7 @@ o) [{:a 1}] [:a] [{:a 1}] [{:a 1 :b 2 :c 3}] [:a :c] [{:a 1 :c 3}] - (repeat 5 {:a 1 :b 2}) [:b] (repeat 5 {:b 2}) + (vec (repeat 5 {:a 1 :b 2})) [:b] (vec (repeat 5 {:b 2})) [{:wEiR__.D 1 :b 2}] [:wEiR__.D] [{:w-ei-r-.-d 1}])) (deftest n-largest @@ -244,9 +244,7 @@ [1 1 2] {} {1 2 2 1} [:a :a :b :c] {} {:a 2 :b 1 :c 1} (repeat 50 :a) {} {:a 50} - [:a :a :b :c] {:normalize true} {:a 0.5 :b 0.25 :c 0.25} - ;(range 20) {:bins 4} {:a 0.5 :b 0.25 :c 0.25} Intervals are not handled - )) + [:a :a :b :c] {:normalize true} {:a 0.5 :b 0.25 :c 0.25})) (deftest reset-index (are [i m o] @@ -309,8 +307,8 @@ {:unnamed 18} {:unnamed 19}] (g/series (range 20)) 2 [{:unnamed 18} {:unnamed 19}] - (g/data-frame (repeat 10 {:a 1 :b 2})) nil (repeat 5 {:a 1 :b 2}) - (g/data-frame (repeat 10 {:a 1 :b 2})) 2 (repeat 2 {:a 1 :b 2}))) + (g/data-frame (vec (repeat 10 {:a 1 :b 2}))) nil (repeat 5 {:a 1 :b 2}) + (g/data-frame (vec (repeat 10 {:a 1 :b 2}))) 2 (repeat 2 {:a 1 :b 2}))) (deftest fill-na (are [v m o] diff --git a/test/panthera/math_test.clj b/test/panthera/math_test.clj index 1231723..0fa407e 100644 --- a/test/panthera/math_test.clj +++ b/test/panthera/math_test.clj @@ -114,6 +114,4 @@ :prod (repeat 10 0) :sum [0 1 3 6 10 15 21 28 36 45]) (is (= (drop 1 (vec ((#'m/base-cumulative :diff) (g/series (range 10))))) - (repeat 9 1.0))) - (is (= ((#'m/base-cumulative :cmpnd) (g/series (range 10))) - 3628799))) + (repeat 9 1.0)))) diff --git a/test/panthera/reshape_test.clj b/test/panthera/reshape_test.clj index e939196..67d587b 100644 --- a/test/panthera/reshape_test.clj +++ 
b/test/panthera/reshape_test.clj @@ -274,8 +274,30 @@ {0 2, 1 5, 2 2, 3 3} {0 3, 1 6, 2 2, 3 3}] {})) +(deftest aggregate + (are [v d o] + (= (m/same? + (r/aggregate (g/data-frame [[1, 2, 3], + [4, 5, 6], + [7, 8, 9], + [##NaN, ##NaN, ##NaN]] + {:columns [:A :B :C]}) v d) + o)) + :sum {} (g/series [12 15 18] {:index [:A :B :C]}) + [:sum :min] {} (g/data-frame + {:A [12 1] :B [15 2] :C [18 3]} + {:index [:sum :min]}) + :sum {:axis 1} (g/series [6 15 24 0]))) - +(deftest remap + (are [in mpgs ign o] + (= (m/same? + (r/remap + (g/series in) + mpgs ign) + o)) + [:a :b :c] {:a 1 :b 2 :c 3} nil (g/series [1 2 3]) + )) From d4442d14056d7043d9eabd69d396b00e2fa27c98 Mon Sep 17 00:00:00 2001 From: alanmarazzi Date: Wed, 18 Dec 2019 22:00:22 +0100 Subject: [PATCH 08/12] Remove :main from project --- project.clj | 1 - 1 file changed, 1 deletion(-) diff --git a/project.clj b/project.clj index c6800bc..b104d87 100644 --- a/project.clj +++ b/project.clj @@ -6,6 +6,5 @@ :url "https://www.eclipse.org/legal/epl-2.0/"} :dependencies [[cnuernber/libpython-clj "1.28"] [org.clojure/core.memoize "0.7.2"]] - :main panthera.script :profiles {:dev {:dependencies [[org.clojure/clojure "1.10.1"]]}}) From f1fbf566db6cb10d2093f4237898d7e064e0ba6c Mon Sep 17 00:00:00 2001 From: Alan Marazzi Date: Thu, 19 Dec 2019 15:41:25 +0100 Subject: [PATCH 09/12] Add other reshape tests & docs --- src/panthera/pandas/reshape.clj | 170 +++++++++++++++++++++++++++++++- src/panthera/script.clj | 48 --------- test/panthera/reshape_test.clj | 101 ++++++++++++++++--- 3 files changed, 253 insertions(+), 66 deletions(-) delete mode 100644 src/panthera/script.clj diff --git a/src/panthera/pandas/reshape.clj b/src/panthera/pandas/reshape.clj index 0430da8..1e62f1b 100644 --- a/src/panthera/pandas/reshape.clj +++ b/src/panthera/pandas/reshape.clj @@ -547,16 +547,178 @@ (py/call-attr srs "map" mappings (or na-action nil))) (defn groupby - "" - [df-or-srs by & [attrs]] + "Group `data-frame` or `series` by a given 
variable. + + Note that `groupby` does nothing by itself, this must be followed by another + operation like aggregation. + + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + - `by` -> str, keyword, Iterable, map, function: it can be a column, a list of + columns, a function used to group the index, a collection of values to use as + grouping variable + + **Attrs** + + - `:axis` -> {0 `:index` 1 `:columns`}: split along columns or rows + - `:level` -> int, str, keyword, Iterable: if multiple index, group by this + or these + - `:as-index` -> bool, default `true`: when `false` this becomes basically + as the SQL group by output + - `:sort` -> bool, default `true`: if `false` you get a performance improvement + - `:group-keys` -> bool, default `true`: add group keys to index when afterwards + you call `apply` + - `:squeeze` -> bool, default `false`: reduce dimensionality of the output if possible + - `:observed` -> bool, default `false`: this only applies to Categoricals: + if `true`, only show observed values for categorical groupers, + if `false`, show all values for categorical groupers + + **Examples** + + ``` + (def a (data-frame {:animal [:falcon :falcon :parrot :parrot] + :max-speed [380 370 24 26]})) + + (-> a (r/groupby :animal) m/mean) + max-speed + ;; animal + ;; falcon 375 + ;; parrot 25 + + (-> a (r/groupby :animal {:as-index false}) m/mean) + ;; animal max-speed + ;; 0 falcon 375 + ;; 1 parrot 25 + ``` + " + [df-or-srs by & [{:keys [axis level as-index sort group-keys + squeeze observed] :as attrs}]] (u/kw-call df-or-srs "groupby" by attrs)) (defn rolling - [df-or-srs window & [attrs]] + "Rolling window calculations + + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + - `window` -> int, str. keyword: the size of the window. If str or keyword then + this is considered as a time offset (e.g. 
:2s = 2 seconds, :30D = 30 days; + check this for more options https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases) + + **Attrs** + + - `:min-periods` -> int: minimum number of observations to have a value. For + times the default is 1, otherwise the default is `window` + - `:center` -> bool, default `false`: if `false` the result is set at the right + edge of the window, otherwise it gets centered + - `:win-type` -> str, keyword: refer to https://docs.scipy.org/doc/scipy/reference/signal.windows.html#module-scipy.signal.windows + - `:on`-> str, keyword: column to use for the rolling window, only in case this + is not the index + - `:axis` -> {0 `:index` 1 `:columns`}: split along columns or rows + - `:closed` -> {`:right` `:left` `:both` `:neither`}: where to make the interval + close + + **Examples** + ``` + (def a (data-frame {:b [0 1 2 3 4]} + {:index + (panthera.pandas.conversion/->datetime + (series + [\"20130101 09:00:00\" + \"20130101 09:00:02\" + \"20130101 09:00:03\" + \"20130101 09:00:05\" + \"20130101 09:00:06\"]))})) + + (sum (rolling a 2)) + ;; b + ;; 2013-01-01 09:00:00 NaN + ;; 2013-01-01 09:00:02 1.0 + ;; 2013-01-01 09:00:03 3.0 + ;; 2013-01-01 09:00:05 5.0 + ;; 2013-01-01 09:00:06 7.0 + + (sum (rolling a :2s)) + ;; b + ;; 2013-01-01 09:00:00 0.0 + ;; 2013-01-01 09:00:02 1.0 + ;; 2013-01-01 09:00:03 3.0 + ;; 2013-01-01 09:00:05 3.0 + ;; 2013-01-01 09:00:06 7.0 + + (sum (rolling a 2 {:win-type :triang})) + ;; b + ;; 2013-01-01 09:00:00 NaN + ;; 2013-01-01 09:00:02 0.5 + ;; 2013-01-01 09:00:03 1.5 + ;; 2013-01-01 09:00:05 2.5 + ;; 2013-01-01 09:00:06 3.5 + + (sum (rolling a 2 {:min-periods 1})) + ;; b + ;; 2013-01-01 09:00:00 0.0 + ;; 2013-01-01 09:00:02 1.0 + ;; 2013-01-01 09:00:03 3.0 + ;; 2013-01-01 09:00:05 5.0 + ;; 2013-01-01 09:00:06 7.0 + ``` + " + [df-or-srs window & [{:keys [min-periods center win-type on axis closed] + :as attrs}]] (u/kw-call df-or-srs "rolling" window attrs)) (defn ewm - [df-or-srs & 
[attrs]] + "Exponentially weighted functions. + + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + + **Attrs** + + - `:com` -> numeric: decay in terms of center of mass + - `:span` -> numeric: decay in terms of span + - `:halflife` -> numeric: decay in terms of half-life + - `:alpha` -> numeric: smoothing factor + - `:min-periods` -> int, default 0: minimum number of observations + - `:adjust` -> bool, default `true`: divide by decaying adjustment factor + in beginning periods to account for imbalance in relative weightings + - `:ignore-na` -> bool, default `false`: ignore missing values + - `:axis` -> {0 `:index` 1 `:columns`}: use columns or rows + + **Examples** + + ``` + (def a (g/data-frame {:b [0 1 2 ##NaN 4]})) + + (-> a (ewm {:com 0.5}) mean) + ;; b + ;; 0 0.000000 + ;; 1 0.750000 + ;; 2 1.615385 + ;; 3 1.615385 + ;; 4 3.670213 + + (-> a (ewm {:span 3}) mean) + ;; b + ;; 0 0.000000 + ;; 1 0.666667 + ;; 2 1.428571 + ;; 3 1.428571 + ;; 4 3.217391 + + (-> a (ewm {:com 0.5 :ignore-na true}) mean) + ;; b + ;; 0 0.000000 + ;; 1 0.750000 + ;; 2 1.615385 + ;; 3 1.615385 + ;; 4 3.225000 + ``` + " + [df-or-srs & [{:keys [com span halflife min-periods adjust ignore-na axis] + :as attrs}]] (u/simple-kw-call df-or-srs "ewm" attrs)) ; remove :inplace as an attr diff --git a/src/panthera/script.clj b/src/panthera/script.clj deleted file mode 100644 index 763062d..0000000 --- a/src/panthera/script.clj +++ /dev/null @@ -1,48 +0,0 @@ -(ns panthera.script - (:require - [panthera.pandas.generics :as g] - [libpython-clj.python :as py])) - -(defrecord DATASET [id cols data]) - -(defn pr-lazy-dataset - [data] - (conj (vec (take 5 (:data data))) '...)) - -(defmethod print-method DATASET [v ^java.io.Writer w] - (let [id (:id v) - cols (:cols v) - data (pr-lazy-dataset v)] - (clojure.pprint/pprint {:id id :cols cols :data data}))) - -(defmethod print-dup DATASET [v ^java.io.Writer w] - (let [id (:id v) - cols (:cols v) - data (pr-lazy-dataset v)] - (clojure.pprint/pprint 
{:id id :cols cols :data data}))) - -(defmethod clojure.pprint/simple-dispatch DATASET [v] - (let [id (:id v) - cols (:cols v) - data (pr-lazy-dataset v)] - (clojure.pprint/pprint {:id id :cols cols :data data}))) - -(defmulti to-clj - (fn [obj] (identical? :series (py/python-type obj)))) - -(defmethod to-clj false - [obj] - (->DATASET - (py/get-attr obj "index") - (py/get-attr obj "columns") - (lazy-seq (py/get-attr obj "values")))) - -(defmethod to-clj true - [obj] - (->DATASET - (py/get-attr obj "index") - (or (py/get-attr obj "name") "unnamed") - (lazy-seq (py/get-attr obj "values")))) - -(defn -main [& args] - (println (to-clj (g/series (vec (range 20)))))) diff --git a/test/panthera/reshape_test.clj b/test/panthera/reshape_test.clj index 67d587b..d1c3255 100644 --- a/test/panthera/reshape_test.clj +++ b/test/panthera/reshape_test.clj @@ -276,13 +276,13 @@ (deftest aggregate (are [v d o] - (= (m/same? - (r/aggregate (g/data-frame [[1, 2, 3], - [4, 5, 6], - [7, 8, 9], - [##NaN, ##NaN, ##NaN]] - {:columns [:A :B :C]}) v d) - o)) + (m/same? + (r/aggregate (g/data-frame [[1, 2, 3], + [4, 5, 6], + [7, 8, 9], + [##NaN, ##NaN, ##NaN]] + {:columns [:A :B :C]}) v d) + o) :sum {} (g/series [12 15 18] {:index [:A :B :C]}) @@ -294,10 +294,83 @@ (deftest remap (are [in mpgs ign o] - (= (m/same? - (r/remap - (g/series in) - mpgs ign) - o)) - [:a :b :c] {:a 1 :b 2 :c 3} nil (g/series [1 2 3]) - )) + (m/same? + (r/remap + (g/series in) + mpgs ign) + o) + [:a :b :c] {:a 1 :b 2 :c 3} nil (g/series [1 2 3]) + [:a :b ##NaN] #(str "Test " %) :ignore (g/series ["Test a" "Test b" ##NaN]))) + +(deftest groupby + (are [d f o] + (m/same? 
+ (-> (g/data-frame {:animal [:falcon :falcon :parrot :parrot] + :max-speed [380 370 24 26]}) + (r/groupby :animal d) + f) + o) + + {} m/mean (g/data-frame {:max-speed [375 25]} + {:index (g/series [:falcon :parrot] {:name :animal})}) + + {:as-index false} m/mean (g/data-frame [{:animal "falcon" :max-speed 375} + {:animal "parrot" :max-speed 25}]) + + {} m/std (g/data-frame [{:max-speed 7.0710678118654755} + {:max-speed 1.4142135623730951}] + {:index (g/series [:falcon :parrot] {:name :animal})}))) + +(deftest rolling + (are [w d o] + (m/same? + (-> (g/data-frame {:b [0 1 2 3 4]} + {:index + (panthera.pandas.conversion/->datetime + (g/series + ["20130101 09:00:00" + "20130101 09:00:02" + "20130101 09:00:03" + "20130101 09:00:05" + "20130101 09:00:06"]))}) + (r/rolling w d) + m/sum) + (g/data-frame o + {:index + (panthera.pandas.conversion/->datetime + (g/series + ["20130101 09:00:00" + "20130101 09:00:02" + "20130101 09:00:03" + "20130101 09:00:05" + "20130101 09:00:06"]))})) + 2 {} [{:b ##NaN} {:b 1.0} {:b 3.0} {:b 5.0} {:b 7.0}] + :2s {} [{:b 0.0} {:b 1.0} {:b 3.0} {:b 3.0} {:b 7.0}] + 2 {:win-type :triang} [{:b ##NaN} {:b 0.5} {:b 1.5} {:b 2.5} {:b 3.5}] + 2 {:min-periods 1} [{:b 0.0} {:b 1.0} {:b 3.0} {:b 5.0} {:b 7.0}])) + +(deftest ewm + (are [d o] + (m/same? 
+ (-> (g/data-frame {:b [0 1 2 ##NaN 4]}) + (r/ewm d) + m/mean) + (g/data-frame o)) + + {:com 0.5} [{:b 0.0} + {:b 0.7499999999999999} + {:b 1.6153846153846152} + {:b 1.6153846153846152} + {:b 3.670212765957447}] + + {:span 2} [{:b 0.0} + {:b 0.7499999999999999} + {:b 1.6153846153846152} + {:b 1.6153846153846152} + {:b 3.670212765957447}] + + {:com 0.5 :ignore-na true} [{:b 0.0} + {:b 0.7499999999999999} + {:b 1.6153846153846152} + {:b 1.6153846153846152} + {:b 3.2249999999999996}])) From a0d15207fca5374872812f0e750a9f9a31f6bb6b Mon Sep 17 00:00:00 2001 From: Alan Marazzi Date: Tue, 14 Jan 2020 17:43:51 +0100 Subject: [PATCH 10/12] Add dropna test --- src/panthera/pandas/reshape.clj | 7 ++++--- test/panthera/reshape_test.clj | 26 ++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/src/panthera/pandas/reshape.clj b/src/panthera/pandas/reshape.clj index 1e62f1b..f30264f 100644 --- a/src/panthera/pandas/reshape.clj +++ b/src/panthera/pandas/reshape.clj @@ -718,12 +718,13 @@ ``` " [df-or-srs & [{:keys [com span halflife min-periods adjust ignore-na axis] - :as attrs}]] + :as attrs}]] (u/simple-kw-call df-or-srs "ewm" attrs)) -; remove :inplace as an attr (defn dropna - [df-or-srs & [attrs]] + "Drop missing values" + [df-or-srs & [{:keys [axis how thresh subset] + :as attrs}]] (u/simple-kw-call df-or-srs "dropna" attrs)) (defn melt diff --git a/test/panthera/reshape_test.clj b/test/panthera/reshape_test.clj index d1c3255..f51a760 100644 --- a/test/panthera/reshape_test.clj +++ b/test/panthera/reshape_test.clj @@ -374,3 +374,29 @@ {:b 1.6153846153846152} {:b 1.6153846153846152} {:b 3.2249999999999996}])) + +(deftest dropna + (are [s o d] + (m/same? + (-> (g/series s) + r/dropna) + (g/series o d)) + [] [] {} + [1 nil 2] [1.0 2.0] {:index [0 2]}) + + (are [att out opt] + (m/same? 
+ (-> (g/data-frame {:name ["Alfred" "Batman" "Robin"] + :toy [nil "Batmobile" "Whip"] + :born [nil "1940-04-25" nil]}) + (r/dropna att)) + (g/data-frame out opt)) + {} [{:name "Batman", :toy "Batmobile", :born "1940-04-25"}] {:index [1]} + {:axis 1} [{:name "Alfred"} {:name "Batman"} {:name "Robin"}] {} + {:how :all} [{:name "Alfred", :toy nil, :born nil} + {:name "Batman", :toy "Batmobile", :born "1940-04-25"} + {:name "Robin", :toy "Whip", :born nil}] {} + {:thresh 2} [{:name "Batman", :toy "Batmobile", :born "1940-04-25"} + {:name "Robin", :toy "Whip", :born nil}] {:index [1 2]} + {:subset [:toy]} [{:name "Batman", :toy "Batmobile", :born "1940-04-25"} + {:name "Robin", :toy "Whip", :born nil}] {:index [1 2]})) From accf473e3f366f890b85e05a0c3faa9f313b95a0 Mon Sep 17 00:00:00 2001 From: Alan Marazzi Date: Mon, 27 Jan 2020 15:53:21 +0100 Subject: [PATCH 11/12] Finalize tests & refactor utils conversions --- .gitignore | 3 +- src/panthera/pandas/generics.clj | 10 +- src/panthera/pandas/reshape.clj | 280 ++++++++++++++++++++++++++++++- src/panthera/pandas/utils.clj | 68 +++++--- src/panthera/panthera.clj | 19 ++- test/panthera/config.clj | 8 + test/panthera/generics_test.clj | 12 +- test/panthera/reshape_test.clj | 102 ++++++++++- test/panthera/utils_test.clj | 12 +- 9 files changed, 460 insertions(+), 54 deletions(-) create mode 100644 test/panthera/config.clj diff --git a/.gitignore b/.gitignore index 4a1c483..1fedb5d 100644 --- a/.gitignore +++ b/.gitignore @@ -19,4 +19,5 @@ examples/.ipynb_checkpoints *.log .classpath .project -.settings \ No newline at end of file +.settings +resources/docs.md diff --git a/src/panthera/pandas/generics.clj b/src/panthera/pandas/generics.clj index e35188e..f9237ae 100644 --- a/src/panthera/pandas/generics.clj +++ b/src/panthera/pandas/generics.clj @@ -862,7 +862,7 @@ (to-csv \"mycsv.csv\" {:sep \";\" :index false}) ```" [df-or-srs filename & [attrs]] - (u/simple-kw-call df-or-srs "to_csv" attrs)) + (u/kw-call df-or-srs "to_csv" 
filename attrs)) (defn reset-index "Reset the index or part of it. This replaces the current index @@ -1200,3 +1200,11 @@ "TODO" [seq-or-srs & [attrs]] (u/kw-call u/pd "factorize" seq-or-srs attrs)) + +(defn rename + [df-or-srs & [attrs]] + (u/simple-kw-call df-or-srs "rename" attrs)) + +(defn to-excel + [df-or-srs filename & [attrs]] + (u/kw-call df-or-srs "to_excel" filename attrs)) diff --git a/src/panthera/pandas/reshape.clj b/src/panthera/pandas/reshape.clj index f30264f..1af3307 100644 --- a/src/panthera/pandas/reshape.clj +++ b/src/panthera/pandas/reshape.clj @@ -1,4 +1,6 @@ (ns panthera.pandas.reshape + (:refer-clojure + :exclude [drop]) (:require [libpython-clj.python :as py] [panthera.pandas.utils :as u] @@ -475,6 +477,8 @@ - `:axis` -> {0 `:index` 1 `:columns`}, default 0: 0 = apply function along cols; 1 = apply function along rows + - `fn-args` -> if the provided collapsing function needs arguments, just list + them freely (see examples) **Examples** @@ -505,7 +509,7 @@ ;; dtype: int64 ``` " - [df-or-srs how & [{:keys [axis fn-args]} :as attrs]] + [df-or-srs how & [{:keys [axis fn-args] :as attrs}]] (u/kw-call df-or-srs "agg" how attrs)) (defn remap @@ -721,26 +725,286 @@ :as attrs}]] (u/simple-kw-call df-or-srs "ewm" attrs)) +(defn drop + "Drop requested rows or columns. + + Remove rows or columns by specifying label names and corresponding axis, + or by specifying directly index or column names. When using a multi-index, + labels on different levels can be removed by specifying the level. 
+ + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + - `labels` -> keyword, str, numeric, Iterable: index or labels to drop + + **Attrs** + + - `:axis` -> int, default 0: 0 = rows, 1 = columns + - `:level` -> numeric, keyword, str: level to drop from multi index + - `:errors` -> {`:ignore` `:raise`}, default `:raise`: ignore or raise errors + + **Examples** + + ``` + (require-python '[numpy :as np]) + (def df + (data-frame + (np/reshape (np/arange 12) [3 4]) + {:columns [:A :B :C :D]})) + + (drop df [:B :C] {:axis 1}) + ;; A D + ;; 0 0 3 + ;; 1 4 7 + ;; 2 8 11 + + (drop df [0 1]) + ;; A B C D + ;; 2 8 9 10 11 + ``` + " + [df-or-srs labels & [{:keys [axis level errors] :as attrs}]] + (u/kw-call df-or-srs "drop" labels attrs)) + +(defn drop-rows + "A shorthand for `(drop df [0 2] {:axis 0})` + + See [[drop]] docs for more info" + [df rows & [{:keys [level errors] :as attrs}]] + (drop df rows (merge attrs {:axis 0}))) + +(defn drop-cols + "A shorthand for `(drop df [:A :C] {:axis 1})` + + See [[drop]] docs for more info" + [df cols & [{:keys [level errors] :as attrs}]] + (drop df cols (merge attrs {:axis 1}))) + (defn dropna - "Drop missing values" + "Drop missing values. + + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + + **Attrs** + + - `:axis` -> int, default 0: 0 = rows, 1 = columns + - `:how` -> {`:any` `:all`}, default `:any`: drop when there are `:any` missing + values, or `:all` missing values + - `:thresh` -> numeric: require `:thresh` missing values to drop + - `:subset` -> Iterable: the subset to consider on opposite axis; e.g. 
if + you drop rows `:subset` are the columns to consider for dropping + + **Examples** + + ``` + (def df + (data-frame {:name [:Alfred :Batman :Robin] + :toy [nil :Batmobile :Whip] + :born [nil :1940-04-25 nil]}) + + (dropna df) + ;; name toy born + ;; 1 Batman Batmobile 1940-04-25 + ``` + " [df-or-srs & [{:keys [axis how thresh subset] :as attrs}]] (u/simple-kw-call df-or-srs "dropna" attrs)) (defn melt - [df & [attrs]] + "Unpivot a `data-frame` from wide format to long format. + + Basically reshape the `data-frame` to have one row per observation and one + column per variable + + **Arguments** + + - `df` -> `data-frame` + + **Attrs** + + - `:id-vars` -> Iterable: columns to use as identifiers + - `:value-vars` -> Iterable: columns to melt (unpivot), if not specified uses + all the columns not in `:id-vars` + - `:var-name` -> keyword, str, default `:variable`: name for the variable column + - `:value-name` -> keyword, str, default `:value`: name for the value column + - `:col-level` -> numeric, str: the level to use for melting + + **Examples** + + ``` + (def df + (transpose + (data-frame [[:a :b :c] [1 3 5] [2 4 6]] + {:columns [0 1 2] + :index [:A :B :C]}))) + + (melt df) + ;; variable value + ;; 0 A a + ;; 1 A b + ;; 2 A c + ;; 3 B 1 + ;; 4 B 3 + ;; 5 B 5 + ;; 6 C 2 + ;; 7 C 4 + ;; 8 C 6 + + (melt df {:id-vars [:A] :value-vars [:B]}) + ;; A variable value + ;; 0 a B 1 + ;; 1 b B 3 + ;; 2 c B 5 + + (melt df {:id-vars [:A] :value-vars [:B :C]}) + ;; A variable value + ;; 0 a B 1 + ;; 1 b B 3 + ;; 2 c B 5 + ;; 3 a C 2 + ;; 4 b C 4 + ;; 5 c C 6 + ``` + " + [df & [{:keys [id-vars value-vars var-name + value-name col-level] :as attrs}]] (u/simple-kw-call df "melt" attrs)) (defn assign + "Assign new columns to `df-or-srs` + + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + - `cols` -> map: either a map `{:col-name value}`, or a map `{:col-name fn}` + + **Examples** + + ``` + (def df + (transpose + (data-frame [[:a :b :c] [1 3 5] [2 4 6]] + {:columns [0 1 
2] + :index [:A :B :C]}))) + + (assign df {:D 3}) + ;; A B C D + ;; 0 a 1 2 3 + ;; 1 b 3 4 3 + ;; 2 c 5 6 3 + + (assign df {:D [1 2 3]}) + ;; A B C D + ;; 0 a 1 2 1 + ;; 1 b 3 4 2 + ;; 2 c 5 6 3 + + (assign df {:D #(-> (subset-cols % :C) (mul 2))}) + ;; A B C D + ;; 0 a 1 2 4 + ;; 1 b 3 4 8 + ;; 2 c 5 6 12 + ``` + " [df-or-srs cols] - (u/simple-kw-call df-or-srs "assign" - (u/keys->pyargs cols))) + (py/call-attr-kw df-or-srs "assign" [] cols)) + +(defn stack + "Stack the prescribed level(s) from columns to index. + + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + + **Attrs** + + - `:level` -> numeric, keyword, str, default -1: level to stack + - `:dropna` -> bool, default true: drop rows with missing values if generated + + **Examples** + + ``` + (def df + (data-frame [[0 1] [2 3]] + {:index [:cat :dog] + :columns [:weight :height]})) + + (stack df) + ;; cat weight 0 + ;; height 1 + ;; dog weight 2 + ;; height 3 + ;; dtype: int64 + ``` + " + [df-or-srs & [{:keys [level dropna] :as attrs}]] + (u/simple-kw-call df-or-srs "stack" attrs)) (defn unstack - [df-or-srs & [attrs]] + "Pivot a level of the (necessarily hierarchical) index labels, + returning a DataFrame having a new level of column labels whose inner-most + level consists of the pivoted index labels. 
+ + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + + **Attrs** + + - `:level` -> numeric, keyword, str, default -1: level to unstack + - `:fill-value` -> any: replace missing values produced by `unstack` with this + + **Examples** + + ``` + (def s + (stack + (data-frame [[1 2] [3 4]] + {:index [:one :two] + :columns [:a :b]}))) + + (unstack s) + ;; a b + ;; one 1 2 + ;; two 3 4 + + (unstack s {:level 0}) + ;; one two + ;; a 1 3 + ;; b 2 4 + + (unstack (unstack s {:level 0})) + ;; one a 1 + ;; b 2 + ;; two a 3 + ;; b 4 + ;; dtype: int64 + ``` + " + [df-or-srs & [{:keys [level fill_value] :as attrs}]] (u/simple-kw-call df-or-srs "unstack" attrs)) (defn transpose - "Transpose the given panthera object" + "Transpose the given panthera object + + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + + **Examples** + + ``` + (def df (data-frame [[1 2 3] [4 5 6] [7 8 9]])) + + (transpose df) + ;; 0 1 2 + ;; 0 1 4 7 + ;; 1 2 5 8 + ;; 2 3 6 9 + ``` + " [df-or-srs] - (py/call-attr df-or-srs "transpose")) + (py/get-attr df-or-srs "T")) diff --git a/src/panthera/pandas/utils.clj b/src/panthera/pandas/utils.clj index 3450b23..fab23f4 100644 --- a/src/panthera/pandas/utils.clj +++ b/src/panthera/pandas/utils.clj @@ -2,15 +2,11 @@ (:require [libpython-clj.python :as py] [libpython-clj.require :refer [require-python]] - [camel-snake-kebab.core :as csk] [camel-snake-kebab.extras :as cske] [clojure.core.memoize :as m])) -(py/initialize!) - (require-python '[builtins :as bt]) -;(defonce builtins (py/import-module "builtins")) (defonce pd (py/import-module "pandas")) (defn slice @@ -60,6 +56,17 @@ ([obj & objs] (map pytype (concat (vector obj) objs)))) +(def pystr->cljk + (comp + keyword + #(clojure.string/replace % #"_" "-") + #(clojure.string/replace % #" " "-"))) + +(def cljk->pystr + (comp + #(clojure.string/replace % #"-" "_") + name)) + (def memo-key-converter "Convert regular Clojure kebab-case keys to idiomatic Python snake_case strings. 
@@ -69,7 +76,7 @@ ``` (memo-key-converter :a-key) ; \"a_key\" ```" - (m/fifo csk/->snake_case_string {} :fifo/threshold 512)) + (m/fifo #(if (keyword? %) (cljk->pystr %) %) {} :fifo/threshold 512)) (def memo-columns-converter "Converts Python strings to idiomatic Clojure keys. @@ -79,14 +86,14 @@ ``` (memo-columns-converter \"a_name\") ; :a-name - (memo-columns-converter \"ALL_CAPS\") ; :all-caps + (memo-columns-converter \"ALL_CAPS\") ; :ALL-CAPS ```" (m/fifo #(cond (number? %) % - (string? %) (csk/->kebab-case-keyword %) + (string? %) (pystr->cljk %) (nil? %) nil - :else (mapv csk/->kebab-case-keyword %)) {} :fifo/threshold 512)) + :else (mapv pystr->cljk %)) {} :fifo/threshold 512)) (defn vec->pylist "Converts an iterable Clojure data structure to a Python list @@ -221,6 +228,33 @@ (lazy-seq (py/get-attr obj "values")) cnt))) +(defmulti kwrds? + (fn [obj keywords?] (boolean keywords?))) + +(defmethod kwrds? true + [obj keywords?] + (if (series? obj) + (let [nm (memo-columns-converter + (or (py/get-attr obj "name") + "unnamed"))] + (into [] (map #(assoc {} nm %)) + (vec obj))) + (let [ks (map memo-columns-converter + (py/get-attr obj "columns"))] + (into [] (map #(zipmap ks %)) + (py/get-attr obj "values"))))) + +(defmethod kwrds? false + [obj keywords?] + (if (series? obj) + (let [nm (or (py/get-attr obj "name") + "unnamed")] + (into [] (map #(assoc {} nm %)) + (vec obj))) + (let [ks (py/get-attr obj "columns")] + (into [] (map #(zipmap ks %)) + (py/get-attr obj "values"))))) + (defn ->clj "Convert the given panthera data-frame or series to a Clojure vector of maps. 
The idea is to have a common, simple and fast access point to conversion of @@ -240,7 +274,10 @@ **Arguments** - `df-or-srs` -> `data-frame` or `series` - - `full?` -> whether to use the full conversion + - `:full?` -> whether to use the full conversion, default false + - `:keywords?` -> wether to convert column names to keywords, default true + + N.B.: `:full?` usage excludes `:keywords?` **Examples** @@ -250,19 +287,10 @@ (->clj my-df) ``` " - [df-or-srs & [full?]] + [df-or-srs & {:keys [full? keywords?] :or {keywords? true}}] (if full? (to-clj df-or-srs) - (if (series? df-or-srs) - (let [nm (memo-columns-converter - (or (py/get-attr df-or-srs "name") - "unnamed"))] - (into [] (map #(assoc {} nm %)) - (vec df-or-srs))) - (let [ks (map memo-columns-converter - (py/get-attr df-or-srs "columns"))] - (into [] (map #(zipmap ks %)) - (py/get-attr df-or-srs "values")))))) + (kwrds? df-or-srs keywords?))) (defn simple-kw-call "Helper for a cleaner access to `call-attr-kw` from `libpython-clj`" diff --git a/src/panthera/panthera.clj b/src/panthera/panthera.clj index df25a53..79bf6b4 100644 --- a/src/panthera/panthera.clj +++ b/src/panthera/panthera.clj @@ -1,6 +1,6 @@ (ns panthera.panthera (:refer-clojure - :exclude [mod any?]) + :exclude [mod any? drop]) (:require [tech.parallel.utils :refer [export-symbols]] [panthera.pandas.generics] @@ -49,11 +49,12 @@ shape fill-na nbytes - ftype) + ftype + rename + to-excel) (export-symbols panthera.pandas.math - same? dot ne quantile @@ -64,12 +65,12 @@ add sum diff + same? 
ge cumprod clip cumsum eq - compound mean corr sub @@ -126,16 +127,18 @@ crosstab cut rolling + unstack concatenate remap - factorize + transpose qcut merge-ordered dropna merge-asof + assign ewm groupby melt - assign - unstack - transpose) + drop + drop-rows + drop-cols) diff --git a/test/panthera/config.clj b/test/panthera/config.clj new file mode 100644 index 0000000..ea80a13 --- /dev/null +++ b/test/panthera/config.clj @@ -0,0 +1,8 @@ +(ns panthera.config + (:require + [libpython-clj.python :as py])) + +(defn start-python! + [f] + (py/initialize!) + (f)) diff --git a/test/panthera/generics_test.clj b/test/panthera/generics_test.clj index 1487138..ea56a45 100644 --- a/test/panthera/generics_test.clj +++ b/test/panthera/generics_test.clj @@ -153,10 +153,11 @@ (are [n o] (= (u/->clj (g/head - (g/data-frame - (flatten - (repeat 5 [{:a 1 :b 2} - {:a 2 :b 3}]))) n)) + (g/data-frame + (vec + (flatten + (repeat 5 [{:a 1 :b 2} + {:a 2 :b 3}])))) n)) o) nil (drop-last (flatten (repeat 3 [{:a 1 :b 2} @@ -176,8 +177,7 @@ o) [{:a 1}] [:a] [{:a 1}] [{:a 1 :b 2 :c 3}] [:a :c] [{:a 1 :c 3}] - (vec (repeat 5 {:a 1 :b 2})) [:b] (vec (repeat 5 {:b 2})) - [{:wEiR__.D 1 :b 2}] [:wEiR__.D] [{:w-ei-r-.-d 1}])) + (vec (repeat 5 {:a 1 :b 2})) [:b] (vec (repeat 5 {:b 2})))) (deftest n-largest (are [m o] diff --git a/test/panthera/reshape_test.clj b/test/panthera/reshape_test.clj index f51a760..ecb4072 100644 --- a/test/panthera/reshape_test.clj +++ b/test/panthera/reshape_test.clj @@ -1,13 +1,18 @@ (ns panthera.reshape-test + (:refer-clojure + :exclude [drop]) (:require [clojure.test :refer :all] [libpython-clj.python :as py] + [libpython-clj.require :refer [require-python]] [panthera.pandas.utils :as u :reload true] [panthera.pandas.generics :as g] [panthera.pandas.reshape :as r :reload true] [panthera.pandas.math :as m :reload true] [panthera.pandas.conversion :as c])) +(require-python '[numpy :as np]) + (defn filter-nan [d] (into [] (comp (mapcat vals) (filter (complement #(.isNaN 
%)))) d)) @@ -25,9 +30,9 @@ :aggfunc :mean} [10.0 25.0]) (is (= (u/->clj (r/crosstab [[1 2 2]] {:columns [[:a :b :a]] :margins true})) - [{:a 1 :b 0 :all 1} - {:a 1 :b 1 :all 2} - {:a 2 :b 1 :all 3}]))) + [{:a 1 :b 0 :All 1} + {:a 1 :b 1 :All 2} + {:a 2 :b 1 :All 3}]))) (deftest pivot (are [d o] @@ -284,13 +289,13 @@ {:columns [:A :B :C]}) v d) o) - :sum {} (g/series [12 15 18] {:index [:A :B :C]}) + :sum {} (g/series [12.0 15 18] {:index [:A :B :C]}) [:sum :min] {} (g/data-frame - {:A [12 1] :B [15 2] :C [18 3]} + {:A [12.0 1] :B [15.0 2] :C [18.0 3]} {:index [:sum :min]}) - :sum {:axis 1} (g/series [6 15 24 0]))) + :sum {:axis 1} (g/series [6.0 15 24 0]))) (deftest remap (are [in mpgs ign o] @@ -400,3 +405,88 @@ {:name "Robin", :toy "Whip", :born nil}] {:index [1 2]} {:subset [:toy]} [{:name "Batman", :toy "Batmobile", :born "1940-04-25"} {:name "Robin", :toy "Whip", :born nil}] {:index [1 2]})) + +(deftest drop + (are [l d o df] + (m/same? + (r/drop + (g/data-frame + (py/$a (np/arange 12) np/reshape [3 4]) + {:columns [:A :B :C :D]}) l d) + (g/data-frame o df)) + [:B :C] {:axis 1} [{:A 0 :D 3} {:A 4 :D 7} {:A 8 :D 11}] {} + [0 1] {} [{"A" 8 "B" 9 "C" 10 "D" 11}] {:index [2]})) + +(deftest melt + (are [d o df] + (m/same? + (r/melt + (r/transpose + (g/data-frame [[:a :b :c] [1 3 5] [2 4 6]] + {:columns [0 1 2] + :index [:A :B :C]})) + d) + (g/data-frame o df)) + + {} [{:variable "A", :value "a"} + {:variable "A", :value "b"} + {:variable "A", :value "c"} + {:variable "B", :value 1} + {:variable "B", :value 3} + {:variable "B", :value 5} + {:variable "C", :value 2} + {:variable "C", :value 4} + {:variable "C", :value 6}] {} + + {:id-vars [:A] + :value-vars [:B]} [{:A "a", :variable "B", :value 1} + {:A "b", :variable "B", :value 3} + {:A "c", :variable "B", :value 5}] {:dtype np/object})) + +(deftest assign + (are [i o d] + (m/same? 
+ (-> (g/data-frame [[:a 1 2] [:b 3 4] [:c 5 6]] + {:columns [:A :B :C]}) + (r/assign i)) + (g/data-frame o d)) + + {:D 3} [{:A "a", :B 1, :C 2, :D 3} + {:A "b", :B 3, :C 4, :D 3} + {:A "c", :B 5, :C 6, :D 3}] {} + + {:D [1 2 3]} [{:A "a", :B 1, :C 2, :D 1} + {:A "b", :B 3, :C 4, :D 2} + {:A "c", :B 5, :C 6, :D 3}] {} + + {:D #(-> (g/subset-cols % :C) + (m/mul 2))} [{:A "a", :B 1, :C 2, :D 4} + {:A "b", :B 3, :C 4, :D 8} + {:A "c", :B 5, :C 6, :D 12}] {})) + +(deftest stack + (is (m/same? + (r/stack + (g/data-frame [[0 1] [2 3]] + {:index [:cat :dog] + :columns [:weight :height]})) + (g/series [0 1 2 3] + {:index [[:cat :cat :dog :dog] + [:weight :height :weight :height]]})))) + +(deftest unstack + (are [d o df] + (m/same? + (r/unstack + (r/stack + (g/data-frame [[1 2] [3 4]] + {:index [:one :two] + :columns [:a :b]})) d) + (g/data-frame o df)) + {} [{:a 1 :b 2} {:a 3 :b 4}] {:index [:one :two]} + {:level 0} [{:one 1, :two 3} {:one 2, :two 4}] {:index [:a :b]})) + +(deftest transpose + (is (m/same? + (r/transpose (g/data-frame [[1 2 3] [4 5 6] [7 8 9]])) + (g/data-frame [[1 4 7] [2 5 8] [3 6 9]])))) diff --git a/test/panthera/utils_test.clj b/test/panthera/utils_test.clj index ce818f9..f41ec68 100644 --- a/test/panthera/utils_test.clj +++ b/test/panthera/utils_test.clj @@ -1,9 +1,12 @@ (ns panthera.utils-test (:require - [clojure.test :refer :all] + [clojure.test :refer :all] + [panthera.config :refer [start-python!]] [libpython-clj.python :as py] [panthera.pandas.utils :as u])) +(use-fixtures :once start-python!) + (deftest pytype (are [t d] (identical? 
t (u/pytype d)) @@ -44,8 +47,7 @@ {} {} {:a 1} {"a" 1} {:a 1 :b 2} {"a" 1 "b" 2} - {:a-k 1} {"a_k" 1} - {(keyword "with spaces") 1} {"with_spaces" 1})) + {:a-k 1} {"a_k" 1})) (deftest memo-columns-converter (are [i o] @@ -54,7 +56,9 @@ nil nil "a" :a "col_1" :col-1 - ["multi" "col"] [:multi :col])) + ["multi" "col"] [:multi :col] + "ALL_CAPS" :ALL-CAPS + "WeIrD_caPs" :WeIrD-caPs)) (deftest ->clj (is (= (u/->clj From 25cc1d01e705c582435eb45f31409a51a931c81f Mon Sep 17 00:00:00 2001 From: Alan Marazzi Date: Wed, 29 Jan 2020 16:10:23 +0100 Subject: [PATCH 12/12] Finalize docs & tests --- README.md | 229 ++-- examples/panthera-intro.ipynb | 19 +- project.clj | 21 +- src/panthera/numpy.clj | 266 ++-- src/panthera/pandas/math.clj | 464 +++---- src/panthera/pandas/reshape.clj | 2020 +++++++++++++++---------------- src/panthera/pandas/utils.clj | 610 +++++----- src/panthera/panthera.clj | 290 ++--- test/panthera/config.clj | 16 +- test/panthera/generics_test.clj | 772 ++++++------ test/panthera/utils_test.clj | 144 +-- 11 files changed, 2387 insertions(+), 2464 deletions(-) diff --git a/README.md b/README.md index 2715ef8..a5263fd 100644 --- a/README.md +++ b/README.md @@ -1,149 +1,80 @@ -# panthera - -![panthera-logo](https://github.com/alanmarazzi/panthera/blob/master/resources/panthera.png) - -> **Hic sunt leones** - -Latin phrase reported on many maps indicating *Terra incognita*, unexplored or harsh land. - -## What - -Dataframes in Clojure. Through [pandas](https://github.com/pandas-dev/pandas). On Python. - -## Disclaimer - -This is very alpha, things will change fast, will break and the API is neither complete, nor settled. Since a few people have started playing with this there's a Clojars project available. Please give feedback if you're using this, every kind of contribution is appreciated (for more info check the [Contributing](#contributing) section). At the moment everything is mostly undocumented and untested, I'm currently adding them. 
- -[![Clojars Project](https://img.shields.io/clojars/v/panthera.svg)](https://clojars.org/panthera) - -## Get started - -**Panthera** uses the great [libpython-clj](https://github.com/cnuernber/libpython-clj) as a backend to access Python and get [pandas](https://github.com/pandas-dev/pandas) and [numpy](https://github.com/numpy/numpy) functionality. - -### System level - -If you usually don't develop in Python then a system level install might be a good solution (though always discouraged), if this is your case then follow the subsequent steps. - -To get started you need python, pandas and numpy (the latter comes with the former) on your path. Usually a: - -```bash -sudo apt install libpython3.6-dev -pip3 install numpy pandas xlrd # the latter is for Excel files, if you don't care you can do without -``` - -### Environments - -If you want to have different Python environments, then getting **panthera** to work correctly is a bit more tricky. - -First create your new environment with at least python=3.6, numpy and pandas. (This was tested both on GNU/Linux and WSL with [conda](https://docs.conda.io/projects/conda/en/latest/), but there's no reason why it shouldn't work with other env management tools. On other systems, [Docker is your best bet](https://github.com/scicloj/docker-hub/tree/master/panthera)): - -```bash -conda create -n panthera python=3.6 numpy pandas -``` - -Then check the path to the newly created environment: - -```bash -conda activate panthera -which python -``` - -Now you just have to add to one of your profiles the path to the wanted python executable: - -```bash -{:dev {:resource-paths ["/home/user/miniconda3/envs/panthera"]}} -``` - -You can create different profiles with different paths according to what you need. 
Now if you want to make it possible to work with **panthera** without having to activate your environments you have 2 choices: - -- assign `PYTHONHOME` env variable to your environment - -```bash -PYTHONHOME="/home/user/miniconda3/envs/panthera" lein whatever -``` - -- assign `PYTHONHOME` env variable before requiring **panthera** - -```bash -(System/setProperty "PYTONHOME" "/home/user/miniconda3/envs/panthera") -``` - -### The actual code - -After this you can start playing around with **panthera** - -```clojure -(require '[panthera.panthera :as pt]) - -(-> (pt/read-csv "mycsv.csv") - (pt/subset-cols "Col1" "Col2" "Col3") - pt/median) -``` - -The above chain will read your csv file as a DataFrame, select only the given columns and then return a Series with the median of each column. - -`panthera.panthera` is the home of the main API, and you can find everything there. The advice is to never `:use` or `:refer :all` the namespace because there are some functions named as core Clojure functions such as `mod` which in this case does the same thing as the core one, but in this case it is vectorized and it works only if the first argument is a Python object. - -## Numpy - -All of Numpy is wrapped and accessible through a single interface from `panthera.numpy`. - -```clojure -(require '[panthera.numpy :refer [npy doc]]) - -(npy :power {:args [[1 2 3] 3]}) -;=> [1 8 27] - -(npy :power) -; This arity returns the actual numpy object that can be passed around to other functions as an argument -``` - -To access functions inside submodules pass to `npy` a sequence of keys leading to the wanted function: - -```clojure -(npy [:linalg :svd] {:args [[1 2 3] [4 5 6]]}) -``` - -You can check the original docstring for every module and function with the `doc` helper - -```clojure -(doc :power) - -(doc [:linalg :eigh]) -``` - -To see what is available and how everything works check the [official docs](https://docs.scipy.org/doc/numpy/reference/) online. 
- -## Contributing - -Please let me know about any issues, quirks, ideas or even just to say that you're doing something cool with this! I accept issues, PRs or direct messages (you can find me also on https://clojurians.slack.com and on https://clojurians.zulipchat.com). - -## Examples - -You can find some examples in the [examples](https://github.com/alanmarazzi/panthera/tree/master/examples) folder. At the moment that's the best way to start with panthera. - -- [panthera intro](https://github.com/alanmarazzi/panthera/blob/master/examples/panthera-intro.ipynb) ([nbviewer](https://nbviewer.jupyter.org/github/alanmarazzi/panthera/blob/master/examples/panthera-intro.ipynb)) -- [basic concepts (serieses & data-frames)](https://github.com/alanmarazzi/panthera/blob/master/examples/basic-concepts.ipynb) ([nbviewer](https://nbviewer.jupyter.org/github/alanmarazzi/panthera/blob/master/examples/basic-concepts.ipynb)) -- [general Python package wrapper](https://github.com/alanmarazzi/panthera/blob/master/src/panthera/numpy.clj#L84) - an example about how to use panthera to wrap other Python libraries - -## Why "panthera"? - -Pandas is derived from "panel data" and somehow is supposed to mean "Python data analysis library" as well. Though it shouldn't have nothing to do with the cute Chinese bears, there are [logos showing a bear](https://michaelsaruggia.com/wp-content/uploads/2019/03/pandas-python.jpg). - -Panthera doesn't pretend to be a clever wordplay because it doesn't need to. First off [panthera is latin](https://en.wiktionary.org/wiki/panthera) and it literally means "large cat", second though pandas are surely cute, pantherae are way cooler (and [snow leopards](https://en.wikipedia.org/wiki/Snow_leopard) also happen to be among the very few predators of pandas, but that's just a case...). 
- -## Special thanks - -- [libpython-clj](https://github.com/cnuernber/libpython-clj) -- [pandas](https://pandas.pydata.org/) -- [numpy](https://www.numpy.org/) -- [clojure](https://clojure.org/) -- [logo](https://www.vecteezy.com) - -## License - -Copyright © 2019 Alan Marazzi - -This program and the accompanying materials are made available under the -terms of the Eclipse Public License 2.0 which is available at -http://www.eclipse.org/legal/epl-2.0. +# panthera + +![panthera-logo](https://github.com/alanmarazzi/panthera/blob/master/resources/panthera.png) + +> **Hic sunt leones** + +Latin phrase reported on many maps indicating *Terra incognita*, unexplored or harsh land. + +## What + +Dataframes in Clojure. Through [pandas](https://github.com/pandas-dev/pandas). On Python. + +## Disclaimer + +This is alpha, things will change fast, will break and the API is neither complete, nor settled. Since a few people have started playing with this there's a Clojars project available. Please give feedback if you're using this, every kind of contribution is appreciated (for more info check the [Contributing](#contributing) section). At the moment everything is mostly undocumented and untested, I'm currently adding them. + +[![Clojars Project](https://img.shields.io/clojars/v/panthera.svg)](https://clojars.org/panthera) + +## Get started + +**Panthera** uses the great [libpython-clj](https://github.com/cnuernber/libpython-clj) as a backend to access Python and get [pandas](https://github.com/pandas-dev/pandas) and [numpy](https://github.com/numpy/numpy) functionality. + +### N.B.: check [libpython-clj](https://github.com/cnuernber/libpython-clj) repo on how to install and start a Clojure/Python session. + +### The actual code + +After this you can start playing around with **panthera** + +```clojure +(require '[panthera.panthera :as pt] + '[libpython-clj.python :refer [initialize!]]) + +(initialize!) 
+ +(-> (pt/read-csv "mycsv.csv") + (pt/subset-cols "Col1" "Col2" "Col3") + pt/median) +``` + +The above chain will read your csv file as a DataFrame, select only the given columns and then return a Series with the median of each column. + +`panthera.panthera` is the home of the main API, and you can find everything there. The advice is to never `:use` or `:refer :all` the namespace, because some of its functions share their names with core Clojure functions. For example `mod` does the same thing as the core one, but it is vectorized and it works only if the first argument is a Python object. + +## NumPy + +All of NumPy is accessible through [libpython-clj](https://github.com/cnuernber/libpython-clj) interop; check the repo for more info. + +## Contributing + +Please let me know about any issues, quirks, ideas or even just to say that you're doing something cool with this! I accept issues, PRs or direct messages (you can find me also on https://clojurians.slack.com and on https://clojurians.zulipchat.com). + +## Examples + +You can find some examples in the [examples](https://github.com/alanmarazzi/panthera/tree/master/examples) folder. At the moment that's the best way to start with panthera. + +- [panthera intro](https://github.com/alanmarazzi/panthera/blob/master/examples/panthera-intro.ipynb) ([nbviewer](https://nbviewer.jupyter.org/github/alanmarazzi/panthera/blob/master/examples/panthera-intro.ipynb)) +- [basic concepts (serieses & data-frames)](https://github.com/alanmarazzi/panthera/blob/master/examples/basic-concepts.ipynb) ([nbviewer](https://nbviewer.jupyter.org/github/alanmarazzi/panthera/blob/master/examples/basic-concepts.ipynb)) +- [general Python package wrapper](https://github.com/alanmarazzi/panthera/blob/master/src/panthera/numpy.clj#L84) - an example about how to use panthera to wrap other Python libraries + +## Why "panthera"? 
+ +Pandas is derived from "panel data" and somehow is supposed to mean "Python data analysis library" as well. Though it shouldn't have anything to do with the cute Chinese bears, there are [logos showing a bear](https://michaelsaruggia.com/wp-content/uploads/2019/03/pandas-python.jpg). + +Panthera doesn't pretend to be a clever wordplay because it doesn't need to. First off [panthera is Latin](https://en.wiktionary.org/wiki/panthera) and it literally means "large cat", second though pandas are surely cute, pantherae are way cooler (and [snow leopards](https://en.wikipedia.org/wiki/Snow_leopard) also happen to be among the very few predators of pandas, but that's just a coincidence...). + +## Special thanks + +- [libpython-clj](https://github.com/cnuernber/libpython-clj) +- [pandas](https://pandas.pydata.org/) +- [numpy](https://www.numpy.org/) +- [clojure](https://clojure.org/) +- [logo](https://www.vecteezy.com) + +## License + +Copyright © 2020 Alan Marazzi + +This program and the accompanying materials are made available under the +terms of the Eclipse Public License 2.0 which is available at +http://www.eclipse.org/legal/epl-2.0. 
diff --git a/examples/panthera-intro.ipynb b/examples/panthera-intro.ipynb index 919c5bd..6f6a4b2 100644 --- a/examples/panthera-intro.ipynb +++ b/examples/panthera-intro.ipynb @@ -71,20 +71,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "nil" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "(require '[panthera.panthera :as pt])" ] @@ -3913,9 +3902,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Clojure (clojupyter-v0.2.2)", + "display_name": "Lein-Clojure", "language": "clojure", - "name": "clojupyter" + "name": "lein-clojure" }, "language_info": { "file_extension": ".clj", diff --git a/project.clj b/project.clj index b104d87..67451db 100644 --- a/project.clj +++ b/project.clj @@ -1,10 +1,11 @@ -(defproject panthera "0.1-alpha.17" - :description "Data Frames in Clojure (with Pandas) + NumPy" - :url "https://github.com/alanmarazzi/panthera" - :scm {:name "git" :url "https://github.com/alanmarazzi/panthera"} - :license {:name "EPL-2.0" - :url "https://www.eclipse.org/legal/epl-2.0/"} - :dependencies [[cnuernber/libpython-clj "1.28"] - [org.clojure/core.memoize "0.7.2"]] - :profiles {:dev {:dependencies [[org.clojure/clojure "1.10.1"]]}}) - +(defproject panthera "0.1-alpha.17" + :description "Data Frames in Clojure (with Pandas) + NumPy" + :url "https://github.com/alanmarazzi/panthera" + :scm {:name "git" :url "https://github.com/alanmarazzi/panthera"} + :license {:name "EPL-2.0" + :url "https://www.eclipse.org/legal/epl-2.0/"} + :dependencies [[cnuernber/libpython-clj "1.32"] + [org.clojure/core.memoize "0.7.2"]] + :profiles {:dev {:dependencies [[org.clojure/clojure "1.10.1"]] + :plugins [[lein-jupyter "0.1.16"]]}}) + diff --git a/src/panthera/numpy.clj b/src/panthera/numpy.clj index 4113746..d9fdbae 100644 --- a/src/panthera/numpy.clj +++ b/src/panthera/numpy.clj @@ -1,133 +1,133 
@@ -(ns panthera.numpy - (:require - [libpython-clj.python :as py] - [panthera.pandas.utils :as u])) - -(defonce numpy (py/import-module "numpy")) - -(defn py-get-in - "A similar to `get-in` implementation for Python modules, - classes and functions." - [py-module v] - (let [mods (drop-last v)] - ((apply comp - (reverse - (map (fn [x] #(py/get-attr % x)) mods))) py-module))) - -(defn doc - "Use this to see modules and functions original docstrings. - - **Examples** - - ``` - (doc :power) - - (doc :linalg) - - (doc [:linalg :svd]) - ```" - [ks] - (if (seqable? ks) - (println - (py/get-attr - (py/get-attr - (py-get-in numpy ks) - (last ks)) - "__doc__")) - (println (py/get-attr (py/get-attr numpy ks) "__doc__")))) - -(defn module - [py-module] - (fn [x] - (fn - ([] - (if (seqable? x) - (let [ks (map u/memo-key-converter x)] - (py/get-attr (py-get-in py-module ks) (last ks))) - (py/get-attr py-module (u/memo-key-converter x)))) - ([attrs] - (if (seqable? x) - (let [ks (map u/memo-key-converter x)] - (py/call-attr-kw (py-get-in py-module ks) (last ks) - (vec (:args attrs)) - (u/keys->pyargs (dissoc attrs :args)))) - (py/call-attr-kw py-module (u/memo-key-converter x) - (vec (:args attrs)) - (u/keys->pyargs (dissoc attrs :args)))))))) - -(defn npy - "General method to access Numpy functions and attributes. - - By calling `(npy k)` you get either the value associated with that attribute - (such as `(npy :nan)`) or the native Python function associated with that key. - This is useful to pass functions around to other methods. - - By calling `(npy k {:args [my-args] :other-arg 2})` you're calling that method - with the given arguments. `:args` is a conveniency argument to pass positional - arguments to functions in the same order as you'd pass them to Numpy. - This is because many Numpy functions have native C implementations that - accept only positional arguments. 
- - For example `(npy :power {:args [[1 2] 2]})` will give back as a result - `[1 4]` because we square (second element of `:args`) all the elements in the - given `Iterable` (first element of `:args`) - - - If you need to access a function in a submodule just pass a sequence of keys - to `npy`, such as `(npy [:linalg :svd])`. The functioning of this is the same - as above, but you'll be acting on the `:svd` function inside the `:linalg` - submodule." - ([k] (((module numpy) k))) - ([k attrs] (((module numpy) k) attrs))) - - -(comment - "An example on how to wrap another Python library, in this case scikit-learn" - - ; sklearn architecture is very convoluted, modules aren't loaded by default - ; but only by explicit import. So we import everything as below - (py/run-simple-string "from sklearn import *") - (defonce sk (py/import-module "sklearn")) - - (defn sklearn - ([k] ((module sk) k)) - ([k args] (((module sk) k) args))) - - (def pokemon (pt/read-csv "resources/pokemon.csv")) - - (def split (sklearn [:model_selection :train_test_split] - {:args [(pt/subset-cols pokemon - "HP" "Attack" - "Defense" "Sp. Atk" - "Sp. 
Def" "Speed") - (pt/subset-cols pokemon "Legendary")] - :test_size 0.3})) - - (defn train-test - [split k] - ((k {:x-train first - :x-test second - :y-train #(% 2) - :y-test last}) split)) - - (def logistic (sklearn [:linear_model :LogisticRegression] - {:n_jobs -1 :solver "lbfgs"})) - - (defn fit - [model x y] - (py/call-attr model "fit" x y)) - - (def model (fit logistic (train-test split :x-train) - (train-test split :y-train))) - - (defn predict - [model x] - (py/call-attr model "predict" x)) - - (predict model (train-test split :x-test)) - - (defn score - [model x y] - (py/call-attr model "score" x y)) - - (score model (train-test split :x-test) (train-test split :y-test))) +(ns panthera.numpy + (:require + [libpython-clj.python :as py] + [panthera.pandas.utils :as u])) + +(defonce numpy (py/import-module "numpy")) + +(defn py-get-in + "A similar to `get-in` implementation for Python modules, + classes and functions." + [py-module v] + (let [mods (drop-last v)] + ((apply comp + (reverse + (map (fn [x] #(py/get-attr % x)) mods))) py-module))) + +(defn doc + "Use this to see modules and functions original docstrings. + + **Examples** + + ``` + (doc :power) + + (doc :linalg) + + (doc [:linalg :svd]) + ```" + [ks] + (if (seqable? ks) + (println + (py/get-attr + (py/get-attr + (py-get-in numpy ks) + (last ks)) + "__doc__")) + (println (py/get-attr (py/get-attr numpy ks) "__doc__")))) + +(defn module + [py-module] + (fn [x] + (fn + ([] + (if (seqable? x) + (let [ks (map u/memo-key-converter x)] + (py/get-attr (py-get-in py-module ks) (last ks))) + (py/get-attr py-module (u/memo-key-converter x)))) + ([attrs] + (if (seqable? 
x) + (let [ks (map u/memo-key-converter x)] + (py/call-attr-kw (py-get-in py-module ks) (last ks) + (vec (:args attrs)) + (u/keys->pyargs (dissoc attrs :args)))) + (py/call-attr-kw py-module (u/memo-key-converter x) + (vec (:args attrs)) + (u/keys->pyargs (dissoc attrs :args)))))))) + +(defn npy + "General method to access Numpy functions and attributes. + + By calling `(npy k)` you get either the value associated with that attribute + (such as `(npy :nan)`) or the native Python function associated with that key. + This is useful to pass functions around to other methods. + + By calling `(npy k {:args [my-args] :other-arg 2})` you're calling that method + with the given arguments. `:args` is a conveniency argument to pass positional + arguments to functions in the same order as you'd pass them to Numpy. + This is because many Numpy functions have native C implementations that + accept only positional arguments. + + For example `(npy :power {:args [[1 2] 2]})` will give back as a result + `[1 4]` because we square (second element of `:args`) all the elements in the + given `Iterable` (first element of `:args`) + + + If you need to access a function in a submodule just pass a sequence of keys + to `npy`, such as `(npy [:linalg :svd])`. The functioning of this is the same + as above, but you'll be acting on the `:svd` function inside the `:linalg` + submodule." + ([k] (((module numpy) k))) + ([k attrs] (((module numpy) k) attrs))) + + +(comment + "An example on how to wrap another Python library, in this case scikit-learn" + + ; sklearn architecture is very convoluted, modules aren't loaded by default + ; but only by explicit import. 
So we import everything as below + (py/run-simple-string "from sklearn import *") + (defonce sk (py/import-module "sklearn")) + + (defn sklearn + ([k] ((module sk) k)) + ([k args] (((module sk) k) args))) + + (def pokemon (pt/read-csv "resources/pokemon.csv")) + + (def split (sklearn [:model_selection :train_test_split] + {:args [(pt/subset-cols pokemon + "HP" "Attack" + "Defense" "Sp. Atk" + "Sp. Def" "Speed") + (pt/subset-cols pokemon "Legendary")] + :test_size 0.3})) + + (defn train-test + [split k] + ((k {:x-train first + :x-test second + :y-train #(% 2) + :y-test last}) split)) + + (def logistic (sklearn [:linear_model :LogisticRegression] + {:n_jobs -1 :solver "lbfgs"})) + + (defn fit + [model x y] + (py/call-attr model "fit" x y)) + + (def model (fit logistic (train-test split :x-train) + (train-test split :y-train))) + + (defn predict + [model x] + (py/call-attr model "predict" x)) + + (predict model (train-test split :x-test)) + + (defn score + [model x y] + (py/call-attr model "score" x y)) + + (score model (train-test split :x-test) (train-test split :y-test))) diff --git a/src/panthera/pandas/math.clj b/src/panthera/pandas/math.clj index 942a5bb..a511878 100644 --- a/src/panthera/pandas/math.clj +++ b/src/panthera/pandas/math.clj @@ -1,232 +1,232 @@ -(ns panthera.pandas.math - (:refer-clojure - :exclude [mod]) - (:require - [libpython-clj.python :as py] - [panthera.pandas.utils :as u])) - -(defn same? - "This works differently than `eq`: the latter checks equality - value by value, `same?` checks that the given `series`es or `data-frame`s contain - the same exact values. This works even with missing values." 
- [left right] - (py/call-attr left :equals right)) - -(defn- base-math - [k] - (fn [& args] - (reduce - #(py/call-attr - %1 - ({:+ "__add__" - :- "__sub__" - :* "__mul__" - :div "__div__" - :fld "__floordiv__" - :mod "__mod__" - :** "__pow__" - :< "__lt__" - :> "__gt__" - :<= "__le__" - :>= "__ge__" - :!= "__ne__" - := "__eq__" - :dot "__matmul__"} k) - %2) args))) - -(defn ops - [df-or-srs other op & [attrs]] - (u/kw-call - df-or-srs - ({:+ "__add__" - :- "__sub__" - :* "__mul__" - :div "__div__" - :fld "__floordiv__" - :mod "__mod__" - :** "__pow__" - :< "__lt__" - :> "__gt__" - :<= "__le__" - :>= "__ge__" - :!= "__ne__" - := "__eq__" - :dot "__matmul__"} op) - other - attrs)) - -(def add - (base-math :+)) - -(def sub - (base-math :-)) - -(def mul - (base-math :*)) - -(def div - (base-math :div)) - -(def floor-div - (base-math :fld)) - -(def mod - (base-math :mod)) - -(def pow - (base-math :**)) - -(def lt - (base-math :<)) - -(def gt - (base-math :>)) - -(def le - (base-math :<=)) - -(def ge - (base-math :>=)) - -(def eq - (base-math :=)) - -(def ne - (base-math :!=)) - -(def dot - (base-math :dot)) - -(defn abs - [df-or-srs] - (py/call-attr df-or-srs "abs")) - -(defn autocorr - [srs & [lag]] - (py/call-attr srs "autocorr" (or lag 1))) - -(defn between - ([srs left right] - (py/call-attr srs "between" left right)) - ([srs left right inclusive] - (py/call-attr srs "between" left right inclusive))) - -(defn clip - [df-or-srs & [attrs]] - (u/simple-kw-call df-or-srs "clip" attrs)) - -(defn corr - [df-or-srs & args] - (if (= :data-frame (u/pytype df-or-srs)) - (u/simple-kw-call df-or-srs "corr" (first args)) - (u/kw-call df-or-srs "corr" (first args) (second args)))) - -(defn cnt - [df-or-srs & [attrs]] - (u/simple-kw-call df-or-srs "count" attrs)) - -(defn cov - [df-or-srs & args] - (if (= :data-frame (u/pytype df-or-srs)) - (u/simple-kw-call df-or-srs "cov" (first args)) - (u/kw-call df-or-srs "cov" (first args) (second args)))) - -(defn- base-cumulative - [k] - 
(fn [df-or-srs & [attrs]] - (u/simple-kw-call df-or-srs - ({:max "cummax" - :min "cummin" - :prod "cumprod" - :sum "cumsum" - :diff "diff"} k) - attrs))) - -(def cummax - (base-cumulative :max)) - -(def cummin - (base-cumulative :min)) - -(def cumprod - (base-cumulative :prod)) - -(def cumsum - (base-cumulative :sum)) - -(def diff - (base-cumulative :diff)) - -(defn describe - [df-or-srs & [attrs]] - (u/simple-kw-call df-or-srs "describe" attrs)) - -(defn- other-ops - [k] - (fn [df-or-srs & [attrs]] - (u/simple-kw-call df-or-srs - ({:sum "sum" - :kurt "kurtosis" - :mad "mad" - :max "max" - :min "min" - :mean "mean" - :median "median" - :mode "mode" - :pct "pct_change" - :quant "quantile" - :rank "rank" - :round "round" - :sem "sem" - :skew "skew" - :std "std" - :var "var"} k) - attrs))) - -(def sum - (other-ops :sum)) - -(def kurtosis - (other-ops :kurt)) - -(def mean-abs-dev - (other-ops :mad)) - -(def maximum - (other-ops :max)) - -(def minimum - (other-ops :min)) - -(def mean - (other-ops :mean)) - -(def median - (other-ops :median)) - -(def mode - (other-ops :mode)) - -(def pct-change - (other-ops :pct)) - -(def quantile - (other-ops :quant)) - -(def rank - (other-ops :rank)) - -(def round - (other-ops :round)) - -(def sem - (other-ops :sem)) - -(def skew - (other-ops :skew)) - -(def std - (other-ops :std)) - -(def var - (other-ops :var)) +(ns panthera.pandas.math + (:refer-clojure + :exclude [mod]) + (:require + [libpython-clj.python :as py] + [panthera.pandas.utils :as u])) + +(defn same? + "This works differently than `eq`: the latter checks equality + value by value, `same?` checks that the given `series`es or `data-frame`s contain + the same exact values. This works even with missing values." 
+ [left right] + (py/call-attr left :equals right)) + +(defn- base-math + [k] + (fn [& args] + (reduce + #(py/call-attr + %1 + ({:+ "__add__" + :- "__sub__" + :* "__mul__" + :div "__div__" + :fld "__floordiv__" + :mod "__mod__" + :** "__pow__" + :< "__lt__" + :> "__gt__" + :<= "__le__" + :>= "__ge__" + :!= "__ne__" + := "__eq__" + :dot "__matmul__"} k) + %2) args))) + +(defn ops + [df-or-srs other op & [attrs]] + (u/kw-call + df-or-srs + ({:+ "__add__" + :- "__sub__" + :* "__mul__" + :div "__div__" + :fld "__floordiv__" + :mod "__mod__" + :** "__pow__" + :< "__lt__" + :> "__gt__" + :<= "__le__" + :>= "__ge__" + :!= "__ne__" + := "__eq__" + :dot "__matmul__"} op) + other + attrs)) + +(def add + (base-math :+)) + +(def sub + (base-math :-)) + +(def mul + (base-math :*)) + +(def div + (base-math :div)) + +(def floor-div + (base-math :fld)) + +(def mod + (base-math :mod)) + +(def pow + (base-math :**)) + +(def lt + (base-math :<)) + +(def gt + (base-math :>)) + +(def le + (base-math :<=)) + +(def ge + (base-math :>=)) + +(def eq + (base-math :=)) + +(def ne + (base-math :!=)) + +(def dot + (base-math :dot)) + +(defn abs + [df-or-srs] + (py/call-attr df-or-srs "abs")) + +(defn autocorr + [srs & [lag]] + (py/call-attr srs "autocorr" (or lag 1))) + +(defn between + ([srs left right] + (py/call-attr srs "between" left right)) + ([srs left right inclusive] + (py/call-attr srs "between" left right inclusive))) + +(defn clip + [df-or-srs & [attrs]] + (u/simple-kw-call df-or-srs "clip" attrs)) + +(defn corr + [df-or-srs & args] + (if (= :data-frame (u/pytype df-or-srs)) + (u/simple-kw-call df-or-srs "corr" (first args)) + (u/kw-call df-or-srs "corr" (first args) (second args)))) + +(defn cnt + [df-or-srs & [attrs]] + (u/simple-kw-call df-or-srs "count" attrs)) + +(defn cov + [df-or-srs & args] + (if (= :data-frame (u/pytype df-or-srs)) + (u/simple-kw-call df-or-srs "cov" (first args)) + (u/kw-call df-or-srs "cov" (first args) (second args)))) + +(defn- base-cumulative + [k] + 
(fn [df-or-srs & [attrs]] + (u/simple-kw-call df-or-srs + ({:max "cummax" + :min "cummin" + :prod "cumprod" + :sum "cumsum" + :diff "diff"} k) + attrs))) + +(def cummax + (base-cumulative :max)) + +(def cummin + (base-cumulative :min)) + +(def cumprod + (base-cumulative :prod)) + +(def cumsum + (base-cumulative :sum)) + +(def diff + (base-cumulative :diff)) + +(defn describe + [df-or-srs & [attrs]] + (u/simple-kw-call df-or-srs "describe" attrs)) + +(defn- other-ops + [k] + (fn [df-or-srs & [attrs]] + (u/simple-kw-call df-or-srs + ({:sum "sum" + :kurt "kurtosis" + :mad "mad" + :max "max" + :min "min" + :mean "mean" + :median "median" + :mode "mode" + :pct "pct_change" + :quant "quantile" + :rank "rank" + :round "round" + :sem "sem" + :skew "skew" + :std "std" + :var "var"} k) + attrs))) + +(def sum + (other-ops :sum)) + +(def kurtosis + (other-ops :kurt)) + +(def mean-abs-dev + (other-ops :mad)) + +(def maximum + (other-ops :max)) + +(def minimum + (other-ops :min)) + +(def mean + (other-ops :mean)) + +(def median + (other-ops :median)) + +(def mode + (other-ops :mode)) + +(def pct-change + (other-ops :pct)) + +(def quantile + (other-ops :quant)) + +(def rank + (other-ops :rank)) + +(def round + (other-ops :round)) + +(def sem + (other-ops :sem)) + +(def skew + (other-ops :skew)) + +(def std + (other-ops :std)) + +(def var + (other-ops :var)) diff --git a/src/panthera/pandas/reshape.clj b/src/panthera/pandas/reshape.clj index 1af3307..1234015 100644 --- a/src/panthera/pandas/reshape.clj +++ b/src/panthera/pandas/reshape.clj @@ -1,1010 +1,1010 @@ -(ns panthera.pandas.reshape - (:refer-clojure - :exclude [drop]) - (:require - [libpython-clj.python :as py] - [panthera.pandas.utils :as u] - [panthera.pandas.generics :as g])) - -(defn crosstab - "Compute a cross tabulation of two (or more) factors. By default - computes a frequency table of the factors unless an array of values and an - aggregation function are passed. 
- - **Arguments** - - - `seq-or-srs` -> seqable, `series` - - **Attrs** - - - `:columns` -> Iterable, `series`, Iterable of Iter/srs: values to group by - - `:values` -> Iterable, `series`, Iterable of Iter/srs: values to group - according to factors, requires `:aggfunc` - - `:rownames` -> Iterable, `series`: the names of `seq-or-srs` - - `:colnames` -> Iterable, `series`: the names of `:columns` - - `:aggfunc` -> function, keyword, str: the aggregation function, requires - `:values`. It can be a panthera function (`sum`), a numpy function (`(npy :sum)`), - the name of a numpy function (`:mean` or \"mean\") or a Clojure function. In the - latter case be aware that you have to reduce over a map. - - `:margins` -> bool, default `false`: add subtotals - - `:margins-name`: str, default \"All\": name of the row/column holding totals - when `:margins` true - - `:dropna` -> bool, default `true`: exclude columns with all missing values - - `:normalize` -> bool, {`:all` `:index` `columns`}, {0 1}, default `false`: - normalize by dividing all values by the sum of values - - **Examples** - - ``` - (crosstab [[1 2 2]] {:columns [[:a :b :a]]}) - ;; col_0 a b - ;; row_0 - ;; 1 1 0 - ;; 2 1 1 - - (crosstab [[1 2 2]] {:columns [[:a :b :a]] - :rownames [:myrows] - :colnames [:mycols]}) - ;; mycols a b - ;; myrows - ;; 1 1 0 - ;; 2 1 1 - - (crosstab [[1 2 2]] {:columns [[:a :b :b]] - :values [10 20 30] - :aggfunc :mean}) - ;; col_0 a b - ;; row_0 - ;; 1 10.0 NaN - ;; 2 NaN 25.0 - - (crosstab [[1 2 2]] {:columns [[:a :b :a]] - :margins true}) - ;; col_0 a b All - ;; row_0 - ;; 1 1 0 1 - ;; 2 1 1 2 - ;; All 2 1 3 - ``` - " - [seq-or-srs & [{:keys [columns values rownames colnames aggfunc - margins margins-name dropna normalize] - :as attrs}]] - (u/kw-call u/pd "crosstab" seq-or-srs attrs)) - -(defn pivot - "Returns a stacked `data-frame`: basically changes it from long format to wide. 
- - **Arguments** - - - `df` -> `data-frame` - - **Attrs** - - - `:index` -> str, keyword, default `nil`: the column to use as the new index. - When `nil` uses the current one - - `:columns` -> str, keyword: columns to use for the new `data-frame` - - `:values` -> str, keyword, Iterable, default `nil`: columns to use to populate - values. If `nil` all remaining columns will be used - - **Examples** - - ``` - (def df (data-frame {:foo [:one :one :one :two :two :two] - :bar [:a :b :c :a :b :c] - :baz [1 2 3 4 5 6] - :zoo [:x :y :z :q :w :t]})) - - (pivot df {:columns :bar :index :foo}) - ;; baz zoo - ;; bar a b c a b c - ;; foo - ;; one 1 2 3 x y z - ;; two 4 5 6 q w t - - (pivot df {:index :foo :columns :bar :values [:baz :zoo]}) - ;; baz zoo - ;; bar a b c a b c - ;; foo - ;; one 1 2 3 x y z - ;; two 4 5 6 q w t - ``` - " - [df & [{:keys [index columns values] - :as attrs}]] - (u/simple-kw-call df "pivot" attrs)) - -(defn cut - "Bin the given values into categories. - - Use this when you want to go from continuous values to ordered categories. For - example, you could go from age to age ranges. - - N.B.: `cut` converts your values to a [`Categorical`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Categorical.html#pandas.Categorical) type. This - means that you can choose whether you want a label back or just the new value. - - **Arguments** - - - `seq-or-srs` -> seqable or `series` - - `bins` -> int, Iterable, `series`: how to bin the data. If int defines the number - of equal-width bins, otherwise values are treated as bins edges - - **Attrs** - - - `:right` -> bool, default `true`: include the rightmost edge? 
- - `:labels` -> Iterable, bool: if Iterable, specifies the labels for the bins, - if false it doesn't return the labels, only the values (**N.B.: the suggestion - is to work with `{:labels false}` as much as possible, especially if you have to - convert things to Clojure at some point**) - - `:retbins` -> bool, default `false`: return bins? - - `:precision` -> int, default 3: the precision of the bins labels - - `:include-lowest` -> bool, default `false`: should the first interval be left-inclusive? - - `:duplicates` -> {`:raise`, `:drop`, `nil`}: ff bin edges are not unique, - raise error or drop non-uniques - - **Examples** - - ``` - (def s (series [1 7 5 4 6 3])) - - (cut s 3) - ;; 0 (0.994, 3.0] - ;; 1 (5.0, 7.0] - ;; 2 (3.0, 5.0] - ;; 3 (3.0, 5.0] - ;; 4 (5.0, 7.0] - ;; 5 (0.994, 3.0] - ;; dtype: category - ;; Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] < (5.0, 7.0]] - - (cut s [3 5 7]) - ;; 0 NaN - ;; 1 (5.0, 7.0] - ;; 2 (3.0, 5.0] - ;; 3 (3.0, 5.0] - ;; 4 (5.0, 7.0] - ;; 5 NaN - ;; dtype: category - ;; Categories (2, interval[int64]): [(3, 5] < (5, 7]] - - (cut s 3 {:labels false}) - ;; 0 0 - ;; 1 2 - ;; 2 1 - ;; 3 1 - ;; 4 2 - ;; 5 0 - ;; dtype: int64 - ``` - " - [seq-or-srs bins & [{:keys [right labels retbins precision - include-lowest duplicates] - :as attrs}]] - (py/call-attr-kw u/pd "cut" [seq-or-srs bins] - (u/keys->pyargs attrs))) - -(defn qcut - "Bin values into quantiles. - - The same as `cut`, but categories are quantiles. - - **Arguments** - - - `seq-or-srs` -> seqable or `series` - - `q` -> int, Iterable: either number of quantiles or Iterable of quantiles - - **Attrs** - - - `:labels` -> Iterable, bool: if Iterable, specifies the labels for the bins, - if false it doesn't return the labels, only the values (**N.B.: the suggestion - is to work with `{:labels false}` as much as possible, especially if you have to - convert things to Clojure at some point**) - - `:retbins` -> bool, default `false`: return bins? 
- - `:precision` -> int, default 3: the precision of the bins labels - - `:duplicates` -> {`:raise`, `:drop`, `nil`}: ff bin edges are not unique, - raise error or drop non-uniques - - **Examples** - - ``` - (qcut (range 5) 4) - ;; [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]] - ;; Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] < (2.0, 3.0] < (3.0, 4.0]] - - (qcut (range 5) 3 {:labels [:low :medium :high]}) - ;; [low, low, medium, high, high] - ;; Categories (3, object): [low < medium < high] - - (qcut (range 5) 3 {:labels false}) - ;; [0 0 1 2 2] - ``` - " - [seq-or-srs q & [{:keys [labels retbins precision duplicates] - :as attrs}]] - (py/call-attr-kw u/pd "qcut" [seq-or-srs q] - (u/keys->pyargs attrs))) - -(defn merge-ordered - "Merge two `data-frames` together, facilities to deal with ordered data. - - **Arguments** - - - `left` -> `data-frame` - - `right` -> `data-frame` - - **Attrs** - - - `:on` -> str, keyword, Iterable: column names to be joined on. 
They must be the - same in both `left` and `right` - - `:left-on` -> str, keyword, Iterable, `series`: columns to join on the `left`, - use this if you have different columns names - - `:right-on` -> str, keyword, Iterable, `series`: columns to join on the `right`, - use this if you have different columns names - - `:left-by` -> str, keyword, Iterable, `series`: groupby `left` on the given - columns and then join piece by piece - - `:right-by` -> str, keyword, Iterable, `series`: groupby `right` on the given - columns and then join piece by piece - - `:fill-method` -> {`:ffill` `nil`}, default `nil`: forward fill missing data - - `:suffixes` -> Iterable, default [`:_x` `:_y`]: the suffixes to add to overlapping - column names - - `:how` -> {`:left` `:right` `:outer` `:inner`}, default `:outer`: kind of join - - **Examples** - ``` - (def A - (data-frame - {:key [:a :c :e :a] - :lvalue [1 2 3 1] - :group [:a :a :a :b]})) - - (def B - (data-frame - {:key [:b :c :d] - :rvalue [1 2 3]})) - - (merge-ordered A B) - ;; key lvalue group rvalue - ;; 0 a 1.0 a NaN - ;; 1 a 1.0 b NaN - ;; 2 b NaN NaN 1.0 - ;; 3 c 2.0 a 2.0 - ;; 4 d NaN NaN 3.0 - ;; 5 e 3.0 a NaN - - (merge-ordered A B {:fill-method :ffill}) - ;; key lvalue group rvalue - ;; 0 a 1 a NaN - ;; 1 a 1 b NaN - ;; 2 b 1 b 1.0 - ;; 3 c 2 a 2.0 - ;; 4 d 2 a 3.0 - ;; 5 e 3 a 3.0 - - (merge-ordered A B {:fill-method :ffill :left-by \"group\"}) - ;; key lvalue group rvalue - ;; 0 a 1 a NaN - ;; 1 b 1 a 1.0 - ;; 2 c 2 a 2.0 - ;; 3 d 2 a 3.0 - ;; 4 e 3 a 3.0 - ;; 5 a 1 b NaN - ;; 6 b 1 b 1.0 - ;; 7 c 1 b 2.0 - ;; 8 d 1 b 3.0 - - (merge-ordered A B {:left-on :lvalue :right-on :rvalue}) - ;; key_x lvalue group key_y rvalue - ;; 0 a 1 a b 1 - ;; 1 a 1 b b 1 - ;; 2 c 2 a c 2 - ;; 3 e 3 a d 3 - ``` - " - [left right & [{:keys [on left-on right-on left-by right-by - fill-method suffixes how] - :as attrs}]] - (py/call-attr-kw u/pd "merge_ordered" [left right] - (u/keys->pyargs attrs))) - -(defn merge-asof - "Similar to a left 
join, but merges on nearest key rather than equal. - - **Arguments** - - - `left` -> `data-frame`: sorted by key - - `right` -> `data-frame`: sorted by key - - **Attrs** - - - `:on` str, keyword -> column name to join on. Must be in both `data-frames` and - it must be ordered and numeric (dates, int, etc) - - `:left-on` -> str, keyword: column name to join in left `data-frame`. The - requirements are the same as for `:on` - - `:right-on` -> str, keyword: column name to join in right `data-frame`. The - requirements are the same as for `:on` - - `:left-index` -> bool: index of left `data-frame` is the join key? - - `:right-index` -> bool: index of right `data-frame` is the join key? - - `:by` -> str, keyword, Iterable, `series`: match these columns before merging - - `:left-by` -> str, keyword, Iterable. `series`: as `:by` but only for left `data-frame` - - `:right-by` -> str, keyword, Iterable. `series`: as `:by` but only for right `data-frame` - - `:suffixes` -> Iterable: suffix to add to overlapping column names, must - have length 2 and the first one is `left` and second one is `right` - - `:tolerance` -> depends on key: the tolerance for merging - - `:allow-exact-matches` -> bool, default `true`: allow matching with same `:on` value? 
- - `:direction` -> {`:backward` `:forward` `:nearest`}, default `:backward`: search for - prior, subsequent or closest matches - - **Examples** - - ``` - (def trades - (data-frame - {:time (->datetime [\"2016-05-25 13:30:00.023\" - \"2016-05-25 13:30:00.038\" - \"2016-05-25 13:30:00.048\" - \"2016-05-25 13:30:00.048\"]) - :ticker [:MSFT :MSFT :GOOG :AAPL] - :price [51.95 51.95 720.77 98.00] - :quantity [75 155 100 100]})) - - (def quotes - (data-frame - {:time (->datetime [\"2016-05-25 13:30:00.023\" - \"2016-05-25 13:30:00.023\" - \"2016-05-25 13:30:00.030\" - \"2016-05-25 13:30:00.048\" - \"2016-05-25 13:30:00.049\"]) - :ticker [:GOOG :MSFT :MSFT :GOOG :AAPL] - :bid [720.5 51.95 51.97 720.5 97.99] - :ask [720.93 51.96 51.98 720.93 98.01]})) - - (merge-asof trades quotes {:on :time}) - ;; time ticker_x price quantity ticker_y bid ask - ;; 0 2016-05-25 13:30:00.023 MSFT 51.95 75 MSFT 51.95 51.96 - ;; 1 2016-05-25 13:30:00.038 MSFT 51.95 155 MSFT 51.97 51.98 - ;; 2 2016-05-25 13:30:00.048 GOOG 720.77 100 GOOG 720.50 720.93 - ;; 3 2016-05-25 13:30:00.048 AAPL 98.00 100 GOOG 720.50 720.93 - - (merge-asof trades quotes {:on :time :allow-exact-matches false}) - ;; time ticker_x price quantity ticker_y bid ask - ;; 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN NaN - ;; 1 2016-05-25 13:30:00.038 MSFT 51.95 155 MSFT 51.97 51.98 - ;; 2 2016-05-25 13:30:00.048 GOOG 720.77 100 MSFT 51.97 51.98 - ;; 3 2016-05-25 13:30:00.048 AAPL 98.00 100 MSFT 51.97 51.98 - - (merge-asof trades quotes {:on :time :direction :forward}) - ;; time ticker_x price quantity ticker_y bid ask - ;; 0 2016-05-25 13:30:00.023 MSFT 51.95 75 GOOG 720.5 720.93 - ;; 1 2016-05-25 13:30:00.038 MSFT 51.95 155 GOOG 720.5 720.93 - ;; 2 2016-05-25 13:30:00.048 GOOG 720.77 100 GOOG 720.5 720.93 - ;; 3 2016-05-25 13:30:00.048 AAPL 98.00 100 GOOG 720.5 720.93 - - (merge-asof trades quotes {:on :time :by :ticker}) - ;; time ticker price quantity bid ask - ;; 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 - ;; 
1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 - ;; 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93 - ;; 3 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN - ``` - " - [left right & [{:keys [on left-on right-on left-index right-index by - left-by right-by suffixes tolerance - allow-exact-matches direction] - :as attrs}]] - (py/call-attr-kw u/pd "merge_asof" [left right] - (u/keys->pyargs attrs))) - -(defn concatenate - "Append `series`es and/or `data-frame`s along a wanted axis. - - **Arguments** - - - `dfs-or-srss` -> Iterable: a collection of multiple `series`/`data-frame` - - **Attrs** - - - `:axis` -> int, default 0: 0 = rows, 1 = columns - - `:join` -> {`:inner` `:outer`}, default `:outer`: the kind of join on other `:axis` - - `:ignore-index` -> bool, default `false`: whether to consider the index along - the wanted `:axis` - - `:keys` -> Iterable, default `nil`: this lets you build a hierarchical index - using the passed `:keys` as the outermost levels - - `:levels` -> Iterable, default `nil`: unique values for building a multi index - - `:names` -> Iterable, default `nil`: names of the levels in the hierarchical index - - `:verify-integrity` -> bool, default `false`: does the new `:axis` - contain duplicates? 
(P.S.: expensive operation) - - `:sort` -> bool, default `true`: sort the other `:axis` when `:join` is `:outer` - - `:copy` -> bool, default `true`: if `false` avoid copying when unnecessary - - **Examples** - - ``` - (concatenate [(series (range 3)) (series (range 3))]) - ;; 0 0 - ;; 1 1 - ;; 2 2 - ;; 0 0 - ;; 1 1 - ;; 2 2 - ;; dtype: int64 - - (concatenate [(series (range 3)) (series (range 3))] {:axis 1}) - ;; 0 1 - ;; 0 0 0 - ;; 1 1 1 - ;; 2 2 2 - - (concatenate [(data-frame {:a [1 2 3] :b [4 5 6]}) - (data-frame {:a [2 2 2] :b [3 3 3]})]) - ;; a b - ;; 0 1 4 - ;; 1 2 5 - ;; 2 3 6 - ;; 0 2 3 - ;; 1 2 3 - ;; 2 2 3 - - (concatenate [(data-frame {:a [1 2 3] :b [4 5 6]}) - (data-frame {:a [2 2 2] :b [3 3 3]})] - {:ignore-index true}) - ;; a b - ;; 0 1 4 - ;; 1 2 5 - ;; 2 3 6 - ;; 3 2 3 - ;; 4 2 3 - ;; 5 2 3 - ``` - " - [dfs-or-srss & [{:keys [axis join ignore-index keys levels - names verify-integrity sort copy] - :as attrs}]] - (u/kw-call u/pd "concat" dfs-or-srss attrs)) - -(defn aggregate - "Aggregate data using one or more functions over a given axis. - - This is very similar to `reduce`, but works on `data-frames` as well. - - **Arguments** - - - `df-or-srs` -> `data-frame`, `series` - - `how` -> keyword, str, function, Iterable: how to aggregate data. This accepts - either panthera functions strings/keywords, a list of the previous and/or user - defined functions. Check examples for more info. 
- - **Attrs** - - - `:axis` -> {0 `:index` 1 `:columns`}, default 0: 0 = apply function along - cols; 1 = apply function along rows - - `fn-args` -> if the provided collapsing function needs arguments, just list - them freely (see examples) - - **Examples** - - ``` - (def a (data-frame - [[1, 2, 3] - [4, 5, 6] - [7, 8, 9] - [##NaN, ##NaN, ##NaN]] - {:columns [:A :B :C]})) - - (aggregate (series [1 2 3]) :sum) - ;; 6 - - (aggregate a [:sum :min]) - ;; A B C - ;; sum 12.0 15.0 18.0 - ;; min 1.0 2.0 3.0 - - ; if `how` needs arguments, you can pass them as `attrs` - (aggregate (series [1 2 3]) :cov {:other (series [4 5 6])}) - ;; 1.0 - - (aggregate (series [1 2 3]) inc) - ;; 0 2 - ;; 1 3 - ;; 2 4 - ;; dtype: int64 - ``` - " - [df-or-srs how & [{:keys [axis fn-args] :as attrs}]] - (u/kw-call df-or-srs "agg" how attrs)) - -(defn remap - "Remap values in a series. - - This is the same as using `map` on a sequence while using a map as the mapped - function: `(map {:a 1 :b 2} [:a :b]) => (1 2)` - - **Arguments** - - - `srs` -> `series` - - `mappings` -> map, function: the mapping correspondence - - `na-action` -> {`nil` `:ignore`}, default `nil`: `:ignore` doesn't pass missing - values to the `mappings` - - **Examples** - - ``` - (remap (series [:a :b :c]) {:a 1 :b 2 :c 3}) - ;; 0 1 - ;; 1 2 - ;; 2 3 - ;; dtype: int64 - - (remap (series [:a :b ##NaN]) #(str \"This is \" %)) - ;; 0 This is a - ;; 1 This is b - ;; 2 This is NaN - ;; dtype: object - - (remap (series [:a :b ##NaN]) #(str \"This is \" %) :ignore) - ;; 0 This is a - ;; 1 This is b - ;; 2 NaN - ;; dtype: object - ``` - " - [srs mappings & [na-action]] - (py/call-attr srs "map" mappings (or na-action nil))) - -(defn groupby - "Group `data-frame` or `series` by a given variable. - - Note that `groupby` does nothing by itself, this must be followed by another - operation like aggregation. 
- - **Arguments** - - - `df-or-srs` -> `data-frame`, `series` - - `by` -> str, keyword, Iterable, map, function: it can be a column, a list of - columns, a function used to group the index, a collection of values to use as - grouping variable - - **Attrs** - - - `:axis` -> {0 `:index` 1 `:columns`}: split along columns or rows - - `:level` -> int, str, keyword, Iterable: if multiple index, group by this - or these - - `:as-index` -> bool, default `true`: when `false` this becomes basically - as the SQL group by output - - `:sort` -> bool, default `true`: if `false` you get a performance improvement - - `:group-keys` -> bool, default `true`: add group keys to index when afterwards - you call `apply` - - `:squeeze` -> bool, default `false`: reduce dimensionality of the output if possible - - `:observed` -> bool, default `false`: this only applies to Categoricals: - if `true`, only show observed values for categorical groupers, - if `false`, show all values for categorical groupers - - **Examples** - - ``` - (def a (data-frame {:animal [:falcon :falcon :parrot :parrot] - :max-speed [380 370 24 26]})) - - (-> a (r/groupby :animal) m/mean) - max-speed - ;; animal - ;; falcon 375 - ;; parrot 25 - - (-> a (r/groupby :animal {:as-index false}) m/mean) - ;; animal max-speed - ;; 0 falcon 375 - ;; 1 parrot 25 - ``` - " - [df-or-srs by & [{:keys [axis level as-index sort group-keys - squeeze observed] :as attrs}]] - (u/kw-call df-or-srs "groupby" by attrs)) - -(defn rolling - "Rolling window calculations - - **Arguments** - - - `df-or-srs` -> `data-frame`, `series` - - `window` -> int, str. keyword: the size of the window. If str or keyword then - this is considered as a time offset (e.g. :2s = 2 seconds, :30D = 30 days; - check this for more options https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases) - - **Attrs** - - - `:min-periods` -> int: minimum number of observations to have a value. 
For - times the default is 1, otherwise the default is `window` - - `:center` -> bool, default `false`: if `false` the result is set at the right - edge of the window, otherwise it gets centered - - `:win-type` -> str, keyword: refer to https://docs.scipy.org/doc/scipy/reference/signal.windows.html#module-scipy.signal.windows - - `:on`-> str, keyword: column to use for the rolling window, only in case this - is not the index - - `:axis` -> {0 `:index` 1 `:columns`}: split along columns or rows - - `:closed` -> {`:right` `:left` `:both` `:neither`}: where to make the interval - close - - **Examples** - ``` - (def a (data-frame {:b [0 1 2 3 4]} - {:index - (panthera.pandas.conversion/->datetime - (series - [\"20130101 09:00:00\" - \"20130101 09:00:02\" - \"20130101 09:00:03\" - \"20130101 09:00:05\" - \"20130101 09:00:06\"]))})) - - (sum (rolling a 2)) - ;; b - ;; 2013-01-01 09:00:00 NaN - ;; 2013-01-01 09:00:02 1.0 - ;; 2013-01-01 09:00:03 3.0 - ;; 2013-01-01 09:00:05 5.0 - ;; 2013-01-01 09:00:06 7.0 - - (sum (rolling a :2s)) - ;; b - ;; 2013-01-01 09:00:00 0.0 - ;; 2013-01-01 09:00:02 1.0 - ;; 2013-01-01 09:00:03 3.0 - ;; 2013-01-01 09:00:05 3.0 - ;; 2013-01-01 09:00:06 7.0 - - (sum (rolling a 2 {:win-type :triang})) - ;; b - ;; 2013-01-01 09:00:00 NaN - ;; 2013-01-01 09:00:02 0.5 - ;; 2013-01-01 09:00:03 1.5 - ;; 2013-01-01 09:00:05 2.5 - ;; 2013-01-01 09:00:06 3.5 - - (sum (rolling a 2 {:min-periods 1})) - ;; b - ;; 2013-01-01 09:00:00 0.0 - ;; 2013-01-01 09:00:02 1.0 - ;; 2013-01-01 09:00:03 3.0 - ;; 2013-01-01 09:00:05 5.0 - ;; 2013-01-01 09:00:06 7.0 - ``` - " - [df-or-srs window & [{:keys [min-periods center win-type on axis closed] - :as attrs}]] - (u/kw-call df-or-srs "rolling" window attrs)) - -(defn ewm - "Exponentially weighted functions. 
- - **Arguments** - - - `df-or-srs` -> `data-frame`, `series` - - **Attrs** - - - `:com` -> numeric: decay in terms of center of mass - - `:span` -> numeric: decay in terms of span - - `:halflife` -> numeric: decay in terms of half-life - - `:alpha` -> numeric: smoothing factor - - `:min-periods` -> int, default 0: minimum number of observations - - `:adjust` -> bool, default `true`: divide by decaying adjustment factor - in beginning periods to account for imbalance in relative weightings - - `:ignore-na` -> bool, default `false`: ignore missing values - - `:axis` -> {0 `:index` 1 `:columns`}: use columns or rows - - **Examples** - - ``` - (def a (g/data-frame {:b [0 1 2 ##NaN 4]})) - - (-> a (ewm {:com 0.5}) mean) - ;; b - ;; 0 0.000000 - ;; 1 0.750000 - ;; 2 1.615385 - ;; 3 1.615385 - ;; 4 3.670213 - - (-> a (ewm {:span 3}) mean) - ;; b - ;; 0 0.000000 - ;; 1 0.666667 - ;; 2 1.428571 - ;; 3 1.428571 - ;; 4 3.217391 - - (-> a (ewm {:com 0.5 :ignore-na true}) mean) - ;; b - ;; 0 0.000000 - ;; 1 0.750000 - ;; 2 1.615385 - ;; 3 1.615385 - ;; 4 3.225000 - ``` - " - [df-or-srs & [{:keys [com span halflife min-periods adjust ignore-na axis] - :as attrs}]] - (u/simple-kw-call df-or-srs "ewm" attrs)) - -(defn drop - "Drop requested rows or columns. - - Remove rows or columns by specifying label names and corresponding axis, - or by specifying directly index or column names. When using a multi-index, - labels on different levels can be removed by specifying the level. 
- - **Arguments** - - - `df-or-srs` -> `data-frame`, `series` - - `labels` -> keyword, str, numeric, Iterable: index or labels to drop - - **Attrs** - - - `:axis` -> int, default 0: 0 = rows, 1 = columns - - `:level` -> numeric, keyword, str: level to drop from multi index - - `:errors` -> {`:ignore` `:raise`}, default `:raise`: ignore or raise errors - - **Examples** - - ``` - (require-python '[numpy :as np]) - (def df - (data-frame - (np/reshape (np/arange 12) [3 4]) - {:columns [:A :B :C :D]})) - - (drop df [:B :C] {:axis 1}) - ;; A D - ;; 0 0 3 - ;; 1 4 7 - ;; 2 8 11 - - (drop df [0 1]) - ;; A B C D - ;; 2 8 9 10 11 - ``` - " - [df-or-srs labels & [{:keys [axis level errors] :as attrs}]] - (u/kw-call df-or-srs "drop" labels attrs)) - -(defn drop-rows - "A shorthand for `(drop df [0 2] {:axis 0})` - - See [[drop]] docs for more info" - [df rows & [{:keys [level errors] :as attrs}]] - (drop df rows (merge attrs {:axis 0}))) - -(defn drop-cols - "A shorthand for `(drop df [:A :C] {:axis 1})` - - See [[drop]] docs for more info" - [df cols & [{:keys [level errors] :as attrs}]] - (drop df cols (merge attrs {:axis 1}))) - -(defn dropna - "Drop missing values. - - **Arguments** - - - `df-or-srs` -> `data-frame`, `series` - - **Attrs** - - - `:axis` -> int, default 0: 0 = rows, 1 = columns - - `:how` -> {`:any` `:all`}, default `:any`: drop when there are `:any` missing - values, or `:all` missing values - - `:thresh` -> numeric: require `:thresh` missing values to drop - - `:subset` -> Iterable: the subset to consider on opposite axis; e.g. 
if - you drop rows `:subset` are the columns to consider for dropping - - **Examples** - - ``` - (def df - (data-frame {:name [:Alfred :Batman :Robin] - :toy [nil :Batmobile :Whip] - :born [nil :1940-04-25 nil]}) - - (dropna df) - ;; name toy born - ;; 1 Batman Batmobile 1940-04-25 - ``` - " - [df-or-srs & [{:keys [axis how thresh subset] - :as attrs}]] - (u/simple-kw-call df-or-srs "dropna" attrs)) - -(defn melt - "Unpivot a `data-frame` from wide format to long format. - - Basically reshape the `data-frame` to have one row per observation and one - column per variable - - **Arguments** - - - `df` -> `data-frame` - - **Attrs** - - - `:id-vars` -> Iterable: columns to use as identifiers - - `:value-vars` -> Iterable: columns to melt (unpivot), if not specified uses - all the columns not in `:id-vars` - - `:var-name` -> keyword, str, default `:variable`: name for the variable column - - `:value-name` -> keyword, str, default `:value`: name for the value column - - `:col-level` -> numeric, str: the level to use for melting - - **Examples** - - ``` - (def df - (transpose - (data-frame [[:a :b :c] [1 3 5] [2 4 6]] - {:columns [0 1 2] - :index [:A :B :C]}))) - - (melt df) - ;; variable value - ;; 0 A a - ;; 1 A b - ;; 2 A c - ;; 3 B 1 - ;; 4 B 3 - ;; 5 B 5 - ;; 6 C 2 - ;; 7 C 4 - ;; 8 C 6 - - (melt df {:id-vars [:A] :value-vars [:B]}) - ;; A variable value - ;; 0 a B 1 - ;; 1 b B 3 - ;; 2 c B 5 - - (melt df {:id-vars [:A] :value-vars [:B :C]}) - ;; A variable value - ;; 0 a B 1 - ;; 1 b B 3 - ;; 2 c B 5 - ;; 3 a C 2 - ;; 4 b C 4 - ;; 5 c C 6 - ``` - " - [df & [{:keys [id-vars value-vars var-name - value-name col-level] :as attrs}]] - (u/simple-kw-call df "melt" attrs)) - -(defn assign - "Assign new columns to `df-or-srs` - - **Arguments** - - - `df-or-srs` -> `data-frame`, `series` - - `cols` -> map: either a map `{:col-name value}`, or a map `{:col-name fn}` - - **Examples** - - ``` - (def df - (transpose - (data-frame [[:a :b :c] [1 3 5] [2 4 6]] - {:columns [0 1 2] - 
:index [:A :B :C]}))) - - (assign df {:D 3}) - ;; A B C D - ;; 0 a 1 2 3 - ;; 1 b 3 4 3 - ;; 2 c 5 6 3 - - (assign df {:D [1 2 3]}) - ;; A B C D - ;; 0 a 1 2 1 - ;; 1 b 3 4 2 - ;; 2 c 5 6 3 - - (assign df {:D #(-> (subset-cols % :C) (mul 2))}) - ;; A B C D - ;; 0 a 1 2 4 - ;; 1 b 3 4 8 - ;; 2 c 5 6 12 - ``` - " - [df-or-srs cols] - (py/call-attr-kw df-or-srs "assign" [] cols)) - -(defn stack - "Stack the prescribed level(s) from columns to index. - - **Arguments** - - - `df-or-srs` -> `data-frame`, `series` - - **Attrs** - - - `:level` -> numeric, keyword, str, default -1: level to stack - - `:dropna` -> bool, default true: drop rows with missing values if generated - - **Examples** - - ``` - (def df - (data-frame [[0 1] [2 3]] - {:index [:cat :dog] - :columns [:weight :height]})) - - (stack df) - ;; cat weight 0 - ;; height 1 - ;; dog weight 2 - ;; height 3 - ;; dtype: int64 - ``` - " - [df-or-srs & [{:keys [level dropna] :as attrs}]] - (u/simple-kw-call df-or-srs "stack" attrs)) - -(defn unstack - "Pivot a level of the (necessarily hierarchical) index labels, - returning a DataFrame having a new level of column labels whose inner-most - level consists of the pivoted index labels. 
- - **Arguments** - - - `df-or-srs` -> `data-frame`, `series` - - **Attrs** - - - `:level` -> numeric, keyword, str, default -1: level to unstack - - `:fill-value` -> any: replace missing values produced by `unstack` with this - - **Examples** - - ``` - (def s - (stack - (data-frame [[1 2] [3 4]] - {:index [:one :two] - :columns [:a :b]}))) - - (unstack s) - ;; a b - ;; one 1 2 - ;; two 3 4 - - (unstack s {:level 0}) - ;; one two - ;; a 1 3 - ;; b 2 4 - - (unstack (unstack s {:level 0})) - ;; one a 1 - ;; b 2 - ;; two a 3 - ;; b 4 - ;; dtype: int64 - ``` - " - [df-or-srs & [{:keys [level fill_value] :as attrs}]] - (u/simple-kw-call df-or-srs "unstack" attrs)) - -(defn transpose - "Transpose the given panthera object - - **Arguments** - - - `df-or-srs` -> `data-frame`, `series` - - **Examples** - - ``` - (def df (data-frame [[1 2 3] [4 5 6] [7 8 9]])) - - (transpose df) - ;; 0 1 2 - ;; 0 1 4 7 - ;; 1 2 5 8 - ;; 2 3 6 9 - ``` - " - [df-or-srs] - (py/get-attr df-or-srs "T")) +(ns panthera.pandas.reshape + (:refer-clojure + :exclude [drop]) + (:require + [libpython-clj.python :as py] + [panthera.pandas.utils :as u] + [panthera.pandas.generics :as g])) + +(defn crosstab + "Compute a cross tabulation of two (or more) factors. By default + computes a frequency table of the factors unless an array of values and an + aggregation function are passed. + + **Arguments** + + - `seq-or-srs` -> seqable, `series` + + **Attrs** + + - `:columns` -> Iterable, `series`, Iterable of Iter/srs: values to group by + - `:values` -> Iterable, `series`, Iterable of Iter/srs: values to group + according to factors, requires `:aggfunc` + - `:rownames` -> Iterable, `series`: the names of `seq-or-srs` + - `:colnames` -> Iterable, `series`: the names of `:columns` + - `:aggfunc` -> function, keyword, str: the aggregation function, requires + `:values`. 
It can be a panthera function (`sum`), a numpy function (`(npy :sum)`), + the name of a numpy function (`:mean` or \"mean\") or a Clojure function. In the + latter case be aware that you have to reduce over a map. + - `:margins` -> bool, default `false`: add subtotals + - `:margins-name`: str, default \"All\": name of the row/column holding totals + when `:margins` true + - `:dropna` -> bool, default `true`: exclude columns with all missing values + - `:normalize` -> bool, {`:all` `:index` `columns`}, {0 1}, default `false`: + normalize by dividing all values by the sum of values + + **Examples** + + ``` + (crosstab [[1 2 2]] {:columns [[:a :b :a]]}) + ;; col_0 a b + ;; row_0 + ;; 1 1 0 + ;; 2 1 1 + + (crosstab [[1 2 2]] {:columns [[:a :b :a]] + :rownames [:myrows] + :colnames [:mycols]}) + ;; mycols a b + ;; myrows + ;; 1 1 0 + ;; 2 1 1 + + (crosstab [[1 2 2]] {:columns [[:a :b :b]] + :values [10 20 30] + :aggfunc :mean}) + ;; col_0 a b + ;; row_0 + ;; 1 10.0 NaN + ;; 2 NaN 25.0 + + (crosstab [[1 2 2]] {:columns [[:a :b :a]] + :margins true}) + ;; col_0 a b All + ;; row_0 + ;; 1 1 0 1 + ;; 2 1 1 2 + ;; All 2 1 3 + ``` + " + [seq-or-srs & [{:keys [columns values rownames colnames aggfunc + margins margins-name dropna normalize] + :as attrs}]] + (u/kw-call u/pd "crosstab" seq-or-srs attrs)) + +(defn pivot + "Returns a stacked `data-frame`: basically changes it from long format to wide. + + **Arguments** + + - `df` -> `data-frame` + + **Attrs** + + - `:index` -> str, keyword, default `nil`: the column to use as the new index. + When `nil` uses the current one + - `:columns` -> str, keyword: columns to use for the new `data-frame` + - `:values` -> str, keyword, Iterable, default `nil`: columns to use to populate + values. 
If `nil` all remaining columns will be used + + **Examples** + + ``` + (def df (data-frame {:foo [:one :one :one :two :two :two] + :bar [:a :b :c :a :b :c] + :baz [1 2 3 4 5 6] + :zoo [:x :y :z :q :w :t]})) + + (pivot df {:columns :bar :index :foo}) + ;; baz zoo + ;; bar a b c a b c + ;; foo + ;; one 1 2 3 x y z + ;; two 4 5 6 q w t + + (pivot df {:index :foo :columns :bar :values [:baz :zoo]}) + ;; baz zoo + ;; bar a b c a b c + ;; foo + ;; one 1 2 3 x y z + ;; two 4 5 6 q w t + ``` + " + [df & [{:keys [index columns values] + :as attrs}]] + (u/simple-kw-call df "pivot" attrs)) + +(defn cut + "Bin the given values into categories. + + Use this when you want to go from continuous values to ordered categories. For + example, you could go from age to age ranges. + + N.B.: `cut` converts your values to a [`Categorical`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Categorical.html#pandas.Categorical) type. This + means that you can choose whether you want a label back or just the new value. + + **Arguments** + + - `seq-or-srs` -> seqable or `series` + - `bins` -> int, Iterable, `series`: how to bin the data. If int defines the number + of equal-width bins, otherwise values are treated as bins edges + + **Attrs** + + - `:right` -> bool, default `true`: include the rightmost edge? + - `:labels` -> Iterable, bool: if Iterable, specifies the labels for the bins, + if false it doesn't return the labels, only the values (**N.B.: the suggestion + is to work with `{:labels false}` as much as possible, especially if you have to + convert things to Clojure at some point**) + - `:retbins` -> bool, default `false`: return bins? + - `:precision` -> int, default 3: the precision of the bins labels + - `:include-lowest` -> bool, default `false`: should the first interval be left-inclusive? 
+ - `:duplicates` -> {`:raise`, `:drop`, `nil`}: ff bin edges are not unique, + raise error or drop non-uniques + + **Examples** + + ``` + (def s (series [1 7 5 4 6 3])) + + (cut s 3) + ;; 0 (0.994, 3.0] + ;; 1 (5.0, 7.0] + ;; 2 (3.0, 5.0] + ;; 3 (3.0, 5.0] + ;; 4 (5.0, 7.0] + ;; 5 (0.994, 3.0] + ;; dtype: category + ;; Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] < (5.0, 7.0]] + + (cut s [3 5 7]) + ;; 0 NaN + ;; 1 (5.0, 7.0] + ;; 2 (3.0, 5.0] + ;; 3 (3.0, 5.0] + ;; 4 (5.0, 7.0] + ;; 5 NaN + ;; dtype: category + ;; Categories (2, interval[int64]): [(3, 5] < (5, 7]] + + (cut s 3 {:labels false}) + ;; 0 0 + ;; 1 2 + ;; 2 1 + ;; 3 1 + ;; 4 2 + ;; 5 0 + ;; dtype: int64 + ``` + " + [seq-or-srs bins & [{:keys [right labels retbins precision + include-lowest duplicates] + :as attrs}]] + (py/call-attr-kw u/pd "cut" [seq-or-srs bins] + (u/keys->pyargs attrs))) + +(defn qcut + "Bin values into quantiles. + + The same as `cut`, but categories are quantiles. + + **Arguments** + + - `seq-or-srs` -> seqable or `series` + - `q` -> int, Iterable: either number of quantiles or Iterable of quantiles + + **Attrs** + + - `:labels` -> Iterable, bool: if Iterable, specifies the labels for the bins, + if false it doesn't return the labels, only the values (**N.B.: the suggestion + is to work with `{:labels false}` as much as possible, especially if you have to + convert things to Clojure at some point**) + - `:retbins` -> bool, default `false`: return bins? 
+ - `:precision` -> int, default 3: the precision of the bins labels + - `:duplicates` -> {`:raise`, `:drop`, `nil`}: ff bin edges are not unique, + raise error or drop non-uniques + + **Examples** + + ``` + (qcut (range 5) 4) + ;; [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]] + ;; Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] < (2.0, 3.0] < (3.0, 4.0]] + + (qcut (range 5) 3 {:labels [:low :medium :high]}) + ;; [low, low, medium, high, high] + ;; Categories (3, object): [low < medium < high] + + (qcut (range 5) 3 {:labels false}) + ;; [0 0 1 2 2] + ``` + " + [seq-or-srs q & [{:keys [labels retbins precision duplicates] + :as attrs}]] + (py/call-attr-kw u/pd "qcut" [seq-or-srs q] + (u/keys->pyargs attrs))) + +(defn merge-ordered + "Merge two `data-frames` together, facilities to deal with ordered data. + + **Arguments** + + - `left` -> `data-frame` + - `right` -> `data-frame` + + **Attrs** + + - `:on` -> str, keyword, Iterable: column names to be joined on. 
They must be the + same in both `left` and `right` + - `:left-on` -> str, keyword, Iterable, `series`: columns to join on the `left`, + use this if you have different columns names + - `:right-on` -> str, keyword, Iterable, `series`: columns to join on the `right`, + use this if you have different columns names + - `:left-by` -> str, keyword, Iterable, `series`: groupby `left` on the given + columns and then join piece by piece + - `:right-by` -> str, keyword, Iterable, `series`: groupby `right` on the given + columns and then join piece by piece + - `:fill-method` -> {`:ffill` `nil`}, default `nil`: forward fill missing data + - `:suffixes` -> Iterable, default [`:_x` `:_y`]: the suffixes to add to overlapping + column names + - `:how` -> {`:left` `:right` `:outer` `:inner`}, default `:outer`: kind of join + + **Examples** + ``` + (def A + (data-frame + {:key [:a :c :e :a] + :lvalue [1 2 3 1] + :group [:a :a :a :b]})) + + (def B + (data-frame + {:key [:b :c :d] + :rvalue [1 2 3]})) + + (merge-ordered A B) + ;; key lvalue group rvalue + ;; 0 a 1.0 a NaN + ;; 1 a 1.0 b NaN + ;; 2 b NaN NaN 1.0 + ;; 3 c 2.0 a 2.0 + ;; 4 d NaN NaN 3.0 + ;; 5 e 3.0 a NaN + + (merge-ordered A B {:fill-method :ffill}) + ;; key lvalue group rvalue + ;; 0 a 1 a NaN + ;; 1 a 1 b NaN + ;; 2 b 1 b 1.0 + ;; 3 c 2 a 2.0 + ;; 4 d 2 a 3.0 + ;; 5 e 3 a 3.0 + + (merge-ordered A B {:fill-method :ffill :left-by \"group\"}) + ;; key lvalue group rvalue + ;; 0 a 1 a NaN + ;; 1 b 1 a 1.0 + ;; 2 c 2 a 2.0 + ;; 3 d 2 a 3.0 + ;; 4 e 3 a 3.0 + ;; 5 a 1 b NaN + ;; 6 b 1 b 1.0 + ;; 7 c 1 b 2.0 + ;; 8 d 1 b 3.0 + + (merge-ordered A B {:left-on :lvalue :right-on :rvalue}) + ;; key_x lvalue group key_y rvalue + ;; 0 a 1 a b 1 + ;; 1 a 1 b b 1 + ;; 2 c 2 a c 2 + ;; 3 e 3 a d 3 + ``` + " + [left right & [{:keys [on left-on right-on left-by right-by + fill-method suffixes how] + :as attrs}]] + (py/call-attr-kw u/pd "merge_ordered" [left right] + (u/keys->pyargs attrs))) + +(defn merge-asof + "Similar to a left 
join, but merges on nearest key rather than equal. + + **Arguments** + + - `left` -> `data-frame`: sorted by key + - `right` -> `data-frame`: sorted by key + + **Attrs** + + - `:on` str, keyword -> column name to join on. Must be in both `data-frames` and + it must be ordered and numeric (dates, int, etc) + - `:left-on` -> str, keyword: column name to join in left `data-frame`. The + requirements are the same as for `:on` + - `:right-on` -> str, keyword: column name to join in right `data-frame`. The + requirements are the same as for `:on` + - `:left-index` -> bool: index of left `data-frame` is the join key? + - `:right-index` -> bool: index of right `data-frame` is the join key? + - `:by` -> str, keyword, Iterable, `series`: match these columns before merging + - `:left-by` -> str, keyword, Iterable. `series`: as `:by` but only for left `data-frame` + - `:right-by` -> str, keyword, Iterable. `series`: as `:by` but only for right `data-frame` + - `:suffixes` -> Iterable: suffix to add to overlapping column names, must + have length 2 and the first one is `left` and second one is `right` + - `:tolerance` -> depends on key: the tolerance for merging + - `:allow-exact-matches` -> bool, default `true`: allow matching with same `:on` value? 
+ - `:direction` -> {`:backward` `:forward` `:nearest`}, default `:backward`: search for + prior, subsequent or closest matches + + **Examples** + + ``` + (def trades + (data-frame + {:time (->datetime [\"2016-05-25 13:30:00.023\" + \"2016-05-25 13:30:00.038\" + \"2016-05-25 13:30:00.048\" + \"2016-05-25 13:30:00.048\"]) + :ticker [:MSFT :MSFT :GOOG :AAPL] + :price [51.95 51.95 720.77 98.00] + :quantity [75 155 100 100]})) + + (def quotes + (data-frame + {:time (->datetime [\"2016-05-25 13:30:00.023\" + \"2016-05-25 13:30:00.023\" + \"2016-05-25 13:30:00.030\" + \"2016-05-25 13:30:00.048\" + \"2016-05-25 13:30:00.049\"]) + :ticker [:GOOG :MSFT :MSFT :GOOG :AAPL] + :bid [720.5 51.95 51.97 720.5 97.99] + :ask [720.93 51.96 51.98 720.93 98.01]})) + + (merge-asof trades quotes {:on :time}) + ;; time ticker_x price quantity ticker_y bid ask + ;; 0 2016-05-25 13:30:00.023 MSFT 51.95 75 MSFT 51.95 51.96 + ;; 1 2016-05-25 13:30:00.038 MSFT 51.95 155 MSFT 51.97 51.98 + ;; 2 2016-05-25 13:30:00.048 GOOG 720.77 100 GOOG 720.50 720.93 + ;; 3 2016-05-25 13:30:00.048 AAPL 98.00 100 GOOG 720.50 720.93 + + (merge-asof trades quotes {:on :time :allow-exact-matches false}) + ;; time ticker_x price quantity ticker_y bid ask + ;; 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN NaN + ;; 1 2016-05-25 13:30:00.038 MSFT 51.95 155 MSFT 51.97 51.98 + ;; 2 2016-05-25 13:30:00.048 GOOG 720.77 100 MSFT 51.97 51.98 + ;; 3 2016-05-25 13:30:00.048 AAPL 98.00 100 MSFT 51.97 51.98 + + (merge-asof trades quotes {:on :time :direction :forward}) + ;; time ticker_x price quantity ticker_y bid ask + ;; 0 2016-05-25 13:30:00.023 MSFT 51.95 75 GOOG 720.5 720.93 + ;; 1 2016-05-25 13:30:00.038 MSFT 51.95 155 GOOG 720.5 720.93 + ;; 2 2016-05-25 13:30:00.048 GOOG 720.77 100 GOOG 720.5 720.93 + ;; 3 2016-05-25 13:30:00.048 AAPL 98.00 100 GOOG 720.5 720.93 + + (merge-asof trades quotes {:on :time :by :ticker}) + ;; time ticker price quantity bid ask + ;; 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 + ;; 
1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 + ;; 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93 + ;; 3 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN + ``` + " + [left right & [{:keys [on left-on right-on left-index right-index by + left-by right-by suffixes tolerance + allow-exact-matches direction] + :as attrs}]] + (py/call-attr-kw u/pd "merge_asof" [left right] + (u/keys->pyargs attrs))) + +(defn concatenate + "Append `series`es and/or `data-frame`s along a wanted axis. + + **Arguments** + + - `dfs-or-srss` -> Iterable: a collection of multiple `series`/`data-frame` + + **Attrs** + + - `:axis` -> int, default 0: 0 = rows, 1 = columns + - `:join` -> {`:inner` `:outer`}, default `:outer`: the kind of join on other `:axis` + - `:ignore-index` -> bool, default `false`: whether to consider the index along + the wanted `:axis` + - `:keys` -> Iterable, default `nil`: this lets you build a hierarchical index + using the passed `:keys` as the outermost levels + - `:levels` -> Iterable, default `nil`: unique values for building a multi index + - `:names` -> Iterable, default `nil`: names of the levels in the hierarchical index + - `:verify-integrity` -> bool, default `false`: does the new `:axis` + contain duplicates? 
(P.S.: expensive operation) + - `:sort` -> bool, default `true`: sort the other `:axis` when `:join` is `:outer` + - `:copy` -> bool, default `true`: if `false` avoid copying when unnecessary + + **Examples** + + ``` + (concatenate [(series (range 3)) (series (range 3))]) + ;; 0 0 + ;; 1 1 + ;; 2 2 + ;; 0 0 + ;; 1 1 + ;; 2 2 + ;; dtype: int64 + + (concatenate [(series (range 3)) (series (range 3))] {:axis 1}) + ;; 0 1 + ;; 0 0 0 + ;; 1 1 1 + ;; 2 2 2 + + (concatenate [(data-frame {:a [1 2 3] :b [4 5 6]}) + (data-frame {:a [2 2 2] :b [3 3 3]})]) + ;; a b + ;; 0 1 4 + ;; 1 2 5 + ;; 2 3 6 + ;; 0 2 3 + ;; 1 2 3 + ;; 2 2 3 + + (concatenate [(data-frame {:a [1 2 3] :b [4 5 6]}) + (data-frame {:a [2 2 2] :b [3 3 3]})] + {:ignore-index true}) + ;; a b + ;; 0 1 4 + ;; 1 2 5 + ;; 2 3 6 + ;; 3 2 3 + ;; 4 2 3 + ;; 5 2 3 + ``` + " + [dfs-or-srss & [{:keys [axis join ignore-index keys levels + names verify-integrity sort copy] + :as attrs}]] + (u/kw-call u/pd "concat" dfs-or-srss attrs)) + +(defn aggregate + "Aggregate data using one or more functions over a given axis. + + This is very similar to `reduce`, but works on `data-frames` as well. + + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + - `how` -> keyword, str, function, Iterable: how to aggregate data. This accepts + either panthera functions strings/keywords, a list of the previous and/or user + defined functions. Check examples for more info. 
+ + **Attrs** + + - `:axis` -> {0 `:index` 1 `:columns`}, default 0: 0 = apply function along + cols; 1 = apply function along rows + - `fn-args` -> if the provided collapsing function needs arguments, just list + them freely (see examples) + + **Examples** + + ``` + (def a (data-frame + [[1, 2, 3] + [4, 5, 6] + [7, 8, 9] + [##NaN, ##NaN, ##NaN]] + {:columns [:A :B :C]})) + + (aggregate (series [1 2 3]) :sum) + ;; 6 + + (aggregate a [:sum :min]) + ;; A B C + ;; sum 12.0 15.0 18.0 + ;; min 1.0 2.0 3.0 + + ; if `how` needs arguments, you can pass them as `attrs` + (aggregate (series [1 2 3]) :cov {:other (series [4 5 6])}) + ;; 1.0 + + (aggregate (series [1 2 3]) inc) + ;; 0 2 + ;; 1 3 + ;; 2 4 + ;; dtype: int64 + ``` + " + [df-or-srs how & [{:keys [axis fn-args] :as attrs}]] + (u/kw-call df-or-srs "agg" how attrs)) + +(defn remap + "Remap values in a series. + + This is the same as using `map` on a sequence while using a map as the mapped + function: `(map {:a 1 :b 2} [:a :b]) => (1 2)` + + **Arguments** + + - `srs` -> `series` + - `mappings` -> map, function: the mapping correspondence + - `na-action` -> {`nil` `:ignore`}, default `nil`: `:ignore` doesn't pass missing + values to the `mappings` + + **Examples** + + ``` + (remap (series [:a :b :c]) {:a 1 :b 2 :c 3}) + ;; 0 1 + ;; 1 2 + ;; 2 3 + ;; dtype: int64 + + (remap (series [:a :b ##NaN]) #(str \"This is \" %)) + ;; 0 This is a + ;; 1 This is b + ;; 2 This is NaN + ;; dtype: object + + (remap (series [:a :b ##NaN]) #(str \"This is \" %) :ignore) + ;; 0 This is a + ;; 1 This is b + ;; 2 NaN + ;; dtype: object + ``` + " + [srs mappings & [na-action]] + (py/call-attr srs "map" mappings (or na-action nil))) + +(defn groupby + "Group `data-frame` or `series` by a given variable. + + Note that `groupby` does nothing by itself, this must be followed by another + operation like aggregation. 
+ + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + - `by` -> str, keyword, Iterable, map, function: it can be a column, a list of + columns, a function used to group the index, a collection of values to use as + grouping variable + + **Attrs** + + - `:axis` -> {0 `:index` 1 `:columns`}: split along columns or rows + - `:level` -> int, str, keyword, Iterable: if multiple index, group by this + or these + - `:as-index` -> bool, default `true`: when `false` this becomes basically + as the SQL group by output + - `:sort` -> bool, default `true`: if `false` you get a performance improvement + - `:group-keys` -> bool, default `true`: add group keys to index when afterwards + you call `apply` + - `:squeeze` -> bool, default `false`: reduce dimensionality of the output if possible + - `:observed` -> bool, default `false`: this only applies to Categoricals: + if `true`, only show observed values for categorical groupers, + if `false`, show all values for categorical groupers + + **Examples** + + ``` + (def a (data-frame {:animal [:falcon :falcon :parrot :parrot] + :max-speed [380 370 24 26]})) + + (-> a (r/groupby :animal) m/mean) + ;; max-speed + ;; animal + ;; falcon 375 + ;; parrot 25 + + (-> a (r/groupby :animal {:as-index false}) m/mean) + ;; animal max-speed + ;; 0 falcon 375 + ;; 1 parrot 25 + ``` + " + [df-or-srs by & [{:keys [axis level as-index sort group-keys + squeeze observed] :as attrs}]] + (u/kw-call df-or-srs "groupby" by attrs)) + +(defn rolling + "Rolling window calculations + + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + - `window` -> int, str, keyword: the size of the window. If str or keyword then + this is considered as a time offset (e.g. :2s = 2 seconds, :30D = 30 days; + check this for more options https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases) + + **Attrs** + + - `:min-periods` -> int: minimum number of observations to have a value.
For + times the default is 1, otherwise the default is `window` + - `:center` -> bool, default `false`: if `false` the result is set at the right + edge of the window, otherwise it gets centered + - `:win-type` -> str, keyword: refer to https://docs.scipy.org/doc/scipy/reference/signal.windows.html#module-scipy.signal.windows + - `:on`-> str, keyword: column to use for the rolling window, only in case this + is not the index + - `:axis` -> {0 `:index` 1 `:columns`}: split along columns or rows + - `:closed` -> {`:right` `:left` `:both` `:neither`}: where to make the interval + close + + **Examples** + ``` + (def a (data-frame {:b [0 1 2 3 4]} + {:index + (panthera.pandas.conversion/->datetime + (series + [\"20130101 09:00:00\" + \"20130101 09:00:02\" + \"20130101 09:00:03\" + \"20130101 09:00:05\" + \"20130101 09:00:06\"]))})) + + (sum (rolling a 2)) + ;; b + ;; 2013-01-01 09:00:00 NaN + ;; 2013-01-01 09:00:02 1.0 + ;; 2013-01-01 09:00:03 3.0 + ;; 2013-01-01 09:00:05 5.0 + ;; 2013-01-01 09:00:06 7.0 + + (sum (rolling a :2s)) + ;; b + ;; 2013-01-01 09:00:00 0.0 + ;; 2013-01-01 09:00:02 1.0 + ;; 2013-01-01 09:00:03 3.0 + ;; 2013-01-01 09:00:05 3.0 + ;; 2013-01-01 09:00:06 7.0 + + (sum (rolling a 2 {:win-type :triang})) + ;; b + ;; 2013-01-01 09:00:00 NaN + ;; 2013-01-01 09:00:02 0.5 + ;; 2013-01-01 09:00:03 1.5 + ;; 2013-01-01 09:00:05 2.5 + ;; 2013-01-01 09:00:06 3.5 + + (sum (rolling a 2 {:min-periods 1})) + ;; b + ;; 2013-01-01 09:00:00 0.0 + ;; 2013-01-01 09:00:02 1.0 + ;; 2013-01-01 09:00:03 3.0 + ;; 2013-01-01 09:00:05 5.0 + ;; 2013-01-01 09:00:06 7.0 + ``` + " + [df-or-srs window & [{:keys [min-periods center win-type on axis closed] + :as attrs}]] + (u/kw-call df-or-srs "rolling" window attrs)) + +(defn ewm + "Exponentially weighted functions. 
+ + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + + **Attrs** + + - `:com` -> numeric: decay in terms of center of mass + - `:span` -> numeric: decay in terms of span + - `:halflife` -> numeric: decay in terms of half-life + - `:alpha` -> numeric: smoothing factor + - `:min-periods` -> int, default 0: minimum number of observations + - `:adjust` -> bool, default `true`: divide by decaying adjustment factor + in beginning periods to account for imbalance in relative weightings + - `:ignore-na` -> bool, default `false`: ignore missing values + - `:axis` -> {0 `:index` 1 `:columns`}: use columns or rows + + **Examples** + + ``` + (def a (g/data-frame {:b [0 1 2 ##NaN 4]})) + + (-> a (ewm {:com 0.5}) mean) + ;; b + ;; 0 0.000000 + ;; 1 0.750000 + ;; 2 1.615385 + ;; 3 1.615385 + ;; 4 3.670213 + + (-> a (ewm {:span 3}) mean) + ;; b + ;; 0 0.000000 + ;; 1 0.666667 + ;; 2 1.428571 + ;; 3 1.428571 + ;; 4 3.217391 + + (-> a (ewm {:com 0.5 :ignore-na true}) mean) + ;; b + ;; 0 0.000000 + ;; 1 0.750000 + ;; 2 1.615385 + ;; 3 1.615385 + ;; 4 3.225000 + ``` + " + [df-or-srs & [{:keys [com span halflife min-periods adjust ignore-na axis] + :as attrs}]] + (u/simple-kw-call df-or-srs "ewm" attrs)) + +(defn drop + "Drop requested rows or columns. + + Remove rows or columns by specifying label names and corresponding axis, + or by specifying directly index or column names. When using a multi-index, + labels on different levels can be removed by specifying the level. 
+ + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + - `labels` -> keyword, str, numeric, Iterable: index or labels to drop + + **Attrs** + + - `:axis` -> int, default 0: 0 = rows, 1 = columns + - `:level` -> numeric, keyword, str: level to drop from multi index + - `:errors` -> {`:ignore` `:raise`}, default `:raise`: ignore or raise errors + + **Examples** + + ``` + (require-python '[numpy :as np]) + (def df + (data-frame + (np/reshape (np/arange 12) [3 4]) + {:columns [:A :B :C :D]})) + + (drop df [:B :C] {:axis 1}) + ;; A D + ;; 0 0 3 + ;; 1 4 7 + ;; 2 8 11 + + (drop df [0 1]) + ;; A B C D + ;; 2 8 9 10 11 + ``` + " + [df-or-srs labels & [{:keys [axis level errors] :as attrs}]] + (u/kw-call df-or-srs "drop" labels attrs)) + +(defn drop-rows + "A shorthand for `(drop df [0 2] {:axis 0})` + + See [[drop]] docs for more info" + [df rows & [{:keys [level errors] :as attrs}]] + (drop df rows (merge attrs {:axis 0}))) + +(defn drop-cols + "A shorthand for `(drop df [:A :C] {:axis 1})` + + See [[drop]] docs for more info" + [df cols & [{:keys [level errors] :as attrs}]] + (drop df cols (merge attrs {:axis 1}))) + +(defn dropna + "Drop missing values. + + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + + **Attrs** + + - `:axis` -> int, default 0: 0 = rows, 1 = columns + - `:how` -> {`:any` `:all`}, default `:any`: drop when there are `:any` missing + values, or `:all` missing values + - `:thresh` -> int: keep only rows/columns with at least `:thresh` non-missing values + - `:subset` -> Iterable: the subset to consider on opposite axis; e.g.
if + you drop rows `:subset` are the columns to consider for dropping + + **Examples** + + ``` + (def df + (data-frame {:name [:Alfred :Batman :Robin] + :toy [nil :Batmobile :Whip] + :born [nil :1940-04-25 nil]})) + + (dropna df) + ;; name toy born + ;; 1 Batman Batmobile 1940-04-25 + ``` + " + [df-or-srs & [{:keys [axis how thresh subset] + :as attrs}]] + (u/simple-kw-call df-or-srs "dropna" attrs)) + +(defn melt + "Unpivot a `data-frame` from wide format to long format. + + Basically reshape the `data-frame` to have one row per observation and one + column per variable + + **Arguments** + + - `df` -> `data-frame` + + **Attrs** + + - `:id-vars` -> Iterable: columns to use as identifiers + - `:value-vars` -> Iterable: columns to melt (unpivot), if not specified uses + all the columns not in `:id-vars` + - `:var-name` -> keyword, str, default `:variable`: name for the variable column + - `:value-name` -> keyword, str, default `:value`: name for the value column + - `:col-level` -> numeric, str: the level to use for melting + + **Examples** + + ``` + (def df + (transpose + (data-frame [[:a :b :c] [1 3 5] [2 4 6]] + {:columns [0 1 2] + :index [:A :B :C]}))) + + (melt df) + ;; variable value + ;; 0 A a + ;; 1 A b + ;; 2 A c + ;; 3 B 1 + ;; 4 B 3 + ;; 5 B 5 + ;; 6 C 2 + ;; 7 C 4 + ;; 8 C 6 + + (melt df {:id-vars [:A] :value-vars [:B]}) + ;; A variable value + ;; 0 a B 1 + ;; 1 b B 3 + ;; 2 c B 5 + + (melt df {:id-vars [:A] :value-vars [:B :C]}) + ;; A variable value + ;; 0 a B 1 + ;; 1 b B 3 + ;; 2 c B 5 + ;; 3 a C 2 + ;; 4 b C 4 + ;; 5 c C 6 + ``` + " + [df & [{:keys [id-vars value-vars var-name + value-name col-level] :as attrs}]] + (u/simple-kw-call df "melt" attrs)) + +(defn assign + "Assign new columns to `df-or-srs` + + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + - `cols` -> map: either a map `{:col-name value}`, or a map `{:col-name fn}` + + **Examples** + + ``` + (def df + (transpose + (data-frame [[:a :b :c] [1 3 5] [2 4 6]] + {:columns [0 1 2] +
:index [:A :B :C]}))) + + (assign df {:D 3}) + ;; A B C D + ;; 0 a 1 2 3 + ;; 1 b 3 4 3 + ;; 2 c 5 6 3 + + (assign df {:D [1 2 3]}) + ;; A B C D + ;; 0 a 1 2 1 + ;; 1 b 3 4 2 + ;; 2 c 5 6 3 + + (assign df {:D #(-> (subset-cols % :C) (mul 2))}) + ;; A B C D + ;; 0 a 1 2 4 + ;; 1 b 3 4 8 + ;; 2 c 5 6 12 + ``` + " + [df-or-srs cols] + (py/call-attr-kw df-or-srs "assign" [] cols)) + +(defn stack + "Stack the prescribed level(s) from columns to index. + + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + + **Attrs** + + - `:level` -> numeric, keyword, str, default -1: level to stack + - `:dropna` -> bool, default true: drop rows with missing values if generated + + **Examples** + + ``` + (def df + (data-frame [[0 1] [2 3]] + {:index [:cat :dog] + :columns [:weight :height]})) + + (stack df) + ;; cat weight 0 + ;; height 1 + ;; dog weight 2 + ;; height 3 + ;; dtype: int64 + ``` + " + [df-or-srs & [{:keys [level dropna] :as attrs}]] + (u/simple-kw-call df-or-srs "stack" attrs)) + +(defn unstack + "Pivot a level of the (necessarily hierarchical) index labels, + returning a DataFrame having a new level of column labels whose inner-most + level consists of the pivoted index labels. 
+ + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + + **Attrs** + + - `:level` -> numeric, keyword, str, default -1: level to unstack + - `:fill-value` -> any: replace missing values produced by `unstack` with this + + **Examples** + + ``` + (def s + (stack + (data-frame [[1 2] [3 4]] + {:index [:one :two] + :columns [:a :b]}))) + + (unstack s) + ;; a b + ;; one 1 2 + ;; two 3 4 + + (unstack s {:level 0}) + ;; one two + ;; a 1 3 + ;; b 2 4 + + (unstack (unstack s {:level 0})) + ;; one a 1 + ;; b 2 + ;; two a 3 + ;; b 4 + ;; dtype: int64 + ``` + " + [df-or-srs & [{:keys [level fill_value] :as attrs}]] + (u/simple-kw-call df-or-srs "unstack" attrs)) + +(defn transpose + "Transpose the given panthera object + + **Arguments** + + - `df-or-srs` -> `data-frame`, `series` + + **Examples** + + ``` + (def df (data-frame [[1 2 3] [4 5 6] [7 8 9]])) + + (transpose df) + ;; 0 1 2 + ;; 0 1 4 7 + ;; 1 2 5 8 + ;; 2 3 6 9 + ``` + " + [df-or-srs] + (py/get-attr df-or-srs "T")) diff --git a/src/panthera/pandas/utils.clj b/src/panthera/pandas/utils.clj index fab23f4..9e32256 100644 --- a/src/panthera/pandas/utils.clj +++ b/src/panthera/pandas/utils.clj @@ -1,305 +1,305 @@ -(ns panthera.pandas.utils - (:require - [libpython-clj.python :as py] - [libpython-clj.require :refer [require-python]] - [camel-snake-kebab.extras :as cske] - [clojure.core.memoize :as m])) - -(require-python '[builtins :as bt]) - -(defonce pd (py/import-module "pandas")) - -(defn slice - "Returns a Python slice. This is what you'd get by doing something like - `1:10` and it is similar to `(range 1 10)`, but works with everything - not only numbers, so `(slice \"a\" \"f\")` would mean - [\"a\" \"b\" \"c\" \"d\" \"e\" \"f\"]. Use this for subsetting arrays, - serieses and data-frames. 
- - Example: - - ``` - (slice) ; the empty slice, it means every index - - (slice 5) ; every index up to 5 - - (slice 3 5) ; every index from 3 to 5 - - (slice \"2019-10-11\" \"2019-12-3\") ; works with dates as well - - (slice \"a\" \"d\") ; works with strings - - (slice 1 10 2) ; every 2 values between 1 and 10 - ```" - ([] - (bt/slice nil)) - ([start] - (bt/slice start)) - ([start stop] - (bt/slice start stop)) - ([start stop incr] - (bt/slice start stop incr))) - -(defn pytype - "Return the Python type of the given objects - - Examples: - - ``` - (pytype obj) - - (pytype my-df my-srs this) - ```" - ([] nil) - ([obj] - (py/python-type obj)) - ([obj & objs] - (map pytype (concat (vector obj) objs)))) - -(def pystr->cljk - (comp - keyword - #(clojure.string/replace % #"_" "-") - #(clojure.string/replace % #" " "-"))) - -(def cljk->pystr - (comp - #(clojure.string/replace % #"-" "_") - name)) - -(def memo-key-converter - "Convert regular Clojure kebab-case keys to idiomatic - Python snake_case strings. - - Example: - - ``` - (memo-key-converter :a-key) ; \"a_key\" - ```" - (m/fifo #(if (keyword? %) (cljk->pystr %) %) {} :fifo/threshold 512)) - -(def memo-columns-converter - "Converts Python strings to idiomatic Clojure keys. - - Examples: - - ``` - (memo-columns-converter \"a_name\") ; :a-name - - (memo-columns-converter \"ALL_CAPS\") ; :ALL-CAPS - ```" - (m/fifo - #(cond - (number? %) % - (string? %) (pystr->cljk %) - (nil? %) nil - :else (mapv pystr->cljk %)) {} :fifo/threshold 512)) - -(defn vec->pylist - "Converts an iterable Clojure data structure to a Python list - - Example: - - ``` - (vec->pylist my-df) - ```" - [v] - (py/->py-list v)) - -(defn nested-vector? - "Check if the given argument is a nested vector or not. - - Example: - - ``` - (nested-vector? [[1 2] [3 4]]) - ```" - [v] - (some vector? v)) - -(defn nested-slice? - "Check if the given value contains at least one `:slice`. - - Example: - - ``` - (nested-slice? 
[(slice 3 5) (slice)]) - ```" - [v] - (some #(identical? :slice (pytype %)) v)) - -(defn vals->pylist - "Takes some values and dispatches them to the right conversion to a Python - data structure. - - Examples: - - ``` - (vals->pylist [1 2 3]) - - (vals->pylist [[1 2] [3 4]]) - - (vals->pylist [(slice 1 5) (slice)]) - ```" - [obj] - (cond - (not (coll? obj)) obj - (map? obj) obj - (nested-vector? obj) (to-array-2d obj) - (vector? obj) (if (nested-slice? obj) - obj - (py/->py-list obj)) - :else obj)) - -(defn keys->pyargs - "Takes a map as an argument and converts keys to Python strings - and values to the proper data structure. - - Examples: - - ``` - (keys->pyargs {:a 1 :a-key [1 2 3] \"c\" (slice)}) - ```" - [m] - (let [nm (reduce-kv - (fn [m k v] - (assoc m k (vals->pylist v))) - {} m)] - (cske/transform-keys memo-key-converter nm))) - -(defn series? - "Check if the given argument is a series" - [obj] - (identical? :series (pytype obj))) - -(defn data-frame? - "Check if the given argument is a data-frame" - [obj] - (identical? :data-frame (pytype obj))) - -(defrecord DATASET [id cols data shape]) - -(defn pr-lazy-dataset - [data] - (let [cnt (first (:shape data))] - (if (> cnt 4) - (conj (vec (take 5 (:data data))) '...) - (vec (:data data))))) - -(defmethod print-method DATASET [v ^java.io.Writer w] - (let [id (:id v) - cols (:cols v) - shape (:shape v) - data (pr-lazy-dataset v)] - (clojure.pprint/pprint {:id id :cols cols :data data}))) - -(defmethod print-dup DATASET [v ^java.io.Writer w] - (let [id (:id v) - cols (:cols v) - shape (:shape v) - data (pr-lazy-dataset v)] - (clojure.pprint/pprint {:shape (vec shape) :id id :cols cols :data data}))) - -(defmethod clojure.pprint/simple-dispatch DATASET [v] - (let [id (:id v) - cols (:cols v) - shape (:shape v) - data (pr-lazy-dataset v)] - (clojure.pprint/pprint {:shape (vec shape) :id id :cols cols :data data}))) - -(defmulti to-clj - (fn [obj] (identical? 
:series (py/python-type obj)))) - -(defmethod to-clj false - [obj] - (let [cnt (py/get-attr obj "shape")] - (->DATASET - (py/get-attr obj "index") - (py/get-attr obj "columns") - (lazy-seq (py/get-attr obj "values")) - cnt))) - -(defmethod to-clj true - [obj] - (let [cnt (py/get-attr obj "shape")] - (->DATASET - (py/get-attr obj "index") - (or (py/get-attr obj "name") "unnamed") - (lazy-seq (py/get-attr obj "values")) - cnt))) - -(defmulti kwrds? - (fn [obj keywords?] (boolean keywords?))) - -(defmethod kwrds? true - [obj keywords?] - (if (series? obj) - (let [nm (memo-columns-converter - (or (py/get-attr obj "name") - "unnamed"))] - (into [] (map #(assoc {} nm %)) - (vec obj))) - (let [ks (map memo-columns-converter - (py/get-attr obj "columns"))] - (into [] (map #(zipmap ks %)) - (py/get-attr obj "values"))))) - -(defmethod kwrds? false - [obj keywords?] - (if (series? obj) - (let [nm (or (py/get-attr obj "name") - "unnamed")] - (into [] (map #(assoc {} nm %)) - (vec obj))) - (let [ks (py/get-attr obj "columns")] - (into [] (map #(zipmap ks %)) - (py/get-attr obj "values"))))) - -(defn ->clj - "Convert the given panthera data-frame or series to a Clojure vector of maps. - The idea is to have a common, simple and fast access point to conversion of - the main data structures between languages. - - - `series`: a `series` gets converted to a vector of maps with only one key and - one value. If the series has a name that becomes the key of the maps, - otherwise `->clj` falls back to the `:unnamed` key. - - `data-frame`: a `data-frame` is converted to a vector of maps with names - of the columns as keys and values as the corresponding row/column value. - - With the default method you might incur a data loss: the index doesn't get - converted and in case you're using a hierarchical index you get only one level - out of it. To keep everything in one place you have to make `full?` true, in - this way you get back a map with keys `{:id :cols :data}`. 
- - **Arguments** - - - `df-or-srs` -> `data-frame` or `series` - - `:full?` -> whether to use the full conversion, default false - - `:keywords?` -> wether to convert column names to keywords, default true - - N.B.: `:full?` usage excludes `:keywords?` - - **Examples** - - ``` - (->clj my-srs) - - (->clj my-df) - ``` - " - [df-or-srs & {:keys [full? keywords?] :or {keywords? true}}] - (if full? - (to-clj df-or-srs) - (kwrds? df-or-srs keywords?))) - -(defn simple-kw-call - "Helper for a cleaner access to `call-attr-kw` from `libpython-clj`" - [df kw & [attrs]] - (py/call-attr-kw df kw [] - (keys->pyargs attrs))) - -(defn kw-call - "Helper for a cleaner access to `call-attr-kw` from `libpython-clj`" - [df kw pos & [attrs]] - (py/call-attr-kw df kw [(vals->pylist pos)] - (keys->pyargs attrs))) +(ns panthera.pandas.utils + (:require + [libpython-clj.python :as py] + [libpython-clj.require :refer [require-python]] + [camel-snake-kebab.extras :as cske] + [clojure.core.memoize :as m])) + +(defonce builtins (py/import-module "builtins")) + +(defonce pd (py/import-module "pandas")) + +(defn slice + "Returns a Python slice. This is what you'd get by doing something like + `1:10` and it is similar to `(range 1 10)`, but works with everything + not only numbers, so `(slice \"a\" \"f\")` would mean + [\"a\" \"b\" \"c\" \"d\" \"e\" \"f\"]. Use this for subsetting arrays, + serieses and data-frames. 
+ + Example: + + ``` + (slice) ; the empty slice, it means every index + + (slice 5) ; every index up to 5 + + (slice 3 5) ; every index from 3 to 5 + + (slice \"2019-10-11\" \"2019-12-3\") ; works with dates as well + + (slice \"a\" \"d\") ; works with strings + + (slice 1 10 2) ; every 2 values between 1 and 10 + ```" + ([] + (py/call-attr builtins "slice" nil)) + ([start] + (py/call-attr builtins "slice" start)) + ([start stop] + (py/call-attr builtins "slice" start stop)) + ([start stop incr] + (py/call-attr builtins "slice" start stop incr))) + +(defn pytype + "Return the Python type of the given objects + + Examples: + + ``` + (pytype obj) + + (pytype my-df my-srs this) + ```" + ([] nil) + ([obj] + (py/python-type obj)) + ([obj & objs] + (map pytype (concat (vector obj) objs)))) + +(def pystr->cljk + (comp + keyword + #(clojure.string/replace % #"_" "-") + #(clojure.string/replace % #" " "-"))) + +(def cljk->pystr + (comp + #(clojure.string/replace % #"-" "_") + name)) + +(def memo-key-converter + "Convert regular Clojure kebab-case keys to idiomatic + Python snake_case strings. + + Example: + + ``` + (memo-key-converter :a-key) ; \"a_key\" + ```" + (m/fifo #(if (keyword? %) (cljk->pystr %) %) {} :fifo/threshold 512)) + +(def memo-columns-converter + "Converts Python strings to idiomatic Clojure keys. + + Examples: + + ``` + (memo-columns-converter \"a_name\") ; :a-name + + (memo-columns-converter \"ALL_CAPS\") ; :ALL-CAPS + ```" + (m/fifo + #(cond + (number? %) % + (string? %) (pystr->cljk %) + (nil? %) nil + :else (mapv pystr->cljk %)) {} :fifo/threshold 512)) + +(defn vec->pylist + "Converts an iterable Clojure data structure to a Python list + + Example: + + ``` + (vec->pylist my-df) + ```" + [v] + (py/->py-list v)) + +(defn nested-vector? + "Check if the given argument is a nested vector or not. + + Example: + + ``` + (nested-vector? [[1 2] [3 4]]) + ```" + [v] + (some vector? v)) + +(defn nested-slice? 
+ "Check if the given value contains at least one `:slice`. + + Example: + + ``` + (nested-slice? [(slice 3 5) (slice)]) + ```" + [v] + (some #(identical? :slice (pytype %)) v)) + +(defn vals->pylist + "Takes some values and dispatches them to the right conversion to a Python + data structure. + + Examples: + + ``` + (vals->pylist [1 2 3]) + + (vals->pylist [[1 2] [3 4]]) + + (vals->pylist [(slice 1 5) (slice)]) + ```" + [obj] + (cond + (not (coll? obj)) obj + (map? obj) obj + (nested-vector? obj) (to-array-2d obj) + (vector? obj) (if (nested-slice? obj) + obj + (py/->py-list obj)) + :else obj)) + +(defn keys->pyargs + "Takes a map as an argument and converts keys to Python strings + and values to the proper data structure. + + Examples: + + ``` + (keys->pyargs {:a 1 :a-key [1 2 3] \"c\" (slice)}) + ```" + [m] + (let [nm (reduce-kv + (fn [m k v] + (assoc m k (vals->pylist v))) + {} m)] + (cske/transform-keys memo-key-converter nm))) + +(defn series? + "Check if the given argument is a series" + [obj] + (identical? :series (pytype obj))) + +(defn data-frame? + "Check if the given argument is a data-frame" + [obj] + (identical? :data-frame (pytype obj))) + +(defrecord DATASET [id cols data shape]) + +(defn pr-lazy-dataset + [data] + (let [cnt (first (:shape data))] + (if (> cnt 4) + (conj (vec (take 5 (:data data))) '...) 
+ (vec (:data data))))) + +(defmethod print-method DATASET [v ^java.io.Writer w] + (let [id (:id v) + cols (:cols v) + shape (:shape v) + data (pr-lazy-dataset v)] + (clojure.pprint/pprint {:id id :cols cols :data data}))) + +(defmethod print-dup DATASET [v ^java.io.Writer w] + (let [id (:id v) + cols (:cols v) + shape (:shape v) + data (pr-lazy-dataset v)] + (clojure.pprint/pprint {:shape (vec shape) :id id :cols cols :data data}))) + +(defmethod clojure.pprint/simple-dispatch DATASET [v] + (let [id (:id v) + cols (:cols v) + shape (:shape v) + data (pr-lazy-dataset v)] + (clojure.pprint/pprint {:shape (vec shape) :id id :cols cols :data data}))) + +(defmulti to-clj + (fn [obj] (identical? :series (py/python-type obj)))) + +(defmethod to-clj false + [obj] + (let [cnt (py/get-attr obj "shape")] + (->DATASET + (py/get-attr obj "index") + (py/get-attr obj "columns") + (lazy-seq (py/get-attr obj "values")) + cnt))) + +(defmethod to-clj true + [obj] + (let [cnt (py/get-attr obj "shape")] + (->DATASET + (py/get-attr obj "index") + (or (py/get-attr obj "name") "unnamed") + (lazy-seq (py/get-attr obj "values")) + cnt))) + +(defmulti kwrds? + (fn [obj keywords?] (boolean keywords?))) + +(defmethod kwrds? true + [obj keywords?] + (if (series? obj) + (let [nm (memo-columns-converter + (or (py/get-attr obj "name") + "unnamed"))] + (into [] (map #(assoc {} nm %)) + (vec obj))) + (let [ks (map memo-columns-converter + (py/get-attr obj "columns"))] + (into [] (map #(zipmap ks %)) + (py/get-attr obj "values"))))) + +(defmethod kwrds? false + [obj keywords?] + (if (series? obj) + (let [nm (or (py/get-attr obj "name") + "unnamed")] + (into [] (map #(assoc {} nm %)) + (vec obj))) + (let [ks (py/get-attr obj "columns")] + (into [] (map #(zipmap ks %)) + (py/get-attr obj "values"))))) + +(defn ->clj + "Convert the given panthera data-frame or series to a Clojure vector of maps. 
+ The idea is to have a common, simple and fast access point to conversion of + the main data structures between languages. + + - `series`: a `series` gets converted to a vector of maps with only one key and + one value. If the series has a name that becomes the key of the maps, + otherwise `->clj` falls back to the `:unnamed` key. + - `data-frame`: a `data-frame` is converted to a vector of maps with names + of the columns as keys and values as the corresponding row/column value. + + With the default method you might incur a data loss: the index doesn't get + converted and in case you're using a hierarchical index you get only one level + out of it. To keep everything in one place you have to make `full?` true, in + this way you get back a map with keys `{:id :cols :data}`. + + **Arguments** + + - `df-or-srs` -> `data-frame` or `series` + - `:full?` -> whether to use the full conversion, default false + - `:keywords?` -> whether to convert column names to keywords, default true + + N.B.: `:full?` usage excludes `:keywords?` + + **Examples** + + ``` + (->clj my-srs) + + (->clj my-df) + ``` + " + [df-or-srs & {:keys [full? keywords?] :or {keywords? true}}] + (if full? + (to-clj df-or-srs) + (kwrds? df-or-srs keywords?))) + +(defn simple-kw-call + "Helper for a cleaner access to `call-attr-kw` from `libpython-clj`" + [df kw & [attrs]] + (py/call-attr-kw df kw [] + (keys->pyargs attrs))) + +(defn kw-call + "Helper for a cleaner access to `call-attr-kw` from `libpython-clj`" + [df kw pos & [attrs]] + (py/call-attr-kw df kw [(vals->pylist pos)] + (keys->pyargs attrs))) diff --git a/src/panthera/panthera.clj b/src/panthera/panthera.clj index 79bf6b4..1fbb13f 100644 --- a/src/panthera/panthera.clj +++ b/src/panthera/panthera.clj @@ -1,144 +1,146 @@ -(ns panthera.panthera - (:refer-clojure - :exclude [mod any?
drop]) - (:require - [tech.parallel.utils :refer [export-symbols]] - [panthera.pandas.generics] - [panthera.pandas.math] - [panthera.pandas.utils] - [panthera.pandas.conversion] - [panthera.pandas.reshape])) - -(export-symbols - panthera.pandas.generics - n-rows - one-hot - hasnans? - swap-level - cross-section - n-unique - n-smallest - any? - subset-cols - n-largest - names - read-csv - select-rows - unique - filter-rows - dtype - value-counts - index - series - all? - read-excel - set-index - to-csv - data-frame - subset-rows - decreasing? - n-cols - head - increasing? - memory-usage - values - tail - reset-index - unique? - not-na? - shape - fill-na - nbytes - ftype - rename - to-excel) - -(export-symbols - panthera.pandas.math - dot - ne - quantile - kurtosis - lt - std - le - add - sum - diff - same? - ge - cumprod - clip - cumsum - eq - mean - corr - sub - mod - pow - skew - rank - maximum - mode - between - pct-change - cummin - cnt - cummax - ops - autocorr - cov - div - round - mul - sem - var - abs - median - gt - minimum - describe - mean-abs-dev - floor-div) - -(export-symbols - panthera.pandas.utils - pytype - slice - ->clj - series? - data-frame?) - -(export-symbols - panthera.pandas.conversion - ->timedelta - date-range - astype - ->numeric - timedelta-range - infer-time-freq - ->datetime - interval-range) - -(export-symbols - panthera.pandas.reshape - pivot - aggregate - crosstab - cut - rolling - unstack - concatenate - remap - transpose - qcut - merge-ordered - dropna - merge-asof - assign - ewm - groupby - melt - drop - drop-rows - drop-cols) +(ns panthera.panthera + (:refer-clojure + :exclude [mod any? drop]) + (:require + [tech.parallel.utils :refer [export-symbols]] + [panthera.pandas.generics] + [panthera.pandas.math] + [panthera.pandas.utils] + [panthera.pandas.conversion] + [panthera.pandas.reshape])) + +(export-symbols + panthera.pandas.generics + n-rows + one-hot + hasnans? + swap-level + cross-section + n-unique + n-smallest + any? 
+ subset-cols + n-largest + names + read-csv + select-rows + unique + filter-rows + dtype + value-counts + index + series + all? + read-excel + factorize + to-excel + set-index + to-csv + data-frame + subset-rows + rename + decreasing? + n-cols + head + increasing? + memory-usage + values + tail + reset-index + unique? + not-na? + shape + fill-na + nbytes + ftype) + +(export-symbols + panthera.pandas.math + dot + ne + quantile + kurtosis + lt + std + le + add + sum + diff + same? + ge + cumprod + clip + cumsum + eq + mean + corr + sub + mod + pow + skew + rank + maximum + mode + between + pct-change + cummin + cnt + cummax + ops + autocorr + cov + div + round + mul + sem + var + abs + median + gt + minimum + describe + mean-abs-dev + floor-div) + +(export-symbols + panthera.pandas.utils + pytype + slice + ->clj + series? + data-frame?) + +(export-symbols + panthera.pandas.conversion + ->timedelta + date-range + astype + ->numeric + timedelta-range + infer-time-freq + ->datetime + interval-range) + +(export-symbols + panthera.pandas.reshape + pivot + aggregate + crosstab + cut + rolling + unstack + concatenate + remap + transpose + drop-cols + qcut + drop + merge-ordered + drop-rows + dropna + merge-asof + assign + ewm + groupby + stack + melt) diff --git a/test/panthera/config.clj b/test/panthera/config.clj index ea80a13..1c833f6 100644 --- a/test/panthera/config.clj +++ b/test/panthera/config.clj @@ -1,8 +1,8 @@ -(ns panthera.config - (:require - [libpython-clj.python :as py])) - -(defn start-python! - [f] - (py/initialize!) - (f)) +(ns panthera.config + (:require + [libpython-clj.python :as py])) + +(defn start-python! + [f] + (py/initialize!) 
+ (f)) diff --git a/test/panthera/generics_test.clj b/test/panthera/generics_test.clj index ea56a45..b18b695 100644 --- a/test/panthera/generics_test.clj +++ b/test/panthera/generics_test.clj @@ -1,386 +1,386 @@ -(ns panthera.generics-test - (:require - [clojure.test :refer :all] - [libpython-clj.python :as py] - [panthera.pandas.generics :as g :reload true] - [panthera.pandas.utils :as u :reload true] - [panthera.pandas.math :as m])) - -(deftest series - (are [i m] - (u/series? (g/series i m)) - [] {} - [] {:name :test} - [1 2 3] {} - 1 {} - ["1" "2"] {} - ["1" "2"] {:dtype :float32}) - (are [i m o] - (= (vec (g/series i m)) o) - [] {} [] - [] {:name :test} [] - [1 2 3] {} [1 2 3] - [:a :b] {} ["a" "b"] - ["a" "b"] {} ["a" "b"] - [1 2] {:dtype :str} ["1" "2"] - ["1" "2"] {:dtype :float32} [1.0 2.0])) - -(deftest data-frame - (are [i m] - (u/data-frame? (g/data-frame i m)) - [{:a 1 :b 2}] {} - (to-array-2d [[1 2] [3 4]]) {} - (to-array-2d [[1 2] [3 4]]) {:columns [:a :b]} - (to-array-2d [[1 2] [3 4]]) {:dtype :int8}) - (are [i m o] - (= (u/->clj (g/data-frame i m)) o) - [] {} [] - [] {:columns [:a :b]} [] - [{:a 1 :b 2} {:a 1 :b 2}] {} [{:a 1 :b 2} {:a 1 :b 2}] - [{:a "1" :b 2} {:a "3" :b 2}] {} [{:a "1" :b 2} {:a "3" :b 2}] - - [{:a "1" :b 2} {:a "3" :b 2}] - {:dtype :float32} - [{:a 1.0 :b 2.0} {:a 3.0 :b 2.0}] - - [{:a "1" :b 2} {:a "3" :b 2}] - {:dtype :str} - [{:a "1" :b "2"} {:a "3" :b "2"}] - - (to-array-2d [[1 2] [3 4]]) {} [{0 1 1 2} {0 3 1 4}] - (to-array-2d [[1 2] [3 4]]) - {:columns [:a :b]} [{:a 1 :b 2} {:a 3 :b 4}])) - -(deftest one-hot - (are [i m o] - (= (u/->clj (g/one-hot (g/series i) m)) o) - [] {} [] - ["a" "b"] {} [{:a 1 - :b 0} - {:a 0 - :b 1}] - ["a" "b"] {:prefix "pre"} [{:pre-a 1 - :pre-b 0} - {:pre-a 0 - :pre-b 1}]) - (are [i m o] - (= (u/->clj (g/one-hot (g/data-frame i) - {:columns m})) o) - - [{:a 1 :b "c"} {:a 2 :b "d"}] - [:b] - [{:a 1 - :b-c 1 - :b-d 0} - {:a 2 - :b-c 0 - :b-d 1}] - - [{:a 1 :b "c" :c 1} {:a 2 :b "d" :c 2}] - [:b :c] 
- [{:a 1 - :b-c 1 - :b-d 0 - :c-1 1 - :c-2 0} - {:a 2 - :b-c 0 - :b-d 1 - :c-1 0 - :c-2 1}])) - -(deftest unique - (are [i o] - (= (vec (g/unique i)) o) - [] [] - [1 1] [1] - [:a :b :a] ["a" "b"] - [1 -1 1] [1 -1])) - -(deftest index - (are [i o] - (= (vec (g/index i)) o) - (g/series []) [] - (g/series [1 2 3]) [0 1 2] - (g/series [1 2] {:index [100 1000]}) [100 1000])) - -(deftest values - (are [i o] - (= (vec (g/values i)) o) - (g/series []) [] - (g/series [1 2 3]) [1 2 3]) - (is (= (mapv vec (g/values (g/data-frame (to-array-2d [[1 2] [3 4]])))) - [[1 2] [3 4]]))) - -(deftest shape - (are [i o] - (= (vec (g/shape i)) o) - (g/series []) [0] - (g/series [1 2 3]) [3] - (g/data-frame (to-array-2d [[1 2] [3 4]])) [2 2])) - -(deftest hasnans? - (are [i o] - (= (g/hasnans? i) o) - (g/series []) false - (g/series [nil]) true - (g/series [1 2 nil]) true)) - -(deftest subset-rows - (are [s o] - (= (u/->clj (apply g/subset-rows - (g/data-frame (->> (range 1 11) - (partition 2) - to-array-2d)) s)) o) - [] (u/->clj (g/data-frame (->> (range 1 11) - (partition 2) - to-array-2d))) - [1] [{0 1 1 2}] - [1 3] [{0 3 1 4} {0 5 1 6}] - [1 3 2] [{0 3 1 4}])) - -(deftest cross-section - (are [k o] - (= (vec - (g/cross-section - (g/series (range 5) - {:index [:a :b :b :c :a]}) k)) - o) - :a [0 4] - :b [1 2])) - -(deftest head - (are [n o] - (= (u/->clj - (g/head - (g/data-frame - (vec - (flatten - (repeat 5 [{:a 1 :b 2} - {:a 2 :b 3}])))) n)) - o) - nil (drop-last (flatten - (repeat 3 [{:a 1 :b 2} - {:a 2 :b 3}]))) - 1 [{:a 1 :b 2}] - 8 (flatten - (repeat 4 [{:a 1 :b 2} - {:a 2 :b 3}])))) - -(deftest subset-cols - (are [i cols o] - (= (u/->clj - (apply - g/subset-cols - (g/data-frame i) - cols)) - o) - [{:a 1}] [:a] [{:a 1}] - [{:a 1 :b 2 :c 3}] [:a :c] [{:a 1 :c 3}] - (vec (repeat 5 {:a 1 :b 2})) [:b] (vec (repeat 5 {:b 2})))) - -(deftest n-largest - (are [m o] - (= (vec - (g/n-largest - (g/series (range 20)) m)) - o) - {:n 5} (range 19 14 -1) - {:n 3} [19 18 17] - {:n 8} (range 19 11 
-1))) - -(deftest n-smallest - (are [m o] - (= (vec - (g/n-smallest - (g/series (range 20)) m)) - o) - {:n 5} (range 5) - {:n 3} (range 3) - {:n 8} (range 8))) - -(deftest n-unique - (are [i o] - (= (g/n-unique - (g/series i)) - o) - (range 10) 10 - [1 1 2] 2 - [11 nil 3] 2)) - -(deftest unique? - (are [i o] - (= (g/unique? i) o) - [] true - [1 2 3] true - [1 1] false - [-1 1] true - [1 nil] true - ["a" "b"] true - (g/series [1 1]) false)) - -(deftest increasing? - (are [i o] - (= (g/increasing? i) o) - [] true - [1 5 9] true - [1 nil 3] false - [1 1 1 1] true - [3 2 1] false)) - -(deftest decreasing? - (are [i o] - (= (g/decreasing? i) o) - [] true - [9 7 1] true - [3 nil 1] false - [3 3 3] true - [1 2 3] false)) - -(deftest value-counts - (are [i m o] - (= (g/value-counts i (merge {:clj true} m)) o) - [] {} {} - [1 1 2] {} {1 2 2 1} - [:a :a :b :c] {} {:a 2 :b 1 :c 1} - (repeat 50 :a) {} {:a 50} - [:a :a :b :c] {:normalize true} {:a 0.5 :b 0.25 :c 0.25})) - -(deftest reset-index - (are [i m o] - (= (u/->clj (g/reset-index (g/series i) m)) o) - (range 3) {} [{:index 0 0 0} - {:index 1 0 1} - {:index 2 0 2}] - (range 3) {:drop true} [{:unnamed 0} - {:unnamed 1} - {:unnamed 2}] - (range 3) {:name "col"} [{:index 0 :col 0} - {:index 1 :col 1} - {:index 2 :col 2}])) - -(deftest names - (are [i o] - (= (g/names i) o) - (g/series [1 2]) nil - (g/series [1 2] {:name "name"}) "name" - (g/series [1 2] {:name :my-name}) "my-name") - (are [i o] - (= (vec (g/names (g/data-frame i))) o) - [{:a 1 :b 2}] ["a" "b"] - [{"a name" 1 :c 2}] ["a name" "c"] - [{123 1 1/5 3}] [123.0 0.2])) - -(deftest filter-rows - (are [i b o] - (= (u/->clj - (g/filter-rows i b)) o) - (g/series (range 10)) #(m/gt % 5) [{:unnamed 6} - {:unnamed 7} - {:unnamed 8} - {:unnamed 9}] - (g/series (range 4)) [false true false true] [{:unnamed 1} - {:unnamed 3}] - - (g/data-frame [{:a 1 :b 2} - {:a 3 :b 4}]) - #(-> % - (g/subset-cols :a) - (m/lt 3) - g/values) - [{:a 1 :b 2}] - - (g/data-frame [{:a 1 :b 2} - {:a 
3 :b 4} - {:a 4 :b 5}]) - [true false false] - [{:a 1 :b 2}])) - -(deftest tail - (are [i n o] - (= (u/->clj - (g/tail i n)) - o) - (g/series (range 20)) nil [{:unnamed 15} - {:unnamed 16} - {:unnamed 17} - {:unnamed 18} - {:unnamed 19}] - (g/series (range 20)) 2 [{:unnamed 18} {:unnamed 19}] - (g/data-frame (vec (repeat 10 {:a 1 :b 2}))) nil (repeat 5 {:a 1 :b 2}) - (g/data-frame (vec (repeat 10 {:a 1 :b 2}))) 2 (repeat 2 {:a 1 :b 2}))) - -(deftest fill-na - (are [v m o] - (= (vec - (g/fill-na (g/series [1 nil 2 nil]) v m)) o) - 3 {} [1.0 3.0 2.0 3.0] - "a" {} [1.0 "a" 2.0 "a"] - nil {:method :ffill} [1.0 1.0 2.0 2.0])) - -(deftest select-rows - (are [i id l h o] - (= (u/->clj - (g/select-rows - (g/data-frame i (or {:index l} {})) - id h)) - o) - (to-array-2d (partition 2 (range 20))) - [] - nil - nil - [] - - (to-array-2d (partition 2 (range 20))) - [0 3] - nil - nil - [{0 0 1 1} {0 6 1 7}] - - (to-array-2d (partition 2 (range 10))) - [0 3] - [:a :b :c :d :e] - nil - [{0 0 1 1} {0 6 1 7}] - - (to-array-2d (partition 2 (range 10))) - [0 3] - nil - :loc - [{0 0 1 1} {0 6 1 7}] - - (to-array-2d (partition 2 (range 10))) - [:a :d] - [:a :b :c :d :e] - :loc - [{0 0 1 1} {0 6 1 7}] - - (to-array-2d (partition 2 (range 10))) - (u/slice 3) - nil - nil - [{0 0 1 1} {0 2 1 3} {0 4 1 5}] - - (to-array-2d (partition 4 (range 20))) - [(u/slice 2) (u/slice 1)] - nil - :loc - [{0 0 1 1} {0 4 1 5} {0 8 1 9}])) - -(deftest set-index - (are [idx m oid ov] - (and (= (vec - (g/index - (g/set-index - (g/data-frame [{:a 1 :b 2 :c 3} {:a 2 :b 3 :c 4}]) - idx m))) - oid) - (= (u/->clj - (g/set-index - (g/data-frame [{:a 1 :b 2 :c 3} {:a 2 :b 3 :c 4}]) - idx m)) - ov)) - [:a] {} [1 2] [{:b 2 :c 3} {:b 3 :c 4}] - [:a :b] {} [[1 2] [2 3]] [{:c 3} {:c 4}] - [:a] {:drop false} [1 2] [{:a 1 :b 2 :c 3} {:a 2 :b 3 :c 4}] - [:a] {:append true} [[0 1] [1 2]] [{:b 2 :c 3} {:b 3 :c 4}])) +(ns panthera.generics-test + (:require + [clojure.test :refer :all] + [libpython-clj.python :as py] + 
[panthera.pandas.generics :as g :reload true] + [panthera.pandas.utils :as u :reload true] + [panthera.pandas.math :as m])) + +(deftest series + (are [i m] + (u/series? (g/series i m)) + [] {} + [] {:name :test} + [1 2 3] {} + 1 {} + ["1" "2"] {} + ["1" "2"] {:dtype :float32}) + (are [i m o] + (= (vec (g/series i m)) o) + [] {} [] + [] {:name :test} [] + [1 2 3] {} [1 2 3] + [:a :b] {} ["a" "b"] + ["a" "b"] {} ["a" "b"] + [1 2] {:dtype :str} ["1" "2"] + ["1" "2"] {:dtype :float32} [1.0 2.0])) + +(deftest data-frame + (are [i m] + (u/data-frame? (g/data-frame i m)) + [{:a 1 :b 2}] {} + (to-array-2d [[1 2] [3 4]]) {} + (to-array-2d [[1 2] [3 4]]) {:columns [:a :b]} + (to-array-2d [[1 2] [3 4]]) {:dtype :int8}) + (are [i m o] + (= (u/->clj (g/data-frame i m)) o) + [] {} [] + [] {:columns [:a :b]} [] + [{:a 1 :b 2} {:a 1 :b 2}] {} [{:a 1 :b 2} {:a 1 :b 2}] + [{:a "1" :b 2} {:a "3" :b 2}] {} [{:a "1" :b 2} {:a "3" :b 2}] + + [{:a "1" :b 2} {:a "3" :b 2}] + {:dtype :float32} + [{:a 1.0 :b 2.0} {:a 3.0 :b 2.0}] + + [{:a "1" :b 2} {:a "3" :b 2}] + {:dtype :str} + [{:a "1" :b "2"} {:a "3" :b "2"}] + + (to-array-2d [[1 2] [3 4]]) {} [{0 1 1 2} {0 3 1 4}] + (to-array-2d [[1 2] [3 4]]) + {:columns [:a :b]} [{:a 1 :b 2} {:a 3 :b 4}])) + +(deftest one-hot + (are [i m o] + (= (u/->clj (g/one-hot (g/series i) m)) o) + [] {} [] + ["a" "b"] {} [{:a 1 + :b 0} + {:a 0 + :b 1}] + ["a" "b"] {:prefix "pre"} [{:pre-a 1 + :pre-b 0} + {:pre-a 0 + :pre-b 1}]) + (are [i m o] + (= (u/->clj (g/one-hot (g/data-frame i) + {:columns m})) o) + + [{:a 1 :b "c"} {:a 2 :b "d"}] + [:b] + [{:a 1 + :b-c 1 + :b-d 0} + {:a 2 + :b-c 0 + :b-d 1}] + + [{:a 1 :b "c" :c 1} {:a 2 :b "d" :c 2}] + [:b :c] + [{:a 1 + :b-c 1 + :b-d 0 + :c-1 1 + :c-2 0} + {:a 2 + :b-c 0 + :b-d 1 + :c-1 0 + :c-2 1}])) + +(deftest unique + (are [i o] + (= (vec (g/unique i)) o) + [] [] + [1 1] [1] + [:a :b :a] ["a" "b"] + [1 -1 1] [1 -1])) + +(deftest index + (are [i o] + (= (vec (g/index i)) o) + (g/series []) [] + (g/series [1 2 3]) 
[0 1 2] + (g/series [1 2] {:index [100 1000]}) [100 1000])) + +(deftest values + (are [i o] + (= (vec (g/values i)) o) + (g/series []) [] + (g/series [1 2 3]) [1 2 3]) + (is (= (mapv vec (g/values (g/data-frame (to-array-2d [[1 2] [3 4]])))) + [[1 2] [3 4]]))) + +(deftest shape + (are [i o] + (= (vec (g/shape i)) o) + (g/series []) [0] + (g/series [1 2 3]) [3] + (g/data-frame (to-array-2d [[1 2] [3 4]])) [2 2])) + +(deftest hasnans? + (are [i o] + (= (g/hasnans? i) o) + (g/series []) false + (g/series [nil]) true + (g/series [1 2 nil]) true)) + +(deftest subset-rows + (are [s o] + (= (u/->clj (apply g/subset-rows + (g/data-frame (->> (range 1 11) + (partition 2) + to-array-2d)) s)) o) + [] (u/->clj (g/data-frame (->> (range 1 11) + (partition 2) + to-array-2d))) + [1] [{0 1 1 2}] + [1 3] [{0 3 1 4} {0 5 1 6}] + [1 3 2] [{0 3 1 4}])) + +(deftest cross-section + (are [k o] + (= (vec + (g/cross-section + (g/series (range 5) + {:index [:a :b :b :c :a]}) k)) + o) + :a [0 4] + :b [1 2])) + +(deftest head + (are [n o] + (= (u/->clj + (g/head + (g/data-frame + (vec + (flatten + (repeat 5 [{:a 1 :b 2} + {:a 2 :b 3}])))) n)) + o) + nil (drop-last (flatten + (repeat 3 [{:a 1 :b 2} + {:a 2 :b 3}]))) + 1 [{:a 1 :b 2}] + 8 (flatten + (repeat 4 [{:a 1 :b 2} + {:a 2 :b 3}])))) + +(deftest subset-cols + (are [i cols o] + (= (u/->clj + (apply + g/subset-cols + (g/data-frame i) + cols)) + o) + [{:a 1}] [:a] [{:a 1}] + [{:a 1 :b 2 :c 3}] [:a :c] [{:a 1 :c 3}] + (vec (repeat 5 {:a 1 :b 2})) [:b] (vec (repeat 5 {:b 2})))) + +(deftest n-largest + (are [m o] + (= (vec + (g/n-largest + (g/series (range 20)) m)) + o) + {:n 5} (range 19 14 -1) + {:n 3} [19 18 17] + {:n 8} (range 19 11 -1))) + +(deftest n-smallest + (are [m o] + (= (vec + (g/n-smallest + (g/series (range 20)) m)) + o) + {:n 5} (range 5) + {:n 3} (range 3) + {:n 8} (range 8))) + +(deftest n-unique + (are [i o] + (= (g/n-unique + (g/series i)) + o) + (range 10) 10 + [1 1 2] 2 + [11 nil 3] 2)) + +(deftest unique? 
+ (are [i o] + (= (g/unique? i) o) + [] true + [1 2 3] true + [1 1] false + [-1 1] true + [1 nil] true + ["a" "b"] true + (g/series [1 1]) false)) + +(deftest increasing? + (are [i o] + (= (g/increasing? i) o) + [] true + [1 5 9] true + [1 nil 3] false + [1 1 1 1] true + [3 2 1] false)) + +(deftest decreasing? + (are [i o] + (= (g/decreasing? i) o) + [] true + [9 7 1] true + [3 nil 1] false + [3 3 3] true + [1 2 3] false)) + +(deftest value-counts + (are [i m o] + (= (g/value-counts i (merge {:clj true} m)) o) + [] {} {} + [1 1 2] {} {1 2 2 1} + [:a :a :b :c] {} {:a 2 :b 1 :c 1} + (repeat 50 :a) {} {:a 50} + [:a :a :b :c] {:normalize true} {:a 0.5 :b 0.25 :c 0.25})) + +(deftest reset-index + (are [i m o] + (= (u/->clj (g/reset-index (g/series i) m)) o) + (range 3) {} [{:index 0 0 0} + {:index 1 0 1} + {:index 2 0 2}] + (range 3) {:drop true} [{:unnamed 0} + {:unnamed 1} + {:unnamed 2}] + (range 3) {:name "col"} [{:index 0 :col 0} + {:index 1 :col 1} + {:index 2 :col 2}])) + +(deftest names + (are [i o] + (= (g/names i) o) + (g/series [1 2]) nil + (g/series [1 2] {:name "name"}) "name" + (g/series [1 2] {:name :my-name}) "my-name") + (are [i o] + (= (vec (g/names (g/data-frame i))) o) + [{:a 1 :b 2}] ["a" "b"] + [{"a name" 1 :c 2}] ["a name" "c"] + [{123 1 1/5 3}] [123.0 0.2])) + +(deftest filter-rows + (are [i b o] + (= (u/->clj + (g/filter-rows i b)) o) + (g/series (range 10)) #(m/gt % 5) [{:unnamed 6} + {:unnamed 7} + {:unnamed 8} + {:unnamed 9}] + (g/series (range 4)) [false true false true] [{:unnamed 1} + {:unnamed 3}] + + (g/data-frame [{:a 1 :b 2} + {:a 3 :b 4}]) + #(-> % + (g/subset-cols :a) + (m/lt 3) + g/values) + [{:a 1 :b 2}] + + (g/data-frame [{:a 1 :b 2} + {:a 3 :b 4} + {:a 4 :b 5}]) + [true false false] + [{:a 1 :b 2}])) + +(deftest tail + (are [i n o] + (= (u/->clj + (g/tail i n)) + o) + (g/series (range 20)) nil [{:unnamed 15} + {:unnamed 16} + {:unnamed 17} + {:unnamed 18} + {:unnamed 19}] + (g/series (range 20)) 2 [{:unnamed 18} {:unnamed 19}] + 
(g/data-frame (vec (repeat 10 {:a 1 :b 2}))) nil (repeat 5 {:a 1 :b 2}) + (g/data-frame (vec (repeat 10 {:a 1 :b 2}))) 2 (repeat 2 {:a 1 :b 2}))) + +(deftest fill-na + (are [v m o] + (= (vec + (g/fill-na (g/series [1 nil 2 nil]) v m)) o) + 3 {} [1.0 3.0 2.0 3.0] + "a" {} [1.0 "a" 2.0 "a"] + nil {:method :ffill} [1.0 1.0 2.0 2.0])) + +(deftest select-rows + (are [i id l h o] + (= (u/->clj + (g/select-rows + (g/data-frame i (or {:index l} {})) + id h)) + o) + (to-array-2d (partition 2 (range 20))) + [] + nil + nil + [] + + (to-array-2d (partition 2 (range 20))) + [0 3] + nil + nil + [{0 0 1 1} {0 6 1 7}] + + (to-array-2d (partition 2 (range 10))) + [0 3] + [:a :b :c :d :e] + nil + [{0 0 1 1} {0 6 1 7}] + + (to-array-2d (partition 2 (range 10))) + [0 3] + nil + :loc + [{0 0 1 1} {0 6 1 7}] + + (to-array-2d (partition 2 (range 10))) + [:a :d] + [:a :b :c :d :e] + :loc + [{0 0 1 1} {0 6 1 7}] + + (to-array-2d (partition 2 (range 10))) + (u/slice 3) + nil + nil + [{0 0 1 1} {0 2 1 3} {0 4 1 5}] + + (to-array-2d (partition 4 (range 20))) + [(u/slice 2) (u/slice 1)] + nil + :loc + [{0 0 1 1} {0 4 1 5} {0 8 1 9}])) + +(deftest set-index + (are [idx m oid ov] + (and (= (vec + (g/index + (g/set-index + (g/data-frame [{:a 1 :b 2 :c 3} {:a 2 :b 3 :c 4}]) + idx m))) + oid) + (= (u/->clj + (g/set-index + (g/data-frame [{:a 1 :b 2 :c 3} {:a 2 :b 3 :c 4}]) + idx m)) + ov)) + [:a] {} [1 2] [{:b 2 :c 3} {:b 3 :c 4}] + [:a :b] {} [[1 2] [2 3]] [{:c 3} {:c 4}] + [:a] {:drop false} [1 2] [{:a 1 :b 2 :c 3} {:a 2 :b 3 :c 4}] + [:a] {:append true} [[0 1] [1 2]] [{:b 2 :c 3} {:b 3 :c 4}])) diff --git a/test/panthera/utils_test.clj b/test/panthera/utils_test.clj index f41ec68..f016a87 100644 --- a/test/panthera/utils_test.clj +++ b/test/panthera/utils_test.clj @@ -1,72 +1,72 @@ -(ns panthera.utils-test - (:require - [clojure.test :refer :all] - [panthera.config :refer [start-python!]] - [libpython-clj.python :as py] - [panthera.pandas.utils :as u])) - -(use-fixtures :once start-python!) 
- -(deftest pytype - (are [t d] - (identical? t (u/pytype d)) - :list (py/->py-list []) - :list (py/->py-list [-1]) - :list (py/->py-list [1 2 3]) - :list (py/->py-list [[1 2] [3 4]]) - :tuple (py/->py-tuple []) - :tuple (py/->py-tuple [0]) - :tuple (py/->py-tuple [1 2 3]) - :tuple (py/->py-tuple [[1 2] [3 4]]) - :dict (py/->py-dict {}) - :dict (py/->py-dict {:a 1 :b "2" :c [1 2 3]}) - :dict (py/->py-dict {"a" 1}))) - -(deftest slice - (are [d] - (identical? :slice (u/pytype (apply u/slice d))) - [] - [nil] - [1] - [1 2] - [1 2 3] - [3 7 2]) - (are [s res] - (= (py/->jvm - (py/get-item - (py/->py-list (range 4)) s)) res) - (u/slice) (vec (range 4)) - (u/slice 2) [0 1] - (u/slice 1 3) [1 2] - (u/slice -1) [0 1 2] - (u/slice 0 5 2) [0 2])) - -(deftest keys->pyargs - (are [i o] - (= (u/keys->pyargs i) o) - {} {} - {:a 1} {"a" 1} - {:a 1 :b 2} {"a" 1 "b" 2} - {:a-k 1} {"a_k" 1})) - -(deftest memo-columns-converter - (are [i o] - (= (u/memo-columns-converter i) o) - 1 1 - nil nil - "a" :a - "col_1" :col-1 - ["multi" "col"] [:multi :col] - "ALL_CAPS" :ALL-CAPS - "WeIrD_caPs" :WeIrD-caPs)) - -(deftest ->clj - (is (= (u/->clj - (py/call-attr u/pd "DataFrame" [{:a 1 :b 2} {:a 3 :b 4}])) - [{:a 1 :b 2} {:a 3 :b 4}])) - (is (= (u/->clj - (py/call-attr u/pd "Series" [1 2 3])) - [{:unnamed 1} {:unnamed 2} {:unnamed 3}])) - (is (= (u/->clj - (py/call-attr-kw u/pd "Series" [[1 2 3]] {"name" "test"})) - [{:test 1} {:test 2} {:test 3}]))) +(ns panthera.utils-test + (:require + [clojure.test :refer :all] + [panthera.config :refer [start-python!]] + [libpython-clj.python :as py] + [panthera.pandas.utils :as u])) + +(use-fixtures :once start-python!) + +(deftest pytype + (are [t d] + (identical? 
t (u/pytype d)) + :list (py/->py-list []) + :list (py/->py-list [-1]) + :list (py/->py-list [1 2 3]) + :list (py/->py-list [[1 2] [3 4]]) + :tuple (py/->py-tuple []) + :tuple (py/->py-tuple [0]) + :tuple (py/->py-tuple [1 2 3]) + :tuple (py/->py-tuple [[1 2] [3 4]]) + :dict (py/->py-dict {}) + :dict (py/->py-dict {:a 1 :b "2" :c [1 2 3]}) + :dict (py/->py-dict {"a" 1}))) + +(deftest slice + (are [d] + (identical? :slice (u/pytype (apply u/slice d))) + [] + [nil] + [1] + [1 2] + [1 2 3] + [3 7 2]) + (are [s res] + (= (py/->jvm + (py/get-item + (py/->py-list (range 4)) s)) res) + (u/slice) (vec (range 4)) + (u/slice 2) [0 1] + (u/slice 1 3) [1 2] + (u/slice -1) [0 1 2] + (u/slice 0 5 2) [0 2])) + +(deftest keys->pyargs + (are [i o] + (= (u/keys->pyargs i) o) + {} {} + {:a 1} {"a" 1} + {:a 1 :b 2} {"a" 1 "b" 2} + {:a-k 1} {"a_k" 1})) + +(deftest memo-columns-converter + (are [i o] + (= (u/memo-columns-converter i) o) + 1 1 + nil nil + "a" :a + "col_1" :col-1 + ["multi" "col"] [:multi :col] + "ALL_CAPS" :ALL-CAPS + "WeIrD_caPs" :WeIrD-caPs)) + +(deftest ->clj + (is (= (u/->clj + (py/call-attr u/pd "DataFrame" [{:a 1 :b 2} {:a 3 :b 4}])) + [{:a 1 :b 2} {:a 3 :b 4}])) + (is (= (u/->clj + (py/call-attr u/pd "Series" [1 2 3])) + [{:unnamed 1} {:unnamed 2} {:unnamed 3}])) + (is (= (u/->clj + (py/call-attr-kw u/pd "Series" [[1 2 3]] {"name" "test"})) + [{:test 1} {:test 2} {:test 3}])))