Skip to content

Commit f4ba3aa

Browse files
committed
Add quick comparison of pandas groupby, pivot_table and crosstab functions
1 parent 152979e commit f4ba3aa

File tree

1 file changed

+217
-0
lines changed

1 file changed

+217
-0
lines changed

pandas-groupby-pivot-crosstab.ipynb

Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 62,
6+
"metadata": {},
7+
"outputs": [
8+
{
9+
"data": {
10+
"text/html": [
11+
"<div>\n",
12+
"<style scoped>\n",
13+
" .dataframe tbody tr th:only-of-type {\n",
14+
" vertical-align: middle;\n",
15+
" }\n",
16+
"\n",
17+
" .dataframe tbody tr th {\n",
18+
" vertical-align: top;\n",
19+
" }\n",
20+
"\n",
21+
" .dataframe thead th {\n",
22+
" text-align: right;\n",
23+
" }\n",
24+
"</style>\n",
25+
"<table border=\"1\" class=\"dataframe\">\n",
26+
" <thead>\n",
27+
" <tr style=\"text-align: right;\">\n",
28+
" <th></th>\n",
29+
" <th>A</th>\n",
30+
" <th>B</th>\n",
31+
" <th>C</th>\n",
32+
" <th>D</th>\n",
33+
" <th>E</th>\n",
34+
" <th>F</th>\n",
35+
" <th>G</th>\n",
36+
" <th>H</th>\n",
37+
" <th>I</th>\n",
38+
" <th>J</th>\n",
39+
" </tr>\n",
40+
" </thead>\n",
41+
" <tbody>\n",
42+
" <tr>\n",
43+
" <th>0</th>\n",
44+
" <td>52</td>\n",
45+
" <td>93</td>\n",
46+
" <td>15</td>\n",
47+
" <td>72</td>\n",
48+
" <td>61</td>\n",
49+
" <td>21</td>\n",
50+
" <td>83</td>\n",
51+
" <td>87</td>\n",
52+
" <td>75</td>\n",
53+
" <td>75</td>\n",
54+
" </tr>\n",
55+
" <tr>\n",
56+
" <th>1</th>\n",
57+
" <td>88</td>\n",
58+
" <td>24</td>\n",
59+
" <td>3</td>\n",
60+
" <td>22</td>\n",
61+
" <td>53</td>\n",
62+
" <td>2</td>\n",
63+
" <td>88</td>\n",
64+
" <td>30</td>\n",
65+
" <td>38</td>\n",
66+
" <td>2</td>\n",
67+
" </tr>\n",
68+
" <tr>\n",
69+
" <th>2</th>\n",
70+
" <td>64</td>\n",
71+
" <td>60</td>\n",
72+
" <td>21</td>\n",
73+
" <td>33</td>\n",
74+
" <td>76</td>\n",
75+
" <td>58</td>\n",
76+
" <td>22</td>\n",
77+
" <td>89</td>\n",
78+
" <td>49</td>\n",
79+
" <td>91</td>\n",
80+
" </tr>\n",
81+
" <tr>\n",
82+
" <th>3</th>\n",
83+
" <td>59</td>\n",
84+
" <td>42</td>\n",
85+
" <td>92</td>\n",
86+
" <td>60</td>\n",
87+
" <td>80</td>\n",
88+
" <td>15</td>\n",
89+
" <td>62</td>\n",
90+
" <td>62</td>\n",
91+
" <td>47</td>\n",
92+
" <td>62</td>\n",
93+
" </tr>\n",
94+
" <tr>\n",
95+
" <th>4</th>\n",
96+
" <td>51</td>\n",
97+
" <td>55</td>\n",
98+
" <td>64</td>\n",
99+
" <td>3</td>\n",
100+
" <td>51</td>\n",
101+
" <td>7</td>\n",
102+
" <td>21</td>\n",
103+
" <td>73</td>\n",
104+
" <td>39</td>\n",
105+
" <td>18</td>\n",
106+
" </tr>\n",
107+
" </tbody>\n",
108+
"</table>\n",
109+
"</div>"
110+
],
111+
"text/plain": [
112+
" A B C D E F G H I J\n",
113+
"0 52 93 15 72 61 21 83 87 75 75\n",
114+
"1 88 24 3 22 53 2 88 30 38 2\n",
115+
"2 64 60 21 33 76 58 22 89 49 91\n",
116+
"3 59 42 92 60 80 15 62 62 47 62\n",
117+
"4 51 55 64 3 51 7 21 73 39 18"
118+
]
119+
},
120+
"execution_count": 62,
121+
"metadata": {},
122+
"output_type": "execute_result"
123+
}
124+
],
125+
"source": [
126+
"from string import ascii_uppercase\n",
127+
"\n",
128+
"import numpy as np\n",
129+
"import pandas as pd\n",
130+
"\n",
131+
"cols = list(ascii_uppercase[:10])\n",
132+
"np.random.seed(42)\n",
133+
"data = np.random.randint(1, 100, size=(100_000, 10))\n",
134+
"df = pd.DataFrame(data, columns=cols)\n",
135+
"df.head()"
136+
]
137+
},
138+
{
139+
"cell_type": "code",
140+
"execution_count": 63,
141+
"metadata": {},
142+
"outputs": [
143+
{
144+
"name": "stdout",
145+
"output_type": "stream",
146+
"text": [
147+
"8.99 ms ± 91.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
148+
]
149+
}
150+
],
151+
"source": [
152+
"%%timeit\n",
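153+
"# groupby + unstack: counts come back as floats whenever unstack() inserts NaN for an absent (A, B) pair; fillna(0) then zeroes the gaps but keeps the float dtype\n",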
154+
"df.groupby(['A', 'B'])['C'].count().unstack().fillna(0)"
155+
]
156+
},
157+
{
158+
"cell_type": "code",
159+
"execution_count": 64,
160+
"metadata": {},
161+
"outputs": [
162+
{
163+
"name": "stdout",
164+
"output_type": "stream",
165+
"text": [
166+
"35.7 ms ± 265 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
167+
]
168+
}
169+
],
170+
"source": [
171+
"%%timeit\n",
172+
"# pivot_table (aggfunc='count', fill_value=0): counts come back as integers\n",
173+
"df.pivot_table(values='C', index='A', columns='B', aggfunc='count', fill_value=0)"
174+
]
175+
},
176+
{
177+
"cell_type": "code",
178+
"execution_count": 65,
179+
"metadata": {},
180+
"outputs": [
181+
{
182+
"name": "stdout",
183+
"output_type": "stream",
184+
"text": [
185+
"79.2 ms ± 423 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
186+
]
187+
}
188+
],
189+
"source": [
190+
"%%timeit\n",
191+
"# crosstab builds the frequency table of A against B directly; counts are integers\n",
192+
"pd.crosstab(df.A, df.B)"
193+
]
194+
}
195+
],
196+
"metadata": {
197+
"kernelspec": {
198+
"display_name": "Python 3",
199+
"language": "python",
200+
"name": "python3"
201+
},
202+
"language_info": {
203+
"codemirror_mode": {
204+
"name": "ipython",
205+
"version": 3
206+
},
207+
"file_extension": ".py",
208+
"mimetype": "text/x-python",
209+
"name": "python",
210+
"nbconvert_exporter": "python",
211+
"pygments_lexer": "ipython3",
212+
"version": "3.6.7"
213+
}
214+
},
215+
"nbformat": 4,
216+
"nbformat_minor": 4
217+
}

0 commit comments

Comments
 (0)