From d6c17eaa56ab99ec6b178f8e35b54ba33b5f8e20 Mon Sep 17 00:00:00 2001 From: zhangyi Date: Mon, 27 May 2019 10:48:04 +0800 Subject: [PATCH] add class --- .../data_analysis/pandas_count.ipynb | 1135 +++++++++++++++++ 1 file changed, 1135 insertions(+) create mode 100644 machine_learning_diary/data_analysis/pandas_count.ipynb diff --git a/machine_learning_diary/data_analysis/pandas_count.ipynb b/machine_learning_diary/data_analysis/pandas_count.ipynb new file mode 100644 index 0000000..2cc41ee --- /dev/null +++ b/machine_learning_diary/data_analysis/pandas_count.ipynb @@ -0,0 +1,1135 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pandas - 汇总和频数统计\n", + "\n", + "单变量频数统计&多变量分组统计中的相关方法介绍。\n", + "\n", + "## 1. count&unique&nunique" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
x1x2x3
0a14
1b23
2c32
3b41
\n", + "
" + ], + "text/plain": [ + " x1 x2 x3\n", + "0 a 1 4\n", + "1 b 2 3\n", + "2 c 3 2\n", + "3 b 4 1" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "test_data = pd.DataFrame({\n", + " 'x1': [\"a\", \"b\", \"c\", \"b\"],\n", + " \"x2\": [1, 2, 3, 4],\n", + " \"x3\": [4, 3, 2, 1]\n", + "})\n", + "test_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.1 统计个数count" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_data['x1'].count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.2 统计不重复值个数nunique" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_data['x1'].nunique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.3 筛选不重复值" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['a', 'b', 'c'], dtype=object)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_data['x1'].unique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.4 统计某一个值的频数\n", + "\n", + "不同于列表,可以直接统计某个值出现的次数,DataFrame需要做一些转换。" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2\n", + "2\n", + "2\n" + ] + } + ], + "source": [ + "print(list(test_data['x1']).count('b'))\n", + "\n", + "print(sum(test_data['x1'].apply(lambda x: 1 if 
x=='b' else 0)))\n", + "\n", + "print(test_data['x1'].apply(lambda x: 1 if x=='b' else 0).sum())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 分组统计 - groupby\n", + "\n", + "groupby有一点奇葩:分组之后,分组用的label默认都会变成索引(行名),可以设置as_index=False(默认为True),让label在聚合结果中保留为普通列。" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
x1x2x3
0a11
1a12
2b13
3b24
4c25
\n", + "
" + ], + "text/plain": [ + " x1 x2 x3\n", + "0 a 1 1\n", + "1 a 1 2\n", + "2 b 1 3\n", + "3 b 2 4\n", + "4 c 2 5" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x = pd.DataFrame({\n", + " \"x1\": [\"a\", \"a\", \"b\", \"b\", 'c'],\n", + " \"x2\": [1, 1, 1, 2, 2],\n", + " \"x3\": [1, 2, 3, 4, 5]\n", + "})\n", + "\n", + "x" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.1 分组统计count(*)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
x2x3
x1
a22
b22
c11
\n", + "
" + ], + "text/plain": [ + " x2 x3\n", + "x1 \n", + "a 2 2\n", + "b 2 2\n", + "c 1 1" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 分组统计各个列的个数\n", + "x.groupby(by='x1').count()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
x1x2x3
0a12
1b11
2b21
3c21
\n", + "
" + ], + "text/plain": [ + " x1 x2 x3\n", + "0 a 1 2\n", + "1 b 1 1\n", + "2 b 2 1\n", + "3 c 2 1" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.groupby(by=['x1', 'x2'], as_index=False).count()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "x1\n", + "a 2\n", + "b 2\n", + "c 1\n", + "dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 这里没有分各个列。\n", + "x.groupby(by='x1').size()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2 分组统计count(distinct col1)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
x1x2x3
x1
a112
b122
c111
\n", + "
" + ], + "text/plain": [ + " x1 x2 x3\n", + "x1 \n", + "a 1 1 2\n", + "b 1 2 2\n", + "c 1 1 1" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 类似于sql:select x1,count(distinct x1),count(distinct x2),count(distinct x3) from table group by x1\n", + "x.groupby(by='x1').nunique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.4 其余统计函数" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
x3
x1x2
a11.5
b13.0
24.0
c25.0
\n", + "
" + ], + "text/plain": [ + " x3\n", + "x1 x2 \n", + "a 1 1.5\n", + "b 1 3.0\n", + " 2 4.0\n", + "c 2 5.0" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.groupby(by=[\"x1\",'x2']).mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
x3
x1x2
a13
b13
24
c25
\n", + "
" + ], + "text/plain": [ + " x3\n", + "x1 x2 \n", + "a 1 3\n", + "b 1 3\n", + " 2 4\n", + "c 2 5" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.groupby(by=[\"x1\",'x2']).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
x1x2x3
0a13
1b13
2b24
3c25
\n", + "
" + ], + "text/plain": [ + " x1 x2 x3\n", + "0 a 1 3\n", + "1 b 1 3\n", + "2 b 2 4\n", + "3 c 2 5" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.groupby(by=[\"x1\",'x2'], as_index=False).aggregate(sum)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.5 整体的描述统计" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
x3
countmeanstdmin25%50%75%max
x1x2
a12.01.50.7071071.01.251.51.752.0
b11.03.0NaN3.03.003.03.003.0
21.04.0NaN4.04.004.04.004.0
c21.05.0NaN5.05.005.05.005.0
\n", + "
" + ], + "text/plain": [ + " x3 \n", + " count mean std min 25% 50% 75% max\n", + "x1 x2 \n", + "a 1 2.0 1.5 0.707107 1.0 1.25 1.5 1.75 2.0\n", + "b 1 1.0 3.0 NaN 3.0 3.00 3.0 3.00 3.0\n", + " 2 1.0 4.0 NaN 4.0 4.00 4.0 4.00 4.0\n", + "c 2 1.0 5.0 NaN 5.0 5.00 5.0 5.00 5.0" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.groupby(by=[\"x1\",'x2'], as_index=True).describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
x2x3
countmeanstdmin25%50%75%maxcountmeanstdmin25%50%75%max
02.01.00.01.01.01.01.01.02.01.50.7071071.01.251.51.752.0
11.01.0NaN1.01.01.01.01.01.03.0NaN3.03.003.03.003.0
21.02.0NaN2.02.02.02.02.01.04.0NaN4.04.004.04.004.0
31.02.0NaN2.02.02.02.02.01.05.0NaN5.05.005.05.005.0
\n", + "
" + ], + "text/plain": [ + " x2 x3 \\\n", + " count mean std min 25% 50% 75% max count mean std min 25% \n", + "0 2.0 1.0 0.0 1.0 1.0 1.0 1.0 1.0 2.0 1.5 0.707107 1.0 1.25 \n", + "1 1.0 1.0 NaN 1.0 1.0 1.0 1.0 1.0 1.0 3.0 NaN 3.0 3.00 \n", + "2 1.0 2.0 NaN 2.0 2.0 2.0 2.0 2.0 1.0 4.0 NaN 4.0 4.00 \n", + "3 1.0 2.0 NaN 2.0 2.0 2.0 2.0 2.0 1.0 5.0 NaN 5.0 5.00 \n", + "\n", + " \n", + " 50% 75% max \n", + "0 1.5 1.75 2.0 \n", + "1 3.0 3.00 3.0 \n", + "2 4.0 4.00 4.0 \n", + "3 5.0 5.00 5.0 " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.groupby(by=[\"x1\",'x2'], as_index=False).describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}