|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "metadata": {}, |
| 6 | + "source": [ |
| 7 | + "#### Importing Data\n", |
| 8 | + "- pd.read_csv(filename) | From a CSV file\n", |
| 9 | + "- pd.read_table(filename) | From a delimited text file (like TSV)\n", |
| 10 | + "- pd.read_excel(filename) | From an Excel file\n", |
| 11 | + "- pd.read_sql(query, connection_object) | Read from a SQL table/database\n", |
| 12 | + "- pd.read_json(json_string) | Read from a JSON formatted string, URL or file.\n", |
| 13 | + "- pd.read_html(url) | Parses an html URL, string or file and extracts tables to a list of dataframes\n", |
| 14 | + "- pd.read_clipboard() | Takes the contents of your clipboard and passes it to read_table()\n", |
| 15 | + "- pd.DataFrame(dict) | From a dict, keys for column names, values for data as lists" |
| 16 | + ] |
| 17 | + }, |
| 18 | + { |
| 19 | + "cell_type": "markdown", |
| 20 | + "metadata": {}, |
| 21 | + "source": [ |
| 22 | + "#### Exporting Data\n", |
| 23 | + "- df.to_csv(filename) | Write to a CSV file\n", |
| 24 | + "- df.to_excel(filename) | Write to an Excel file\n", |
| 25 | + "- df.to_sql(table_name, connection_object) | Write to a SQL table\n", |
| 26 | + "- df.to_json(filename) | Write to a file in JSON format" |
| 27 | + ] |
| 28 | + }, |
| 29 | + { |
| 30 | + "cell_type": "markdown", |
| 31 | + "metadata": {}, |
| 32 | + "source": [ |
| 33 | + "#### Create Test Objects\n", |
| 34 | + "Useful for testing code segments\n", |
| 35 | + "\n", |
| 36 | + "- pd.DataFrame(np.random.rand(20,5)) | 5 columns and 20 rows of random floats\n", |
| 37 | + "- pd.Series(my_list) | Create a series from an iterable my_list\n", |
| 38 | + "- df.index = pd.date_range('1900/1/30', periods=df.shape[0]) | Add a date index" |
| 39 | + ] |
| 40 | + }, |
| 41 | + { |
| 42 | + "cell_type": "markdown", |
| 43 | + "metadata": {}, |
| 44 | + "source": [ |
| 45 | + "#### Viewing/Inspecting Data\n", |
| 46 | + "- df.head(n) | First n rows of the DataFrame\n", |
| 47 | + "- df.tail(n) | Last n rows of the DataFrame\n", |
| 48 | + "- df.shape | Number of rows and columns\n", |
| 49 | + "- df.info() | Index, Datatype and Memory information\n", |
| 50 | + "- df.describe() | Summary statistics for numerical columns\n", |
| 51 | + "- s.value_counts(dropna=False) | View unique values and counts\n", |
| 52 | + "- df.apply(pd.Series.value_counts) | Unique values and counts for all columns" |
| 53 | + ] |
| 54 | + }, |
| 55 | + { |
| 56 | + "cell_type": "markdown", |
| 57 | + "metadata": {}, |
| 58 | + "source": [ |
| 59 | + "#### Selection\n", |
| 60 | + "- df[col] | Returns column with label col as Series\n", |
| 61 | + "- df[[col1, col2]] | Returns columns as a new DataFrame\n", |
| 62 | + "- s.iloc[0] | Selection by position\n", |
| 63 | + "- s.loc['index_one'] | Selection by index\n", |
| 64 | + "- df.iloc[0,:] | First row\n", |
| 65 | + "- df.iloc[0,0] | First element of first column\n", |
| 66 | + "- df.iat[0,0] | Select a single value by row & column position\n", |
| 67 | + "- df.at[0, 'Country'] | Select a single value by row & column label\n", |
| 68 | + "- df.ix[2] | Select single row in subset of rows (note: .ix is deprecated since pandas 0.20 and removed in 1.0; prefer .loc/.iloc)\n", |
| 69 | + "- df.ix[:,'Capital'] | Select a single column of subset of columns\n", |
| 70 | + "- df.ix[1,'Capital'] | Select rows and columns\n", |
| 71 | + "- s[~(s > 1)] | Series s where value is not >1\n", |
| 72 | + "- s[(s < -1) | (s > 2)] | s where value is <-1 or >2\n", |
| 73 | + "- df[df['Population']>1200000000] | Use filter to adjust DataFrame\n", |
| 74 | + "- s['a'] = 6 | set index a of Series s to 6" |
| 75 | + ] |
| 76 | + }, |
| 77 | + { |
| 78 | + "cell_type": "markdown", |
| 79 | + "metadata": {}, |
| 80 | + "source": [ |
| 81 | + "#### Data Cleaning\n", |
| 82 | + "- df.columns = ['a','b','c'] | Rename columns\n", |
| 83 | + "- pd.isnull() | Checks for null Values, Returns Boolean Array\n", |
| 84 | + "- pd.notnull() | Opposite of pd.isnull()\n", |
| 85 | + "- df.dropna() | Drop all rows that contain null values\n", |
| 86 | + "- df.dropna(axis=1) | Drop all columns that contain null values\n", |
| 87 | + "- df.dropna(axis=1,thresh=n) | Drop all columns that have fewer than n non-null values\n", |
| 88 | + "- df.fillna(x) | Replace all null values with x\n", |
| 89 | + "- s.fillna(s.mean()) | Replace all null values with the mean (mean can be replaced with almost any function from the statistics module)\n", |
| 90 | + "- s.astype(float) | Convert the datatype of the series to float\n", |
| 91 | + "- s.replace(1,'one') | Replace all values equal to 1 with 'one'\n", |
| 92 | + "- s.replace([1,3],['one','three']) | Replace all 1 with 'one' and 3 with 'three'\n", |
| 93 | + "- df.rename(columns=lambda x: x + 1) | Mass renaming of columns\n", |
| 94 | + "- df.rename(columns={'old_name': 'new_name'}) | Selective renaming\n", |
| 95 | + "- df.set_index('column_one') | Change the index\n", |
| 96 | + "- df.rename(index=lambda x: x + 1) | Mass renaming of index\n", |
| 97 | + "- df.drop(['a', 'c']) | Drop the rows labelled 'a' and 'c' (axis=0)\n", |
| 98 | + "- df.drop('Col_name', axis=1) | Drop the column 'Col_name' (axis=1)" |
| 99 | + ] |
| 100 | + }, |
| 101 | + { |
| 102 | + "cell_type": "markdown", |
| 103 | + "metadata": {}, |
| 104 | + "source": [ |
| 105 | + "#### Filter, Sort, and Groupby\n", |
| 106 | + "- df[df[col] > 0.5] | Rows where the column col is greater than 0.5\n", |
| 107 | + "- df[(df[col] > 0.5) & (df[col] < 0.7)] | Rows where 0.7 > col > 0.5\n", |
| 108 | + "- df.sort_values(col1) | Sort values by col1 in ascending order\n", |
| 109 | + "- df.sort_values(col2,ascending=False) | Sort values by col2 in descending order\n", |
| 110 | + "- df.sort_values([col1,col2],ascending=[True,False]) | Sort values by col1 in ascending order then col2 in descending order\n", |
| 111 | + "- df.sort_index() | Sort by labels along an axis\n", |
| 112 | + "- df.groupby(col) | Returns a groupby object for values from one column\n", |
| 113 | + "- df.groupby([col1,col2]) | Returns groupby object for values from multiple columns\n", |
| 114 | + "- df.groupby(col1)[col2].mean() | Returns the mean of the values in col2, grouped by the values in col1 (mean can be replaced with almost any function from the statistics module)\n", |
| 115 | + "- df.pivot_table(index=col1,values=[col2,col3],aggfunc='mean') | Create a pivot table that groups by col1 and calculates the mean of col2 and col3\n", |
| 116 | + "- df.groupby(col1).agg(np.mean) | Find the average across all columns for every unique col1 group\n", |
| 117 | + "- df.apply(np.mean) | Apply the function np.mean() across each column\n", |
| 118 | + "- df.applymap() | Apply function element-wise\n", |
| 119 | + "- df.rank() | assign ranks to entries" |
| 120 | + ] |
| 121 | + }, |
| 122 | + { |
| 123 | + "cell_type": "markdown", |
| 124 | + "metadata": {}, |
| 125 | + "source": [ |
| 126 | + "#### Join/Combine\n", |
| 127 | + "- df1.append(df2) | Add the rows in df2 to the end of df1 (columns should be identical)\n", |
| 128 | + "- pd.concat([df1, df2],axis=1) | Add the columns in df2 to the end of df1 (rows should be identical)\n", |
| 129 | + "- df1.join(df2,on=col1,how='inner') | SQL-style join the columns in df1 with the columns on df2 where the rows for col1 have identical values. 'how' can be one of 'left', 'right', 'outer', 'inner'" |
| 130 | + ] |
| 131 | + }, |
| 132 | + { |
| 133 | + "cell_type": "markdown", |
| 134 | + "metadata": {}, |
| 135 | + "source": [ |
| 136 | + "#### Statistics\n", |
| 137 | + "These can all be applied to a series as well.\n", |
| 138 | + "\n", |
| 139 | + "- df.describe() | Summary statistics for numerical columns\n", |
| 140 | + "- df.mean() | Returns the mean of all columns\n", |
| 141 | + "- df.corr() | Returns the correlation between columns in a DataFrame\n", |
| 142 | + "- df.count() | Returns the number of non-null values in each DataFrame column\n", |
| 143 | + "- df.max() | Returns the highest value in each column\n", |
| 144 | + "- df.min() | Returns the lowest value in each column\n", |
| 145 | + "- df.median() | Returns the median of each column\n", |
| 146 | + "- df.std() | Returns the standard deviation of each column" |
| 147 | + ] |
| 148 | + }, |
| 149 | + { |
| 150 | + "cell_type": "markdown", |
| 151 | + "metadata": {}, |
| 152 | + "source": [ |
| 153 | + "#### Datetime\n", |
| 154 | + "\n", |
| 155 | + "- pd.to_datetime - converts a date to a datetime object\n", |
| 156 | + "- pd.to_local - " |
| 157 | + ] |
| 158 | + }, |
| 159 | + { |
| 160 | + "cell_type": "markdown", |
| 161 | + "metadata": {}, |
| 162 | + "source": [] |
| 163 | + }, |
| 164 | + { |
| 165 | + "cell_type": "code", |
| 166 | + "execution_count": null, |
| 167 | + "metadata": {}, |
| 168 | + "outputs": [], |
| 169 | + "source": [] |
| 170 | + } |
| 171 | + ], |
| 172 | + "metadata": { |
| 173 | + "kernelspec": { |
| 174 | + "display_name": "Python 3", |
| 175 | + "language": "python", |
| 176 | + "name": "python3" |
| 177 | + }, |
| 178 | + "language_info": { |
| 179 | + "codemirror_mode": { |
| 180 | + "name": "ipython", |
| 181 | + "version": 3 |
| 182 | + }, |
| 183 | + "file_extension": ".py", |
| 184 | + "mimetype": "text/x-python", |
| 185 | + "name": "python", |
| 186 | + "nbconvert_exporter": "python", |
| 187 | + "pygments_lexer": "ipython3", |
| 188 | + "version": "3.6.9" |
| 189 | + }, |
| 190 | + "toc": { |
| 191 | + "base_numbering": 1, |
| 192 | + "nav_menu": {}, |
| 193 | + "number_sections": true, |
| 194 | + "sideBar": true, |
| 195 | + "skip_h1_title": false, |
| 196 | + "title_cell": "Table of Contents", |
| 197 | + "title_sidebar": "Contents", |
| 198 | + "toc_cell": false, |
| 199 | + "toc_position": {}, |
| 200 | + "toc_section_display": true, |
| 201 | + "toc_window_display": true |
| 202 | + } |
| 203 | + }, |
| 204 | + "nbformat": 4, |
| 205 | + "nbformat_minor": 2 |
| 206 | +} |
0 commit comments