|
5 | 5 | "colab": { |
6 | 6 | "name": "01_intro_dask.ipynb", |
7 | 7 | "provenance": [], |
8 | | - "authorship_tag": "ABX9TyOkip/jPKANIdmhdkT7IEkI" |
| 8 | + "authorship_tag": "ABX9TyOkip/jPKANIdmhdkT7IEkI", |
| 9 | + "include_colab_link": true |
9 | 10 | }, |
10 | 11 | "kernelspec": { |
11 | 12 | "name": "python3", |
12 | 13 | "display_name": "Python 3" |
13 | 14 | } |
14 | 15 | }, |
15 | 16 | "cells": [ |
| 17 | + { |
| 18 | + "cell_type": "markdown", |
| 19 | + "metadata": { |
| 20 | + "id": "view-in-github", |
| 21 | + "colab_type": "text" |
| 22 | + }, |
| 23 | + "source": [ |
| 24 | + "<a href=\"https://colab.research.google.com/github/gumdropsteve/intro_to_python/blob/main/day_15/01_intro_dask.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" |
| 25 | + ] |
| 26 | + }, |
16 | 27 | { |
17 | 28 | "cell_type": "markdown", |
18 | 29 | "metadata": { |
|
46 | 57 | "\r\n", |
47 | 58 | "df" |
48 | 59 | ], |
49 | | - "execution_count": 1, |
| 60 | + "execution_count": null, |
50 | 61 | "outputs": [ |
51 | 62 | { |
52 | 63 | "output_type": "execute_result", |
|
110 | 121 | "source": [ |
111 | 122 | "df.to_csv('small.csv', index=False)" |
112 | 123 | ], |
113 | | - "execution_count": 2, |
| 124 | + "execution_count": null, |
114 | 125 | "outputs": [] |
115 | 126 | }, |
116 | 127 | { |
|
125 | 136 | "source": [ |
126 | 137 | "!python -m pip install \"dask[dataframe]\"" |
127 | 138 | ], |
128 | | - "execution_count": 4, |
| 139 | + "execution_count": null, |
129 | 140 | "outputs": [ |
130 | 141 | { |
131 | 142 | "output_type": "stream", |
|
166 | 177 | "\r\n", |
167 | 178 | "dd.read_csv('small.csv')" |
168 | 179 | ], |
169 | | - "execution_count": 5, |
| 180 | + "execution_count": null, |
170 | 181 | "outputs": [ |
171 | 182 | { |
172 | 183 | "output_type": "execute_result", |
|
254 | 265 | "source": [ |
255 | 266 | "dd.read_csv('small.csv').compute()" |
256 | 267 | ], |
257 | | - "execution_count": 6, |
| 268 | + "execution_count": null, |
258 | 269 | "outputs": [ |
259 | 270 | { |
260 | 271 | "output_type": "execute_result", |
|
322 | 333 | "source": [ |
323 | 334 | "type(dd.read_csv('small.csv').compute())" |
324 | 335 | ], |
325 | | - "execution_count": 7, |
| 336 | + "execution_count": null, |
326 | 337 | "outputs": [ |
327 | 338 | { |
328 | 339 | "output_type": "execute_result", |
|
353 | 364 | "\r\n", |
354 | 365 | "df.describe()" |
355 | 366 | ], |
356 | | - "execution_count": 12, |
| 367 | + "execution_count": null, |
357 | 368 | "outputs": [ |
358 | 369 | { |
359 | 370 | "output_type": "execute_result", |
|
432 | 443 | "source": [ |
433 | 444 | "df.describe().compute()" |
434 | 445 | ], |
435 | | - "execution_count": 13, |
| 446 | + "execution_count": null, |
436 | 447 | "outputs": [ |
437 | 448 | { |
438 | 449 | "output_type": "execute_result", |
|
551 | 562 | "graph = df.__dask_graph__()\r\n", |
552 | 563 | "graph.layers" |
553 | 564 | ], |
554 | | - "execution_count": 18, |
| 565 | + "execution_count": null, |
555 | 566 | "outputs": [ |
556 | 567 | { |
557 | 568 | "output_type": "execute_result", |
|
614 | 625 | "# dataframe as is now\r\n", |
615 | 626 | "df" |
616 | 627 | ], |
617 | | - "execution_count": 19, |
| 628 | + "execution_count": null, |
618 | 629 | "outputs": [ |
619 | 630 | { |
620 | 631 | "output_type": "execute_result", |
|
694 | 705 | "# just reading\r\n", |
695 | 706 | "dd.read_csv('small.csv')" |
696 | 707 | ], |
697 | | - "execution_count": 20, |
| 708 | + "execution_count": null, |
698 | 709 | "outputs": [ |
699 | 710 | { |
700 | 711 | "output_type": "execute_result", |
|
782 | 793 | "source": [ |
783 | 794 | "df.compute()" |
784 | 795 | ], |
785 | | - "execution_count": 21, |
| 796 | + "execution_count": null, |
786 | 797 | "outputs": [ |
787 | 798 | { |
788 | 799 | "output_type": "execute_result", |
|
862 | 873 | "\r\n", |
863 | 874 | "!wget \"https://github.com/gumdropsteve/datasets/raw/master/airlines.parquet\"" |
864 | 875 | ], |
865 | | - "execution_count": 27, |
| 876 | + "execution_count": null, |
866 | 877 | "outputs": [ |
867 | 878 | { |
868 | 879 | "output_type": "stream", |
|
902 | 913 | "%%time\r\n", |
903 | 914 | "pd.read_parquet('airlines.parquet')" |
904 | 915 | ], |
905 | | - "execution_count": 28, |
| 916 | + "execution_count": null, |
906 | 917 | "outputs": [ |
907 | 918 | { |
908 | 919 | "output_type": "stream", |
|
1181 | 1192 | "%%time\r\n", |
1182 | 1193 | "dd.read_parquet('airlines.parquet')" |
1183 | 1194 | ], |
1184 | | - "execution_count": 29, |
| 1195 | + "execution_count": null, |
1185 | 1196 | "outputs": [ |
1186 | 1197 | { |
1187 | 1198 | "output_type": "stream", |
|
1317 | 1328 | "%%time\r\n", |
1318 | 1329 | "dd.read_parquet('airlines.parquet').compute()" |
1319 | 1330 | ], |
1320 | | - "execution_count": 33, |
| 1331 | + "execution_count": null, |
1321 | 1332 | "outputs": [ |
1322 | 1333 | { |
1323 | 1334 | "output_type": "stream", |
|
1599 | 1610 | "\r\n", |
1600 | 1611 | "df.compute()" |
1601 | 1612 | ], |
1602 | | - "execution_count": 45, |
| 1613 | + "execution_count": null, |
1603 | 1614 | "outputs": [ |
1604 | 1615 | { |
1605 | 1616 | "output_type": "stream", |
|
1628 | 1639 | "\r\n", |
1629 | 1640 | "df" |
1630 | 1641 | ], |
1631 | | - "execution_count": 46, |
| 1642 | + "execution_count": null, |
1632 | 1643 | "outputs": [ |
1633 | 1644 | { |
1634 | 1645 | "output_type": "stream", |
|
1681 | 1692 | "total = sum(output)\r\n", |
1682 | 1693 | "total" |
1683 | 1694 | ], |
1684 | | - "execution_count": 7, |
| 1695 | + "execution_count": null, |
1685 | 1696 | "outputs": [ |
1686 | 1697 | { |
1687 | 1698 | "output_type": "execute_result", |
|
1721 | 1732 | "\r\n", |
1722 | 1733 | "total.visualize()" |
1723 | 1734 | ], |
1724 | | - "execution_count": 10, |
| 1735 | + "execution_count": null, |
1725 | 1736 | "outputs": [ |
1726 | 1737 | { |
1727 | 1738 | "output_type": "execute_result", |
|
1750 | 1761 | "source": [ |
1751 | 1762 | "total.compute()" |
1752 | 1763 | ], |
1753 | | - "execution_count": 11, |
| 1764 | + "execution_count": null, |
1754 | 1765 | "outputs": [ |
1755 | 1766 | { |
1756 | 1767 | "output_type": "execute_result", |
|
1789 | 1800 | "\r\n", |
1790 | 1801 | "compute(*[total, total])" |
1791 | 1802 | ], |
1792 | | - "execution_count": 12, |
| 1803 | + "execution_count": null, |
1793 | 1804 | "outputs": [ |
1794 | 1805 | { |
1795 | 1806 | "output_type": "execute_result", |
|
1826 | 1837 | "\r\n", |
1827 | 1838 | "df = dd.read_csv('iris.csv')" |
1828 | 1839 | ], |
1829 | | - "execution_count": 17, |
| 1840 | + "execution_count": null, |
1830 | 1841 | "outputs": [] |
1831 | 1842 | }, |
1832 | 1843 | { |
|
1841 | 1852 | "source": [ |
1842 | 1853 | "compute([df])" |
1843 | 1854 | ], |
1844 | | - "execution_count": 18, |
| 1855 | + "execution_count": null, |
1845 | 1856 | "outputs": [ |
1846 | 1857 | { |
1847 | 1858 | "output_type": "execute_result", |
|
1882 | 1893 | "source": [ |
1883 | 1894 | "compute(*[total, total, df])" |
1884 | 1895 | ], |
1885 | | - "execution_count": 19, |
| 1896 | + "execution_count": null, |
1886 | 1897 | "outputs": [ |
1887 | 1898 | { |
1888 | 1899 | "output_type": "execute_result", |
|
1925 | 1936 | "source": [ |
1926 | 1937 | "compute(*[total, df, total])" |
1927 | 1938 | ], |
1928 | | - "execution_count": 20, |
| 1939 | + "execution_count": null, |
1929 | 1940 | "outputs": [ |
1930 | 1941 | { |
1931 | 1942 | "output_type": "execute_result", |
|
0 commit comments