Updates Fall2024
trajanov committed Sep 19, 2024
1 parent 50d042e commit 733538a
Showing 31 changed files with 1,083 additions and 1,191 deletions.
2 changes: 1 addition & 1 deletion Notebooks/Spark-Example-01-Word-Count-Example.ipynb
@@ -200,7 +200,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.10.14"
},
"vscode": {
"interpreter": {
2 changes: 1 addition & 1 deletion Notebooks/Spark-Example-02-RDD Basics Toutorial.ipynb
@@ -2983,7 +2983,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.10.14"
},
"vscode": {
"interpreter": {
2 changes: 1 addition & 1 deletion Notebooks/Spark-Example-03-PySpark vs Python.ipynb
@@ -563,7 +563,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.10.14"
}
},
"nbformat": 4,
@@ -204,7 +204,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.10.14"
},
"vscode": {
"interpreter": {
4 changes: 2 additions & 2 deletions Notebooks/Spark-Example-08-PySpark and NumPy.ipynb
@@ -481,7 +481,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 20,
"metadata": {},
"outputs": [
{
@@ -531,7 +531,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.10.14"
},
"vscode": {
"interpreter": {
43 changes: 22 additions & 21 deletions Notebooks/Spark-Example-09-Data-Partitioning-TreeAggregate.ipynb
@@ -9,7 +9,7 @@
"Partitioning is the process of dividing a large dataset into smaller, more manageable chunks, known as partitions. In Spark, each partition is a logical chunk of a large distributed data frame or RDD (Resilient Distributed Dataset).\n",
"\n",
"$$\n",
"\\text{Number of Partitions} = \\frac{\\text{Size of Data}}{\\text{Size of each Partition}}\n",
"\\text{Number of Partitions} = \\frac{\\text{Size of Data}}{\\text{Size of Partition}}\n",
"$$\n",
"\n",
"## Importance of Partitioning\n",
@@ -27,7 +27,7 @@
"Spark will create the partitions based on the hash value of the key. For example, if you have 1000 keys and you want to create 10 partitions, then Spark will create 10 partitions based on the hash value of the key. The hash value of the key will be calculated by using the following formula:\n",
"\n",
"$$\n",
"\\text{Partition} = \\text{hash}(key) \\mod \\text{numPartitions}\n",
"\\text{Partition} = \\text{hash}(key) \\mod numPartitions\n",
"$$\n",
"\n",
"### 2. Range Partitioning\n",
@@ -56,7 +56,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -70,7 +70,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 17,
"metadata": {},
"outputs": [
{
@@ -79,7 +79,7 @@
"8"
]
},
"execution_count": 30,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -100,7 +100,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -109,7 +109,7 @@
"4"
]
},
"execution_count": 31,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -122,7 +122,7 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -152,7 +152,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -197,7 +197,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 18,
"metadata": {},
"outputs": [
{
@@ -214,6 +214,7 @@
"source": [
"ratings = sc.parallelize([(\"A\",3),(\"A\",4),(\"A\",5),(\"A\",6),(\"B\",8),(\"B\",9),], 3)\n",
"print(\"partitions\\n\",ratings.glom().collect())\n",
"# Calculate the sum and count in one pass using aggregateByKey\n",
"sum_count = ratings.aggregateByKey((0, 0),\n",
" (lambda C, V: (C[0]+V, C[1]+1)),\n",
" (lambda C1, C2: (C1[0]+C2[0], C1[1]+C2[1]))\n",
@@ -239,7 +240,7 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -276,7 +277,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -296,7 +297,7 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -345,7 +346,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -364,7 +365,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 12,
"metadata": {},
"outputs": [
{
@@ -390,7 +391,7 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": 13,
"metadata": {},
"outputs": [
{
@@ -433,7 +434,7 @@
},
{
"cell_type": "code",
"execution_count": 39,
"execution_count": 19,
"metadata": {},
"outputs": [
{
@@ -476,7 +477,7 @@
},
{
"cell_type": "code",
"execution_count": 40,
"execution_count": 20,
"metadata": {},
"outputs": [
{
@@ -486,12 +487,12 @@
"Total Partitions: 3\n",
"Partition 0: [('Bob', 2), ('Catherine', 3), ('Daniel', 4), ('George', 7)]\n",
"Partition 1: [('Ella', 5), ('Jack', 10)]\n",
"Partition 2: [('Alice', 1), ('Frank', 6), ('Harry', 8), ('Irene', 9)]\n"
"Partition 2: [('Alice', 1), ('Frank', 6), ('Harry', 8), ('Irene', 9), ('Al', 10)]\n"
]
}
],
"source": [
"data = [(\"Alice\", 1), (\"Bob\", 2), (\"Catherine\", 3), (\"Daniel\", 4), (\"Ella\", 5), (\"Frank\", 6), (\"George\", 7), (\"Harry\", 8), (\"Irene\", 9), (\"Jack\", 10) ]\n",
"data = [(\"Alice\", 1), (\"Bob\", 2), (\"Catherine\", 3), (\"Daniel\", 4), (\"Ella\", 5), (\"Frank\", 6), (\"George\", 7), (\"Harry\", 8), (\"Irene\", 9), (\"Jack\", 10), (\"Al\", 10) ]\n",
"rdd = sc.parallelize(data)\n",
"\n",
"# Define a custom partitioning function\n",
@@ -535,7 +536,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.10.14"
},
"vscode": {
"interpreter": {
@@ -5104,7 +5104,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.10.14"
},
"vscode": {
"interpreter": {