Updates Fall2024
trajanov committed Sep 19, 2024
1 parent 50d042e commit 733538a
Showing 31 changed files with 1,083 additions and 1,191 deletions.
2 changes: 1 addition & 1 deletion Notebooks/Spark-Example-01-Word-Count-Example.ipynb
@@ -200,7 +200,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.10.14"
},
"vscode": {
"interpreter": {
2 changes: 1 addition & 1 deletion Notebooks/Spark-Example-02-RDD Basics Toutorial.ipynb
@@ -2983,7 +2983,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.10.14"
},
"vscode": {
"interpreter": {
2 changes: 1 addition & 1 deletion Notebooks/Spark-Example-03-PySpark vs Python.ipynb
@@ -563,7 +563,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.10.14"
}
},
"nbformat": 4,
@@ -204,7 +204,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.10.14"
},
"vscode": {
"interpreter": {
4 changes: 2 additions & 2 deletions Notebooks/Spark-Example-08-PySpark and NumPy.ipynb
@@ -481,7 +481,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 20,
"metadata": {},
"outputs": [
{
@@ -531,7 +531,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.10.14"
},
"vscode": {
"interpreter": {
43 changes: 22 additions & 21 deletions Notebooks/Spark-Example-09-Data-Partitioning-TreeAggregate.ipynb
@@ -9,7 +9,7 @@
"Partitioning is the process of dividing a large dataset into smaller, more manageable chunks, known as partitions. In Spark, each partition is a logical chunk of a large distributed data frame or RDD (Resilient Distributed Dataset).\n",
"\n",
"$$\n",
"\\text{Number of Partitions} = \\frac{\\text{Size of Data}}{\\text{Size of each Partition}}\n",
"\\text{Number of Partitions} = \\frac{\\text{Size of Data}}{\\text{Size of Partition}}\n",
"$$\n",
"\n",
"## Importance of Partitioning\n",
@@ -27,7 +27,7 @@
"Spark will create the partitions based on the hash value of the key. For example, if you have 1000 keys and you want to create 10 partitions, then Spark will create 10 partitions based on the hash value of the key. The hash value of the key will be calculated by using the following formula:\n",
"\n",
"$$\n",
"\\text{Partition} = \\text{hash}(key) \\mod \\text{numPartitions}\n",
"\\text{Partition} = \\text{hash}(key) \\mod numPartitions\n",
"$$\n",
"\n",
"### 2. Range Partitioning\n",
@@ -56,7 +56,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -70,7 +70,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 17,
"metadata": {},
"outputs": [
{
@@ -79,7 +79,7 @@
"8"
]
},
"execution_count": 30,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -100,7 +100,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -109,7 +109,7 @@
"4"
]
},
"execution_count": 31,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -122,7 +122,7 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -152,7 +152,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -197,7 +197,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 18,
"metadata": {},
"outputs": [
{
@@ -214,6 +214,7 @@
"source": [
"ratings = sc.parallelize([(\"A\",3),(\"A\",4),(\"A\",5),(\"A\",6),(\"B\",8),(\"B\",9),], 3)\n",
"print(\"partitions\\n\",ratings.glom().collect())\n",
"# Calculate the sum and count in one pass using aggregateByKey\n",
"sum_count = ratings.aggregateByKey((0, 0),\n",
" (lambda C, V: (C[0]+V, C[1]+1)),\n",
" (lambda C1, C2: (C1[0]+C2[0], C1[1]+C2[1]))\n",
@@ -239,7 +240,7 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -276,7 +277,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -296,7 +297,7 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -345,7 +346,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -364,7 +365,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 12,
"metadata": {},
"outputs": [
{
@@ -390,7 +391,7 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": 13,
"metadata": {},
"outputs": [
{
@@ -433,7 +434,7 @@
},
{
"cell_type": "code",
"execution_count": 39,
"execution_count": 19,
"metadata": {},
"outputs": [
{
@@ -476,7 +477,7 @@
},
{
"cell_type": "code",
"execution_count": 40,
"execution_count": 20,
"metadata": {},
"outputs": [
{
@@ -486,12 +487,12 @@
"Total Partitions: 3\n",
"Partition 0: [('Bob', 2), ('Catherine', 3), ('Daniel', 4), ('George', 7)]\n",
"Partition 1: [('Ella', 5), ('Jack', 10)]\n",
"Partition 2: [('Alice', 1), ('Frank', 6), ('Harry', 8), ('Irene', 9)]\n"
"Partition 2: [('Alice', 1), ('Frank', 6), ('Harry', 8), ('Irene', 9), ('Al', 10)]\n"
]
}
],
"source": [
"data = [(\"Alice\", 1), (\"Bob\", 2), (\"Catherine\", 3), (\"Daniel\", 4), (\"Ella\", 5), (\"Frank\", 6), (\"George\", 7), (\"Harry\", 8), (\"Irene\", 9), (\"Jack\", 10) ]\n",
"data = [(\"Alice\", 1), (\"Bob\", 2), (\"Catherine\", 3), (\"Daniel\", 4), (\"Ella\", 5), (\"Frank\", 6), (\"George\", 7), (\"Harry\", 8), (\"Irene\", 9), (\"Jack\", 10), (\"Al\", 10) ]\n",
"rdd = sc.parallelize(data)\n",
"\n",
"# Define a custom partitioning function\n",
@@ -535,7 +536,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.10.14"
},
"vscode": {
"interpreter": {
@@ -5104,7 +5104,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.10.14"
},
"vscode": {
"interpreter": {