
Commit 0143aaf

Initial commit.
1 parent c7b4fa7 commit 0143aaf

7 files changed: +194, -1 lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
@@ -157,4 +157,6 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
+
+parquet/

README.md

Lines changed: 24 additions & 0 deletions
Offline Feature Extraction with Spark and Regular Expressions
----

This project demonstrates the scalability of Spark, and its ability to perform offline feature extraction, using the Wikipedia dataset. This allows the user of the scripts to find articles that match the supplied regular expressions, for use in training machine learning models.

The dataset can be found here:
https://huggingface.co/datasets/wikimedia/wikipedia

Three scripts are provided:

1. `download_wikipedia.py` - This script downloads the English Wikipedia dataset as a single file.
2. `repartition.py` - This script uses Spark to repartition the data into convenient ~512 MB Parquet files with Snappy compression.
3. `scan.py` - This script uses Spark to add a regular-expression category column to the dataset, filter out rows that lack any matches, and save the result to disk.

Note:

The driver/executor configuration is hardcoded in the repartition and scan scripts. It should likely be driven by a configuration file instead.

Future Enhancements:

1. Remove unnecessary columns to save on storage space.
2. Replace the UDF with a flatMap to reduce the time spent in the `filter` step (see the sketch after the scan.py listing below).
3. Source a larger dataset and benchmark the performance of the scripts.
4. Load driver/executor settings from a configuration file instead of hard-coding them (a minimal sketch follows this list).
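
A minimal sketch of how the driver/executor settings could be loaded from a configuration file, per Future Enhancement 4. The file name spark_config.json and its keys are hypothetical choices for illustration, not part of this commit:

import json

from pyspark.sql import SparkSession

# Hypothetical config file, e.g.:
# {"master": "spark://processing:7077", "driver_memory": "16g", "driver_cores": "1",
#  "executor_memory": "32g", "executor_instances": "1", "executor_cores": "1"}
with open("spark_config.json") as f:
    cfg = json.load(f)

spark = SparkSession.builder \
    .master(cfg["master"]) \
    .config("spark.driver.memory", cfg["driver_memory"]) \
    .config("spark.driver.cores", cfg["driver_cores"]) \
    .config("spark.executor.memory", cfg["executor_memory"]) \
    .config("spark.executor.instances", cfg["executor_instances"]) \
    .config("spark.executor.cores", cfg["executor_cores"]) \
    .appName('RepartitionFile') \
    .getOrCreate()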

download_wikipedia.py

Lines changed: 11 additions & 0 deletions
from datasets import load_dataset
import pandas as pd

# Load the dataset; note this dataset is quite large, so make sure you have enough free disk space
ds = load_dataset("wikimedia/wikipedia", "20231101.en")

# Convert the dataset to a Pandas DataFrame
df = pd.DataFrame(ds['train'])

# Save the DataFrame as a Parquet file
df.to_parquet('parquet/input/wikipedia_20231101.parquet', engine='fastparquet')
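
A possible alternative (a sketch, not what the script above does): recent releases of the datasets library expose Dataset.to_parquet, which would write the split straight to Parquet without materialising the full pandas DataFrame in memory first:

from datasets import load_dataset

# Assumes a datasets release that provides Dataset.to_parquet
ds = load_dataset("wikimedia/wikipedia", "20231101.en")
ds["train"].to_parquet("parquet/input/wikipedia_20231101.parquet")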

repartition.py

Lines changed: 39 additions & 0 deletions
from pyspark.sql import SparkSession
from pyspark import StorageLevel

import math

# Initialize Spark session
spark = SparkSession.builder \
    .master('spark://processing:7077') \
    .config("spark.driver.memory", "16g") \
    .config("spark.driver.cores", "1") \
    .config("spark.executor.memory", "32g") \
    .config("spark.executor.instances", "1") \
    .config("spark.executor.cores", "1") \
    .appName('RepartitionFile') \
    .getOrCreate()

# File size used to compute the repartition target, plus input/output file paths
file_size_gb = 11  # Approximate size of the input Parquet file; should probably be read from disk instead
file_size_mb = file_size_gb * 1024
target_partition_size_mb = 512

input_path = "parquet/input/wikipedia_20231101.parquet"
output_path = "parquet/input/repartitioned_data"

# Load the input parquet file
df = spark.read.parquet(input_path)
df.persist(StorageLevel.MEMORY_AND_DISK)

# Calculate the number of partitions
num_partitions = math.ceil(file_size_mb / target_partition_size_mb)

# Repartition the dataframe; note this doesn't execute until the write below, since Spark dataframes are evaluated lazily
repartitioned_df = df.repartition(num_partitions)

# Write the repartitioned dataframe to parquet files
repartitioned_df.write.mode("overwrite").parquet(output_path)

# Stop the Spark session; this frees resources in the Spark cluster and clearly marks the end of the job
spark.stop()
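
The script's own comment notes that the input size should probably be read from disk rather than hardcoded at 11 GB. A minimal sketch of that idea, reusing the paths and target size above, and assuming the input is the single Parquet file written by download_wikipedia.py:

import math
import os

input_path = "parquet/input/wikipedia_20231101.parquet"
target_partition_size_mb = 512

# If the input were a directory of part files, the individual file sizes would need to be summed instead
file_size_mb = os.path.getsize(input_path) / (1024 * 1024)
num_partitions = max(1, math.ceil(file_size_mb / target_partition_size_mb))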

requirements.txt

Lines changed: 11 additions & 0 deletions
# PySpark
pyspark

# ML
datasets
pandas

# Parquet
fastparquet  # used by the download wikipedia script to save the dataset
parquet-tools  # this is used for CLI reading/inspection of parquet files

scan.py

Lines changed: 51 additions & 0 deletions
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, size
from pyspark.sql.types import StringType, ArrayType
from pyspark import StorageLevel
from utils import regex_patterns

import re

spark = SparkSession.builder \
    .master('spark://processing:7077') \
    .config("spark.driver.memory", "12g") \
    .config("spark.driver.cores", "1") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.instances", "4") \
    .config("spark.executor.cores", "1") \
    .appName('Scan') \
    .getOrCreate()

# Compile the combined regex patterns; this significantly speeds up matching, and (?i) makes it case-insensitive
compiled_patterns = {category: re.compile("(?i)" + "|".join(patterns)) for category, patterns in regex_patterns.items()}

# Define a UDF to classify text based on the compiled regex patterns
def classify_text(text):
    matched_categories = []
    for category, pattern in compiled_patterns.items():
        if pattern.search(text):
            matched_categories.append(category)
    return matched_categories if matched_categories else None

classify_text_udf = udf(classify_text, ArrayType(StringType()))

# Load the input parquet files
input_df = spark.read.parquet("parquet/input/repartitioned_data/*.parquet")

# Persist to memory and disk so the job can still run successfully in limited-memory environments
input_df.persist(StorageLevel.MEMORY_AND_DISK)

# Apply the classification UDF to the text column
output_df = input_df.withColumn("categories", classify_text_udf(col("text")))

# Filter out rows with no categories
filtered_df = output_df.filter(col("categories").isNotNull() & (size(col("categories")) > 0))

# Select the required columns
result_df = filtered_df.select("title", "text", "categories")

# Save the result to a parquet file
result_df.write.mode("overwrite").parquet("parquet/output/results")

# Stop the Spark session
spark.stop()
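
A sketch of Future Enhancement 2 from the README: classify and filter in one pass with an RDD flatMap instead of a UDF followed by a separate filter. Column names and paths follow scan.py; the rest is illustrative, not the project's current implementation:

import re

from pyspark.sql import Row, SparkSession
from utils import regex_patterns

spark = SparkSession.builder.appName('ScanFlatMap').getOrCreate()

compiled_patterns = {category: re.compile("(?i)" + "|".join(patterns)) for category, patterns in regex_patterns.items()}

# Emit a row only when at least one category matches, so no separate filter step is needed
def classify_rows(row):
    categories = [category for category, pattern in compiled_patterns.items() if pattern.search(row["text"])]
    if categories:
        yield Row(title=row["title"], text=row["text"], categories=categories)

input_df = spark.read.parquet("parquet/input/repartitioned_data/*.parquet")
result_df = input_df.rdd.flatMap(classify_rows).toDF()
result_df.write.mode("overwrite").parquet("parquet/output/results")

spark.stop()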

utils.py

Lines changed: 55 additions & 0 deletions
# RegEx Patterns generated by ChatGPT for electronic music
regex_patterns = {
    "house": [
        "\\bdeep\\s+house\\b",
        "\\btech\\s+house\\b",
        "\\bprogressive\\s+house\\b",
        "\\btropical\\s+house\\b",
        "\\bfuture\\s+house\\b",
        "\\bacid\\s+house\\b",
        "\\belectro\\s+house\\b"
    ],
    "techno": [
        "\\bminimal\\s+techno\\b",
        "\\bdetroit\\s+techno\\b",
        "\\bacid\\s+techno\\b",
        "\\bindustrial\\s+techno\\b",
        "\\bhard\\s+techno\\b",
        "\\btech\\s+techno\\b"
    ],
    "trance": [
        "\\bprogressive\\s+trance\\b",
        "\\buplifting\\s+trance\\b",
        "\\bpsychedelic\\s+trance\\b",
        "\\bgoa\\s+trance\\b",
        "\\btech\\s+trance\\b",
        "\\bvocal\\s+trance\\b"
    ],
    "dubstep": [
        "\\bbrostep\\b",
        "\\bchillstep\\b",
        "\\briddim\\b",
        "\\bdeep\\s+dubstep\\b"
    ],
    "drum_and_bass": [
        "\\bdnb\\b",
        "\\bneurofunk\\b",
        "\\bliquid\\s+funk\\b",
        "\\bjump\\s+up\\b",
        "\\bdarkstep\\b",
        "\\bbreakcore\\b"
    ],
    "electro": [
        "\\belectro\\b",
        "\\belectro\\s+funk\\b",
        "\\belectro\\s+clash\\b",
        "\\bfuture\\s+electro\\b"
    ],
    "hardcore": [
        "\\bhardcore\\b",
        "\\bgabber\\b",
        "\\bhappy\\s+hardcore\\b",
        "\\bdigital\\s+hardcore\\b",
        "\\bbreakbeat\\s+hardcore\\b"
    ]
}
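
A quick local check (a sketch run from the repo root, not part of the commit) of how scan.py's combined, case-insensitive patterns behave on a sample sentence:

import re

from utils import regex_patterns

combined = {category: re.compile("(?i)" + "|".join(patterns)) for category, patterns in regex_patterns.items()}

sample = "Goa Trance and Acid House both emerged in the late 1980s."
print([category for category, pattern in combined.items() if pattern.search(sample)])  # ['house', 'trance']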
