Hzambare2698 code challenge #54

Open · wants to merge 2 commits into base: data-eng
47 changes: 47 additions & 0 deletions demyst_anonymize.py
@@ -0,0 +1,47 @@
import dask.dataframe as dd
import random
import string
import os

# Import file to generate large dataset
import generate_large_file

# Function to anonymize a specific column of a pandas partition by replacing
# each value with a random string (intended for use with Dask's map_partitions)
def anonymize_column(partition, column_name):
    # Ensure the column holds strings before anonymizing
    if partition[column_name].dtype != 'object':
        raise ValueError(f"Column '{column_name}' is not of type string and cannot be anonymized.")
    # pandas Series.apply takes no 'meta' keyword; apply the helper directly
    partition[column_name] = partition[column_name].apply(anonymize)
    return partition

# Helper function to generate a random replacement string
def anonymize(x):
    # Discards the original value and returns a random 10-character uppercase string
    return ''.join(random.choices(string.ascii_uppercase, k=10))

def apply_to_data():
    # Output directory; the current working directory always exists, so no makedirs is needed
    current_directory = os.getcwd()

    # Read the large CSV file lazily using Dask
    df = dd.read_csv('generated_large_data.csv')

    # Anonymize specific columns using the anonymize helper
    df['first_name'] = df['first_name'].apply(anonymize, meta=('first_name', 'str'))
    df['last_name'] = df['last_name'].apply(anonymize, meta=('last_name', 'str'))
    df['address'] = df['address'].apply(anonymize, meta=('address', 'str'))

    # Write the anonymized DataFrame back to a single CSV file;
    # to_csv triggers the computation, so no separate df.compute() is needed
    output_file_path = os.path.join(current_directory, 'anonymized_file.csv')
    df.to_csv(output_file_path, single_file=True)

if __name__ == '__main__':
    # Generate the large input file first, then anonymize it
    generate_large_file.generate_file()
    apply_to_data()
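
For reference, a minimal sketch (not part of the PR) of how the otherwise-unused anonymize_column helper could be wired in with Dask's map_partitions; the column names come from generate_large_file.py, and the output filename is an assumption:

# Sketch only: anonymize partition-by-partition instead of per-column apply
import dask.dataframe as dd
from demyst_anonymize import anonymize_column

df = dd.read_csv('generated_large_data.csv')
for col in ['first_name', 'last_name', 'address']:
    # map_partitions hands each pandas partition (plus the column name) to anonymize_column
    df = df.map_partitions(anonymize_column, col)
df.to_csv('anonymized_by_partition.csv', single_file=True)  # output filename is an assumption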
63 changes: 63 additions & 0 deletions demyst_fixed_width.py
@@ -0,0 +1,63 @@
import json

# Opens, reads, and closes the spec.json file using the json parser
# (safer and more robust than calling eval on the raw file contents)
with open('spec.json', 'r') as spec_file:
    spec_dict = json.load(spec_file)

# Creates variables for each entry in the spec.json file
column_names = spec_dict["ColumnNames"]
offsets = list(map(int, spec_dict["Offsets"]))  # Cast each offset from string to int
fixed_width_encoding = spec_dict["FixedWidthEncoding"]
include_header = spec_dict["IncludeHeader"] == "True"
delimited_encoding = spec_dict["DelimitedEncoding"]

# Function to parse the fixed-width file and write the result to a CSV file
def parse_fixed_width_file(input_file_path, output_file_path):

    # Reads all lines of the input file using the fixed-width encoding
    with open(input_file_path, 'r', encoding=fixed_width_encoding) as input_file:
        lines = input_file.readlines()

    # Opens the output file for writing with the delimited encoding
    with open(output_file_path, 'w', encoding=delimited_encoding) as output_file:

        # Write the header row if the spec requests one
        if include_header:
            # Join the column names with commas to produce the CSV header
            output_file.write(','.join(column_names) + '\n')

        # Parses each line based on the column offsets
        for line in lines:
            line = line.rstrip('\n')  # Remove any trailing newline
            row = []
            current_pos = 0
            for offset in offsets:
                # Slices the input line from the current position up to the current position plus the offset
                value = line[current_pos:current_pos + offset].strip()

                # Appends the extracted value to the row of data
                row.append(value)

                # Moves the current position to the start of the next column
                current_pos += offset

            # Write the row to the CSV file as comma-separated values
            output_file.write(','.join(row) + '\n')

if __name__ == '__main__':
    # Standard test
    parse_fixed_width_file('input1.txt', 'output1.csv')

    # Values larger than the column width test
    parse_fixed_width_file('input2.txt', 'output2.csv')

    # Empty file test
    parse_fixed_width_file('input3.txt', 'output3.csv')

    # Single word test
    parse_fixed_width_file('input4.txt', 'output4.csv')

    # More than 98 characters test
    parse_fixed_width_file('input5.txt', 'output5.csv')

    # Commas test
    parse_fixed_width_file('input6.txt', 'output6.csv')
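
The parser above assumes a spec.json sitting next to it. A minimal sketch (not part of the PR) of what such a file might contain, using the keys the script reads; every value below is an illustrative assumption, not the challenge's actual spec:

# Sketch only: write an example spec.json with the keys parse_fixed_width_file expects
import json

example_spec = {
    "ColumnNames": ["f1", "f2", "f3", "f4"],
    "Offsets": ["5", "12", "3", "2"],      # column widths as strings, cast to int by the parser
    "FixedWidthEncoding": "windows-1252",  # assumed encoding of the fixed-width input
    "IncludeHeader": "True",               # the parser compares against the string "True"
    "DelimitedEncoding": "utf-8"           # assumed encoding of the CSV output
}

with open('spec.json', 'w') as f:
    json.dump(example_spec, f, indent=4)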
34 changes: 34 additions & 0 deletions generate_large_file.py
@@ -0,0 +1,34 @@
import pandas as pd
import random
import string

# Function to generate random data rows
def generate_random_data(num_rows):
    data = []
    for _ in range(num_rows):
        first_name = ''.join(random.choices(string.ascii_uppercase, k=8))
        last_name = ''.join(random.choices(string.ascii_uppercase, k=12))
        address = ''.join(random.choices(string.ascii_uppercase + string.digits + ' ', k=20))
        dob = f'{random.randint(1, 31):02d}-{random.randint(1, 12):02d}-{random.randint(1900, 2000)}'
        data.append([first_name, last_name, address, dob])
    return data

# Defaults: data is generated in chunks of 500,000 rows; 30 million rows in total
# yields a CSV file of approximately 2 GB

# Create and write data in chunks
def generate_file(num_rows_per_chunk=500000, total_rows=30000000,
                  columns=['first_name', 'last_name', 'address', 'date_of_birth']):
    output_file = 'generated_large_data.csv'
    with open(output_file, 'w') as f:
        # Write headers to the CSV file
        f.write(','.join(columns) + '\n')

        # Generate and write data in chunks
        for _ in range(total_rows // num_rows_per_chunk):
            data_chunk = generate_random_data(num_rows_per_chunk)
            # Write chunk to file
            for row in data_chunk:
                f.write(','.join(row) + '\n')
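
A quick smoke test (not part of the PR) can be useful before committing to the full ~2 GB run; the parameter values below are assumptions:

# Sketch only: generate a small generated_large_data.csv to verify the layout
# before running with the defaults (30 million rows, roughly 2 GB)
import generate_large_file

generate_large_file.generate_file(num_rows_per_chunk=1000, total_rows=5000)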
2 changes: 2 additions & 0 deletions input1.txt
@@ -0,0 +1,2 @@
John 25 M NY USA New York 1234 Developer 3000 Manager
Mary 32 F LA USA Los Angeles 5000 Designer 4000 Leader
2 changes: 2 additions & 0 deletions input2.txt
@@ -0,0 +1,2 @@
William 250584390584390 ABCD NYNY UnitedStatesofAmerica New York City 12345678910 DeveloperJOBthing12 3000 Manajfkld^^yrewuihfdsklger
Himanshu 32108349083290 WXYZ LALA USofAmerica Los Angeles County 5000 Designerbranded57#$"f 4000000000000000000000000 Leader
Empty file added input3.txt
Empty file.
1 change: 1 addition & 0 deletions input4.txt
@@ -0,0 +1 @@
SINGLEWORDTESTFILE
1 change: 1 addition & 0 deletions input5.txt
@@ -0,0 +1 @@
jfdklsafjtre98t 74895 7r384r &*(%&v* (%)_%*#U49T04UFJILDJG *)&#() UREIWPTUFOPI9-EWU80 U80@#$%^&*&^%$#$%^&*( UISDUF JDISUFJEDITOUEWRGJORIUFJGTR8GUEW79GU89RFEIYGHJ8RIOG ITKPO9T7493847 39 EUW89TYWE79WE
1 change: 1 addition & 0 deletions input6.txt
@@ -0,0 +1 @@
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,