Hzambare2698 code challenge #54

Open · wants to merge 2 commits into base: data-eng
47 changes: 47 additions & 0 deletions demyst_anonymize.py
@@ -0,0 +1,47 @@
import dask.dataframe as dd
import random
import string
import os

# Import file to generate large dataset
import generate_large_file

# Function to anonymize a specific column of a pandas partition by replacing
# each value with a random string (intended for use with Dask's map_partitions)
def anonymize_column(partition, column_name):
    # Ensure the column holds strings before anonymizing
    if partition[column_name].dtype != 'object':
        raise ValueError(f"Column '{column_name}' is not of type string and cannot be anonymized.")
    # pandas Series.apply takes no 'meta' keyword; apply the helper directly
    partition[column_name] = partition[column_name].apply(anonymize)
    return partition

# Helper function to generate a random replacement string
def anonymize(x):
    # Discards the original value and returns a random 10-character uppercase string
    return ''.join(random.choices(string.ascii_uppercase, k=10))

def apply_to_data():
    # Output directory; the current working directory always exists, so no makedirs is needed
    current_directory = os.getcwd()

    # Read the large CSV file lazily using Dask
    df = dd.read_csv('generated_large_data.csv')

    # Anonymize specific columns using the anonymize helper
    df['first_name'] = df['first_name'].apply(anonymize, meta=('first_name', 'str'))
    df['last_name'] = df['last_name'].apply(anonymize, meta=('last_name', 'str'))
    df['address'] = df['address'].apply(anonymize, meta=('address', 'str'))

    # Write the anonymized DataFrame back to a single CSV file;
    # to_csv triggers the computation, so no separate df.compute() is needed
    output_file_path = os.path.join(current_directory, 'anonymized_file.csv')
    df.to_csv(output_file_path, single_file=True)

if __name__ == '__main__':
    # Generate the large input file first, then anonymize it
    generate_large_file.generate_file()
    apply_to_data()
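
For reference, a minimal sketch (not part of the PR) of how the otherwise-unused anonymize_column helper could be wired in with Dask's map_partitions; the column names come from generate_large_file.py, and the output filename is an assumption:

# Sketch only: anonymize partition-by-partition instead of per-column apply
import dask.dataframe as dd
from demyst_anonymize import anonymize_column

df = dd.read_csv('generated_large_data.csv')
for col in ['first_name', 'last_name', 'address']:
    # map_partitions hands each pandas partition (plus the column name) to anonymize_column
    df = df.map_partitions(anonymize_column, col)
df.to_csv('anonymized_by_partition.csv', single_file=True)  # output filename is an assumption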
63 changes: 63 additions & 0 deletions demyst_fixed_width.py
@@ -0,0 +1,63 @@
import json

# Opens, reads, and closes the spec.json file using the json parser
# (safer and more robust than calling eval on the raw file contents)
with open('spec.json', 'r') as spec_file:
    spec_dict = json.load(spec_file)

# Creates variables for each entry in the spec.json file
column_names = spec_dict["ColumnNames"]
offsets = list(map(int, spec_dict["Offsets"]))  # Cast each offset from string to int
fixed_width_encoding = spec_dict["FixedWidthEncoding"]
include_header = spec_dict["IncludeHeader"] == "True"
delimited_encoding = spec_dict["DelimitedEncoding"]

# Function to parse the fixed-width file and write the result to a CSV file
def parse_fixed_width_file(input_file_path, output_file_path):

    # Reads all lines of the input file using the fixed-width encoding
    with open(input_file_path, 'r', encoding=fixed_width_encoding) as input_file:
        lines = input_file.readlines()

    # Opens the output file for writing with the delimited encoding
    with open(output_file_path, 'w', encoding=delimited_encoding) as output_file:

        # Write the header row if the spec requests one
        if include_header:
            # Join the column names with commas to produce the CSV header
            output_file.write(','.join(column_names) + '\n')

        # Parses each line based on the column offsets
        for line in lines:
            line = line.rstrip('\n')  # Remove any trailing newline
            row = []
            current_pos = 0
            for offset in offsets:
                # Slices the input line from the current position up to the current position plus the offset
                value = line[current_pos:current_pos + offset].strip()

                # Appends the extracted value to the row of data
                row.append(value)

                # Moves the current position to the start of the next column
                current_pos += offset

            # Write the row to the CSV file as comma-separated values
            output_file.write(','.join(row) + '\n')

if __name__ == '__main__':
    # Standard test
    parse_fixed_width_file('input1.txt', 'output1.csv')

    # Values larger than the column width test
    parse_fixed_width_file('input2.txt', 'output2.csv')

    # Empty file test
    parse_fixed_width_file('input3.txt', 'output3.csv')

    # Single word test
    parse_fixed_width_file('input4.txt', 'output4.csv')

    # More than 98 characters test
    parse_fixed_width_file('input5.txt', 'output5.csv')

    # Commas test
    parse_fixed_width_file('input6.txt', 'output6.csv')
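
The parser above assumes a spec.json sitting next to it. A minimal sketch (not part of the PR) of what such a file might contain, using the keys the script reads; every value below is an illustrative assumption, not the challenge's actual spec:

# Sketch only: write an example spec.json with the keys parse_fixed_width_file expects
import json

example_spec = {
    "ColumnNames": ["f1", "f2", "f3", "f4"],
    "Offsets": ["5", "12", "3", "2"],      # column widths as strings, cast to int by the parser
    "FixedWidthEncoding": "windows-1252",  # assumed encoding of the fixed-width input
    "IncludeHeader": "True",               # the parser compares against the string "True"
    "DelimitedEncoding": "utf-8"           # assumed encoding of the CSV output
}

with open('spec.json', 'w') as f:
    json.dump(example_spec, f, indent=4)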
34 changes: 34 additions & 0 deletions generate_large_file.py
@@ -0,0 +1,34 @@
import pandas as pd
import random
import string

# Function to generate random data rows
def generate_random_data(num_rows):
    data = []
    for _ in range(num_rows):
        first_name = ''.join(random.choices(string.ascii_uppercase, k=8))
        last_name = ''.join(random.choices(string.ascii_uppercase, k=12))
        address = ''.join(random.choices(string.ascii_uppercase + string.digits + ' ', k=20))
        dob = f'{random.randint(1, 31):02d}-{random.randint(1, 12):02d}-{random.randint(1900, 2000)}'
        data.append([first_name, last_name, address, dob])
    return data

# Defaults: data is generated in chunks of 500,000 rows; 30 million rows in total
# yields a CSV file of approximately 2 GB

# Create and write data in chunks
def generate_file(num_rows_per_chunk=500000, total_rows=30000000,
                  columns=['first_name', 'last_name', 'address', 'date_of_birth']):
    output_file = 'generated_large_data.csv'
    with open(output_file, 'w') as f:
        # Write headers to the CSV file
        f.write(','.join(columns) + '\n')

        # Generate and write data in chunks
        for _ in range(total_rows // num_rows_per_chunk):
            data_chunk = generate_random_data(num_rows_per_chunk)
            # Write chunk to file
            for row in data_chunk:
                f.write(','.join(row) + '\n')
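
A quick smoke test (not part of the PR) can be useful before committing to the full ~2 GB run; the parameter values below are assumptions:

# Sketch only: generate a small generated_large_data.csv to verify the layout
# before running with the defaults (30 million rows, roughly 2 GB)
import generate_large_file

generate_large_file.generate_file(num_rows_per_chunk=1000, total_rows=5000)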
2 changes: 2 additions & 0 deletions input1.txt
@@ -0,0 +1,2 @@
John 25 M NY USA New York 1234 Developer 3000 Manager
Mary 32 F LA USA Los Angeles 5000 Designer 4000 Leader
2 changes: 2 additions & 0 deletions input2.txt
@@ -0,0 +1,2 @@
William 250584390584390 ABCD NYNY UnitedStatesofAmerica New York City 12345678910 DeveloperJOBthing12 3000 Manajfkld^^yrewuihfdsklger
Himanshu 32108349083290 WXYZ LALA USofAmerica Los Angeles County 5000 Designerbranded57#$"f 4000000000000000000000000 Leader
Empty file added input3.txt
Empty file.
1 change: 1 addition & 0 deletions input4.txt
@@ -0,0 +1 @@
SINGLEWORDTESTFILE
1 change: 1 addition & 0 deletions input5.txt
@@ -0,0 +1 @@
jfdklsafjtre98t 74895 7r384r &*(%&v* (%)_%*#U49T04UFJILDJG *)&#() UREIWPTUFOPI9-EWU80 U80@#$%^&*&^%$#$%^&*( UISDUF JDISUFJEDITOUEWRGJORIUFJGTR8GUEW79GU89RFEIYGHJ8RIOG ITKPO9T7493847 39 EUW89TYWE79WE
1 change: 1 addition & 0 deletions input6.txt
@@ -0,0 +1 @@
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,