
Commit fa5661b

⚙️ Staged data format should be CSV; minor formatting
1 parent d6adb39 commit fa5661b

File tree

2 files changed: +7 −7 lines changed


dags/scripts/spark/malware_file_detection.py

+6 −6

@@ -1,8 +1,8 @@
-"""This is one place where machine learning with Spark could occur. A previously
-trained classification algorithm could perhaps be used to classify the new batch of
-incoming data and predict whether or not the features in there describe a malicious
-file. Additional ML engineering can be done to feed the algorithm with new data
-and also improve its accuracy, but that is out of the scope of this project.
+"""This is one place where data processing or machine learning with Spark could occur.
+A previously trained classification algorithm could perhaps be used to classify the
+new batch of incoming data and predict whether or not the features in there describe
+a malicious file. Additional ML engineering can be done to feed the algorithm with
+new data and also improve its accuracy, but that is out of the scope of this project.
 
 In this script, I am just selecting some columns that I think might be useful to
 display on a daily dashboard, and will not be doing any machine learning. The
@@ -37,4 +37,4 @@
 
 # sys.argv[2] is also the full S3 URI for the output destination folder that EMR will write to
 # this is going to be the `stage` folder on S3
-new_df.write.format("parquet").mode("overwrite").save(sys.argv[2])
+new_df.write.format("csv").mode("overwrite").save(sys.argv[2])
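One practical consequence of this change: Spark's `df.write.format("csv").save(path)` produces a folder of `part-*` files (plus a `_SUCCESS` marker), not a single CSV file, so any downstream consumer of the `stage` folder has to gather the parts. A minimal sketch of that, in pure Python, assuming a local copy of the staged folder (the helper name `read_staged_csv` is hypothetical, not from this repo):

```python
import csv
import glob
import os


def read_staged_csv(stage_dir: str) -> list[list[str]]:
    """Collect rows from every CSV part file Spark wrote into `stage_dir`.

    Spark writes one part file per output partition; sorting the file
    names keeps the row order deterministic across runs.
    """
    rows: list[list[str]] = []
    for part in sorted(glob.glob(os.path.join(stage_dir, "part-*"))):
        with open(part, newline="") as fh:
            rows.extend(csv.reader(fh))
    return rows
```

In this pipeline the gathering is presumably done by whatever loads the stage folder into Redshift (a `COPY` from an S3 prefix handles part files natively), so a helper like this would only matter for local inspection or testing.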

dags/utils.py

+1 −1

@@ -135,7 +135,7 @@ def _pause_redshift_cluster(cluster_identifier: str):
     cluster_state = redshift_hook.cluster_status(cluster_identifier=cluster_identifier)
 
     try:
-        if cluster_state == 'paused':
+        if cluster_state == "paused":
             return
 
         redshift_hook.get_conn().pause_cluster(ClusterIdentifier=cluster_identifier)
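The guard shown in this hunk makes the pause idempotent: if the cluster already reports `"paused"`, the helper returns before issuing the `pause_cluster` API call. A minimal sketch of that control flow, with a hypothetical stub standing in for Airflow's Redshift hook (the real hook talks to AWS; this stub only records calls):

```python
class FakeRedshiftHook:
    """Hypothetical stand-in for the Airflow Redshift hook, for illustration only."""

    def __init__(self, state: str):
        self._state = state
        self.pause_calls = 0

    def cluster_status(self, cluster_identifier: str) -> str:
        return self._state

    def get_conn(self):
        # The real hook returns a boto3 Redshift client here.
        return self

    def pause_cluster(self, ClusterIdentifier: str):
        self.pause_calls += 1
        self._state = "paused"


def pause_redshift_cluster(redshift_hook, cluster_identifier: str) -> None:
    # Mirrors the guard in dags/utils.py: skip the API call when already paused.
    cluster_state = redshift_hook.cluster_status(cluster_identifier=cluster_identifier)
    if cluster_state == "paused":
        return
    redshift_hook.get_conn().pause_cluster(ClusterIdentifier=cluster_identifier)
```

Running the helper twice against the same stub shows the second call is a no-op, which is exactly the behavior the early `return` buys in a DAG that may retry the task.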
