check columns for osm and repd

alan-turing-institute · Apr 16, 2020 · 1752795 · 1752795
1 parent 26b770a
commit 1752795
Show file tree

Hide file tree

Showing 2 changed files with 96 additions and 14 deletions.
diff --git a/data/processed/pre-process-osm.py b/data/processed/pre-process-osm.py
@@ -9,12 +9,37 @@
 sys.stdin.reconfigure(encoding='iso-8859-1')
 osm_df = pd.read_csv(sys.stdin)
 
+# Check the file has the columns we expect and put them in the expected order
+# If a column doesn't exist, add it as an empty (all-NaN) column
+output_df = pd.DataFrame()
+required_columns = ['objtype',
+                    'id',
+                    'user',
+                    'timestamp',
+                    'lat',
+                    'lon',
+                    'calc_area',
+                    'calc_capacity',
+                    'generator:solar:modules',
+                    'location',
+                    'orientation',
+                    'plantref',
+                    'tag_power',
+                    'tag_repd:id',
+                    'tag_start_date']
+
+for col in required_columns:
+    try:
+        output_df[col] = osm_df[col]
+    except KeyError:
+        output_df[col] = np.nan
+
 # Edit tagged date column with pandas
 dates = []
 before_date_strs = ['before ']
 after_date_strs = ['.']
 mistakes = [('-00', '-01')]
-for date in osm_df['tag_start_date']:
+for date in output_df['tag_start_date']:
     if pd.notna(date):
         og_date = date
         date = str(date)
@@ -27,8 +52,8 @@
         dates.append(parse(date, ignoretz=True, default=parse('2020-01-01')))
     else:
         dates.append(None)
-osm_df['tag_start_date'] = dates
+output_df['tag_start_date'] = dates
 
-osm_csv_str = osm_df.to_csv(index=False)
+osm_csv_str = output_df.to_csv(index=False)
 
 sys.stdout.write(osm_csv_str)
diff --git a/data/processed/pre-process-repd.py b/data/processed/pre-process-repd.py
@@ -18,30 +18,87 @@ def clean_repd_csv(csv_str):
 sys.stdin.reconfigure(encoding='iso-8859-1')
 repd_df = pd.read_csv(sys.stdin, skiprows=1)
 
+# Check the file has the columns we expect and put them in the expected order
+# If a column doesn't exist, add it as an empty (all-NaN) column
+output_df = pd.DataFrame()
+required_columns = ['Old Ref ID',
+                    'Ref ID',
+                    'Record Last Updated (dd/mm/yyyy)',
+                    'Operator (or Applicant)',
+                    'Site Name',
+                    'Technology Type',
+                    'Storage Type',
+                    'Storage Co-location REPD Ref ID',
+                    'Installed Capacity (MWelec)',
+                    'CHP Enabled',
+                    'RO Banding (ROC/MWh)',
+                    'FiT Tariff (p/kWh)',
+                    'CfD Capacity (MW)',
+                    'Turbine Capacity (MW)',
+                    'No. of Turbines',
+                    'Height of Turbines (m)',
+                    'Mounting Type for Solar',
+                    'Development Status',
+                    'Development Status (short)',
+                    'Address',
+                    'County',
+                    'Region',
+                    'Country',
+                    'Post Code',
+                    'X-coordinate',
+                    'Y-coordinate',
+                    'Planning Authority',
+                    'Planning Application Reference',
+                    'Appeal Reference',
+                    'Secretary of State Reference',
+                    'Type of Secretary of State Intervention',
+                    'Judicial Review',
+                    'Offshore Wind Round',
+                    'Planning Application Submitted',
+                    'Planning Application Withdrawn',
+                    'Planning Permission Refused',
+                    'Appeal Lodged',
+                    'Appeal Withdrawn',
+                    'Appeal Refused',
+                    'Appeal Granted',
+                    'Planning Permission Granted',
+                    'Secretary of State - Intervened',
+                    'Secretary of State - Refusal',
+                    'Secretary of State - Granted',
+                    'Planning Permission Expired',
+                    'Under Construction',
+                    'Operational']
+
+for col in required_columns:
+    try:
+        output_df[col] = repd_df[col]
+    except KeyError:
+        output_df[col] = np.nan
+
 # Remove thousand-separator commas from number fields
-repd_df['Storage Co-location REPD Ref ID'] = repd_df['Storage Co-location REPD Ref ID'].map(lambda x: float(str(x).replace(',','')))
-repd_df['X-coordinate'] = repd_df['X-coordinate'].map(lambda x: float(str(x).replace(',','')))
-repd_df['Y-coordinate'] = repd_df['Y-coordinate'].map(lambda x: float(str(x).replace(',','')))
+output_df['Storage Co-location REPD Ref ID'] = output_df['Storage Co-location REPD Ref ID'].map(lambda x: float(str(x).replace(',','')))
+output_df['X-coordinate'] = output_df['X-coordinate'].map(lambda x: float(str(x).replace(',','')))
+output_df['Y-coordinate'] = output_df['Y-coordinate'].map(lambda x: float(str(x).replace(',','')))
 
 # Remove spaces from postcodes
-repd_df['Post Code'] = repd_df['Post Code'].map(lambda x: str(x).replace(' ',''))
+output_df['Post Code'] = output_df['Post Code'].map(lambda x: str(x).replace(' ',''))
 
 # Ensure the tariff is numeric
-repd_df['FiT Tariff (p/kWh)'] = repd_df['FiT Tariff (p/kWh)'].map(lambda x: float(x))
+output_df['FiT Tariff (p/kWh)'] = output_df['FiT Tariff (p/kWh)'].map(lambda x: float(x))
 
 # Remove line breaks from within certain fields
-repd_df['Address'] = repd_df['Address'].str.replace('\r\n', ', ')
-repd_df['Appeal Reference'] = repd_df['Appeal Reference'].map(lambda x: str(x).replace('\r\n', ''))
+output_df['Address'] = output_df['Address'].str.replace('\r\n', ', ')
+output_df['Appeal Reference'] = output_df['Appeal Reference'].map(lambda x: str(x).replace('\r\n', ''))
 
 # Convert each BNG coordinate to lat and lon and add new columns
-for index, row in repd_df.iterrows():
+for index, row in output_df.iterrows():
     # Ignore any rows that don't have an X and Y coordinate
     if pd.notna(row['X-coordinate']) and pd.notna(row['Y-coordinate']):
         lat, lon = convert(row['X-coordinate'], row['Y-coordinate'])
-        repd_df.set_value(index,'latitude', lat)
-        repd_df.set_value(index,'longitude', lon)
+        output_df.set_value(index,'latitude', lat)
+        output_df.set_value(index,'longitude', lon)
 
-repd_csv_str = repd_df.to_csv(index=False)
+repd_csv_str = output_df.to_csv(index=False)
 
 # Make generic edits and write out
 sys.stdout.write(clean_repd_csv(repd_csv_str))