Skip to content

Commit

Permalink
check columns for osm and repd
Browse files Browse the repository at this point in the history
  • Loading branch information
Ed Chalstrey authored and Ed Chalstrey committed Apr 16, 2020
1 parent 26b770a commit 1752795
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 14 deletions.
31 changes: 28 additions & 3 deletions data/processed/pre-process-osm.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,37 @@
sys.stdin.reconfigure(encoding='iso-8859-1')
osm_df = pd.read_csv(sys.stdin)

# The columns we expect in the output, in the order we expect them.
required_columns = [
    'objtype',
    'id',
    'user',
    'timestamp',
    'lat',
    'lon',
    'calc_area',
    'calc_capacity',
    'generator:solar:modules',
    'location',
    'orientation',
    'plantref',
    'tag_power',
    'tag_repd:id',
    'tag_start_date',
]

# Keep only the expected columns, in the expected order. reindex() creates
# any column that is missing from the input and fills it with NaN, so the
# output always has the same schema. Doing this in one call replaces the
# per-column try/except loop and needs nothing beyond pandas itself.
output_df = osm_df.reindex(columns=required_columns)

# Edit tagged date column with pandas
dates = []
before_date_strs = ['before ']
after_date_strs = ['.']
mistakes = [('-00', '-01')]
for date in osm_df['tag_start_date']:
for date in output_df['tag_start_date']:
if pd.notna(date):
og_date = date
date = str(date)
Expand All @@ -27,8 +52,8 @@
dates.append(parse(date, ignoretz=True, default=parse('2020-01-01')))
else:
dates.append(None)
osm_df['tag_start_date'] = dates
output_df['tag_start_date'] = dates

osm_csv_str = osm_df.to_csv(index=False)
osm_csv_str = output_df.to_csv(index=False)

sys.stdout.write(osm_csv_str)
79 changes: 68 additions & 11 deletions data/processed/pre-process-repd.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,30 +18,87 @@ def clean_repd_csv(csv_str):
sys.stdin.reconfigure(encoding='iso-8859-1')
repd_df = pd.read_csv(sys.stdin, skiprows=1)

# The columns we expect in the output, in the order we expect them.
required_columns = [
    'Old Ref ID',
    'Ref ID',
    'Record Last Updated (dd/mm/yyyy)',
    'Operator (or Applicant)',
    'Site Name',
    'Technology Type',
    'Storage Type',
    'Storage Co-location REPD Ref ID',
    'Installed Capacity (MWelec)',
    'CHP Enabled',
    'RO Banding (ROC/MWh)',
    'FiT Tariff (p/kWh)',
    'CfD Capacity (MW)',
    'Turbine Capacity (MW)',
    'No. of Turbines',
    'Height of Turbines (m)',
    'Mounting Type for Solar',
    'Development Status',
    'Development Status (short)',
    'Address',
    'County',
    'Region',
    'Country',
    'Post Code',
    'X-coordinate',
    'Y-coordinate',
    'Planning Authority',
    'Planning Application Reference',
    'Appeal Reference',
    'Secretary of State Reference',
    'Type of Secretary of State Intervention',
    'Judicial Review',
    'Offshore Wind Round',
    'Planning Application Submitted',
    'Planning Application Withdrawn',
    'Planning Permission Refused',
    'Appeal Lodged',
    'Appeal Withdrawn',
    'Appeal Refused',
    'Appeal Granted',
    'Planning Permission Granted',
    'Secretary of State - Intervened',
    'Secretary of State - Refusal',
    'Secretary of State - Granted',
    'Planning Permission Expired',
    'Under Construction',
    'Operational',
]

# Keep only the expected columns, in the expected order. reindex() creates
# any column that is missing from the input and fills it with NaN, so the
# output always has the same schema. Doing this in one call replaces the
# per-column try/except loop and needs nothing beyond pandas itself.
output_df = repd_df.reindex(columns=required_columns)

# Remove thousand-separator commas from number fields
for col in ('Storage Co-location REPD Ref ID', 'X-coordinate', 'Y-coordinate'):
    output_df[col] = output_df[col].map(lambda x: float(str(x).replace(',', '')))

# Remove spaces from postcodes
# NOTE(review): str() turns a missing value into the literal string 'nan';
# kept as-is because later processing may rely on it - confirm.
output_df['Post Code'] = output_df['Post Code'].map(lambda x: str(x).replace(' ', ''))

# Ensure the tariff is numeric
output_df['FiT Tariff (p/kWh)'] = output_df['FiT Tariff (p/kWh)'].map(float)

# Remove line breaks from within certain fields
# Only string values are edited: if 'Address' was absent from the input the
# column is all-NaN floats, and the .str accessor would raise AttributeError.
output_df['Address'] = output_df['Address'].map(
    lambda x: x.replace('\r\n', ', ') if isinstance(x, str) else x)
# NOTE(review): as with 'Post Code', missing values become the string 'nan'.
output_df['Appeal Reference'] = output_df['Appeal Reference'].map(lambda x: str(x).replace('\r\n', ''))

# Convert each BNG coordinate to lat and lon and add new columns
# Create the columns up front so they exist even when no row has coordinates.
output_df['latitude'] = float('nan')
output_df['longitude'] = float('nan')
for index, row in output_df.iterrows():
    # Ignore any rows that don't have an X and Y coordinate
    if pd.notna(row['X-coordinate']) and pd.notna(row['Y-coordinate']):
        lat, lon = convert(row['X-coordinate'], row['Y-coordinate'])
        # DataFrame.set_value() was removed in pandas 1.0; .at is the
        # supported scalar setter.
        output_df.at[index, 'latitude'] = lat
        output_df.at[index, 'longitude'] = lon

repd_csv_str = output_df.to_csv(index=False)

# Make generic edits and write out
sys.stdout.write(clean_repd_csv(repd_csv_str))

0 comments on commit 1752795

Please sign in to comment.