Skip to content

Commit

Permalink
update to new parliamentary constituencies
Browse files Browse the repository at this point in the history
  • Loading branch information
drkane committed Jun 5, 2024
1 parent 4add5e4 commit 3168066
Show file tree
Hide file tree
Showing 10 changed files with 198 additions and 40 deletions.
10 changes: 4 additions & 6 deletions .github/workflows/pythonapp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Parse Python version
run: sed s/python-// runtime.txt | head > .python-version
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version-file: ".python-version"
- name: Install dependencies
Expand All @@ -21,10 +21,8 @@ jobs:
pip install -r requirements.txt
- name: ruff
run: |
ruff .
- name: black
run: |
black . --check
ruff check .
ruff format . --check
- name: Test with pytest
run: |
pip install pytest
Expand Down
3 changes: 0 additions & 3 deletions .isort.cfg

This file was deleted.

3 changes: 2 additions & 1 deletion findthatpostcode/commands/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from flask.cli import AppGroup

from . import boundaries, codes, placenames, postcodes, stats
from . import boundaries, codes, new_pcon, placenames, postcodes, stats


def init_app(app):
Expand All @@ -13,6 +13,7 @@ def init_app(app):
import_cli.add_command(stats.import_imd2019)
import_cli.add_command(stats.import_imd2015)
import_cli.add_command(placenames.import_placenames)
import_cli.add_command(new_pcon.import_new_pcon)

app.cli.add_command(import_cli)

Expand Down
22 changes: 22 additions & 0 deletions findthatpostcode/commands/boundaries.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Import commands for the register of geographic codes and code history database
"""

import csv
import glob
import io
Expand All @@ -16,6 +17,10 @@
from elasticsearch.helpers import scan
from flask import current_app
from flask.cli import with_appcontext
from pyproj import Transformer
from shapely import to_geojson
from shapely.geometry import shape
from shapely.ops import transform

from .. import db
from .codes import AREA_INDEX
Expand Down Expand Up @@ -67,6 +72,11 @@ def import_boundary(client, url, examine=False, code_field=None):
boundaries = json.load(f)
errors = []

# Check the CRS
transformer = None
if boundaries.get("crs", {}).get("properties", {}).get("name") == "EPSG:27700":
transformer = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)

# find the code field for a boundary
if len(boundaries.get("features", [])) == 0:
errors.append("[ERROR][%s] Features not found in file" % (url,))
Expand Down Expand Up @@ -132,6 +142,18 @@ def import_boundary(client, url, examine=False, code_field=None):
):
area_code = i["properties"][code_field]
prefix = area_code[0:3]
if transformer:
# create a shapely object from the geometry
geometry = shape(i["geometry"])
i["geometry"] = json.loads(
to_geojson(
transform(
transformer.transform,
geometry,
)
)
)

client.upload_fileobj(
io.BytesIO(json.dumps(i).encode("utf-8")),
current_app.config["S3_BUCKET"],
Expand Down
151 changes: 151 additions & 0 deletions findthatpostcode/commands/new_pcon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
import codecs
import csv
import io
import zipfile
from collections import defaultdict

import click
import requests
import requests_cache
import tqdm
from elasticsearch.helpers import bulk
from flask import current_app
from flask.cli import with_appcontext

from findthatpostcode.commands.codes import AREA_INDEX
from findthatpostcode.commands.postcodes import PC_INDEX

from .. import db

PCON_NAMES_AND_CODES_URL = "https://opendata.arcgis.com/api/v3/datasets/9a876e4777bc47e392e670a7b8bc3f5c_0/downloads/data?format=csv&spatialRefId=4326&where=1%3D1"
PCON_2010_LOOKUP_URL = "https://opendata.arcgis.com/api/v3/datasets/c776b66c0e534b849cae5a5121b7a16a_0/downloads/data?format=csv&spatialRefId=4326&where=1%3D1"
PCON_POSTCODE_URL = "https://www.arcgis.com/sharing/rest/content/items/f60c78533aa7462cb934bb4a81afc1e0/data"
PCON_BOUNDARIES_URL = "https://stg-arcgisazurecdataprod1.az.arcgis.com/exportfiles-1559-23529/Westminster_Parliamentary_Constituencies_July_2024_Boundaries_UK_BSC_7275719608364942765.geojson"


@click.command("new_pcon")
@click.option("--area-index", default=AREA_INDEX)
@click.option("--postcode-index", default=PC_INDEX)
@with_appcontext
def import_new_pcon(area_index=AREA_INDEX, postcode_index=PC_INDEX):
    """Import the new (July 2024) Westminster parliamentary constituencies.

    Steps:
      1. Download the PCON24 names-and-codes CSV and build an area record
         for each new constituency.
      2. Download the 2010 -> 2024 lookup CSV, recording predecessors on the
         new areas and successors on the old ones.
      3. Bulk-upsert the new areas and mark the 2010 areas inactive in the
         ``area_index`` Elasticsearch index.
      4. Download the postcode-to-PCON24 lookup zip and bulk-update the
         ``pcon`` field of every postcode in ``postcode_index``.

    Parameters (exposed as CLI options):
        area_index: Elasticsearch index holding area documents.
        postcode_index: Elasticsearch index holding postcode documents.
    """
    # Cache HTTP responses in debug mode so repeated runs don't re-download
    # the (large) source files.
    if current_app.config["DEBUG"]:
        requests_cache.install_cache()

    es = db.get_db()

    # 1. Names and codes for the new constituencies.
    r = requests.get(PCON_NAMES_AND_CODES_URL, stream=True)
    # utf-8-sig strips the BOM that the ArcGIS CSV export includes.
    reader = csv.DictReader(codecs.iterdecode(r.iter_lines(), "utf-8-sig"))
    areas = {}
    for row in reader:
        names = [row["PCON24NM"]]
        if row["PCON24NMW"]:
            # Welsh-language name, where one exists.
            names.append(row["PCON24NMW"])
        areas[row["PCON24CD"]] = {
            "code": row["PCON24CD"],
            "name": row["PCON24NM"],
            "name_welsh": row["PCON24NMW"] if row["PCON24NMW"] else None,
            # NOTE(review): SI reference and commencement date hard-coded;
            # presumably The Parliamentary Constituencies Order 2023 taking
            # effect 2024-07-05 — confirm against legislation.gov.uk.
            "statutory_instrument_id": "1230/2023",
            "statutory_instrument_title": "The Parliamentary Constituencies Order 2023",
            "date_start": "2024-07-05T00:00:00",
            "date_end": None,
            "parent": None,
            # First three characters of a GSS code identify the entity type.
            "entity": row["PCON24CD"][0:3],
            "owner": "LGBC",
            "active": True,
            "areaehect": None,
            "areachect": None,
            "areaihect": None,
            "arealhect": None,
            "sort_order": row["PCON24CD"],
            "predecessor": [],
            "successor": [],
            "equivalents": {},
            "type": "pcon",
            "alternative_names": names,
        }

    # 2. Lookup linking each 2024 constituency to its 2010 predecessors.
    r = requests.get(PCON_2010_LOOKUP_URL, stream=True)
    reader = csv.DictReader(codecs.iterdecode(r.iter_lines(), "utf-8-sig"))
    update_2010 = defaultdict(list)
    for row in reader:
        # Raises KeyError if the lookup references a PCON24 code missing from
        # the names file — failing loudly here is preferable to importing an
        # incomplete predecessor/successor graph.
        areas[row["PCON24CD"]]["predecessor"].append(row["PCON10CD"])
        update_2010[row["PCON10CD"]].append(row["PCON24CD"])

    # 3. Create the new areas and close off the old ones in one bulk request.
    to_update = [
        {
            "_index": area_index,
            "_type": "_doc",
            "_op_type": "update",
            "_id": area_id,
            "doc_as_upsert": True,
            "doc": area,
        }
        for area_id, area in areas.items()
    ] + [
        {
            "_index": area_index,
            "_type": "_doc",
            "_op_type": "update",
            "_id": area_id,
            "doc_as_upsert": True,
            "doc": {
                "active": False,
                "successor": successors,
                "date_end": "2024-07-05T00:00:00",
            },
        }
        for area_id, successors in update_2010.items()
    ]
    print(
        "[new parliamentary constituencies] Processed %s new parliamentary constituencies"
        % len(to_update)
    )
    print(
        "[elasticsearch] %s parliamentary constituencies to create or update"
        % len(to_update)
    )
    # raise_on_error=False (matching the postcode bulk call below) so that
    # individual failures are collected and reported by the print statements
    # that follow, instead of aborting the import with an exception.
    results = bulk(es, to_update, raise_on_error=False)
    print(
        "[elasticsearch] saved %s new parliamentary constituencies to %s index"
        % (results[0], area_index)
    )
    print("[elasticsearch] %s errors reported" % len(results[1]))

    # 4. Postcode-to-constituency lookup (a zip of CSVs).
    r = requests.get(PCON_POSTCODE_URL)
    z = zipfile.ZipFile(io.BytesIO(r.content))

    for f in z.namelist():
        if not f.endswith(".csv"):
            continue
        with z.open(f, "r") as infile:
            # Source files are Windows-1252 encoded, not UTF-8.
            reader = csv.DictReader(io.TextIOWrapper(infile, encoding="Windows-1252"))
            postcode_updates = []
            for row in tqdm.tqdm(reader):
                postcode = row["pcd"]
                # convert to "pcds" format: variable-length outward code,
                # single space, 3-character inward code.
                postcode = "%s %s" % (postcode[:-3].strip(), postcode[-3:])
                record = {
                    "_index": postcode_index,
                    "_type": "_doc",
                    "_op_type": "update",
                    "_id": postcode,
                    "doc": {
                        "pcon": row["pconcd"],
                    },
                }
                postcode_updates.append(record)
            print(
                "[new parliamentary constituencies] Processed %s postcodes to update"
                % len(postcode_updates)
            )
            print("[elasticsearch] %s postcodes to update" % len(postcode_updates))
            # Postcodes present in the lookup but not in the index produce
            # errors; tolerate and count them rather than aborting.
            results = bulk(es, postcode_updates, raise_on_error=False)
            print(
                "[elasticsearch] updated %s postcodes in %s index"
                % (results[0], postcode_index)
            )
            print("[elasticsearch] %s errors reported" % len(results[1]))
1 change: 1 addition & 0 deletions findthatpostcode/commands/placenames.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Import commands for placenames
"""

import csv
import io
import zipfile
Expand Down
5 changes: 3 additions & 2 deletions findthatpostcode/commands/postcodes.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Import commands for the register of geographic codes and code history database
"""

import csv
import datetime
import hashlib
Expand All @@ -19,8 +20,8 @@
PC_INDEX = "geo_postcode"

NSPL_URL = {
2011: "https://www.arcgis.com/sharing/rest/content/items/782899bd01934a8099ae8516cc021f68/data",
2021: "https://www.arcgis.com/sharing/rest/content/items/b86748732a054592bcf0218e86a43870/data",
2011: "https://www.arcgis.com/sharing/rest/content/items/521edce4159a451a932539b7fc786322/data",
2021: "https://www.arcgis.com/sharing/rest/content/items/f7464f3658ba439ba577651b32014cfe/data",
}
DEFAULT_YEAR = 2021

Expand Down
3 changes: 0 additions & 3 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,9 +121,6 @@ flask import boundaries "https://opendata.arcgis.com/datasets/094f326b0b1247e3bc

You can add more than one URL to each import script.

These imports will also take a while, and add significantly to the size of the
elasticsearch index. It may increase in size to over 5GB.

### 7. Import placenames (optional)

A further related dataset is placenames. The [ONS has a list of these](http://geoportal.statistics.gov.uk/datasets/a6c138d17ac54532b0ca8ee693922f10_0)
Expand Down
5 changes: 3 additions & 2 deletions requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ requests-cache
tqdm
python-dotenv
dictlib
black
ruff
ElasticMock
pytest
Expand All @@ -16,4 +15,6 @@ blinker
sqlite-utils
ua-parser
boto3
sentry-sdk[flask]
sentry-sdk[flask]
pyproj
shapely
Loading

0 comments on commit 3168066

Please sign in to comment.