WordPress · rwidom · Aug 19, 2022 · May 10, 2022 · May 16, 2022 · May 22, 2022
@@ -28,7 +28,7 @@ services:
       MINIO_ROOT_PASSWORD: ${AWS_SECRET_KEY}
       # Comma separated list of buckets to create on startup
       BUCKETS_TO_CREATE: ${OPENVERSE_BUCKET},openverse-airflow-logs,commonsmapper-v2,commonsmapper
-    # Create the buckets on every container startup
+    # Create empty buckets on every container startup
     # Note: $0 is included in the exec because "/bin/bash -c" swallows the first
     # argument, so it must be re-added at the beginning of the exec call
     entrypoint: >-
@@ -46,6 +46,36 @@ services:
       timeout: 20s
       retries: 3
 
+  # One-shot helper container: copies each fixture directory under ./tests/s3-data
+  # into the minio `s3` service as a bucket, then exits.
+  load_to_s3:
+    image: minio/mc:latest
+    env_file:
+      - .env
+    depends_on:
+      - s3
+    volumes:
+      # Buckets for testing provider data imported from s3 are subdirectories under
+      # /tests/s3-data/
+      - ./tests/s3-data:/data:rw
+    # Loop through subdirectories mounted to the volume and load them to s3/minio.
+    # This takes care of filesystem delays on some local dev environments that may make
+    # minio miss files included directly in the minio volume.
+    # More info here: https://stackoverflow.com/questions/72867045
+    # This does *not* allow for testing permissions issues that may come up in real AWS.
+    # And, if you remove files from /tests/s3-data, you will need to use `just down -v`
+    # and `just up` or `just recreate` to see the minio bucket without those files.
+    #
+    # Notes on the script below:
+    # - `$$b` uses a doubled `$` so Compose hands a literal `$b` to the shell, while
+    #   ${AWS_ACCESS_KEY} and ${AWS_SECRET_KEY} are substituted by Compose itself.
+    # - `mc mb --ignore-existing` keeps the bucket creation from failing on re-runs.
+    # - The final `exit 0` makes the script report success after the loop has run,
+    #   whatever the exit status of the individual `mc` commands was.
+    entrypoint: >
+      /bin/sh -c "
+      /usr/bin/mc config host add s3 http://s3:5000 ${AWS_ACCESS_KEY} ${AWS_SECRET_KEY};
+      cd /data;
+      for b in */ ; do
+        echo \"Loading bucket $$b\"
+        /usr/bin/mc mb --ignore-existing s3/$$b
+        /usr/bin/mc cp --r $$b s3/$$b
+        /usr/bin/mc ls s3/$$b;
+      done ;
+      exit 0;
+      "
+
   # Dev changes for the webserver container
   webserver:
     depends_on:

@@ -13,22 +13,27 @@ LANGUAGE plpython3u
 AS $$
     import os
     import boto3
+    from datetime import datetime as dt
     s3_obj = boto3.resource(
         's3',
         aws_access_key_id=os.getenv('AWS_ACCESS_KEY', 'test_key'),
         aws_secret_access_key=os.getenv('AWS_SECRET_KEY', 'test_secret'),
         region_name=region,
         endpoint_url=os.getenv('S3_LOCAL_ENDPOINT', 'http://s3:5000')
     ).Object(bucket, file_path)
-    temp_location = '/tmp/postgres_loading.tsv'
+    temp_location = f"/tmp/pg_load_{dt.now().timestamp()}_{file_path.split('/')[-1]}"
     s3_obj.download_file(temp_location)
+    if file_path[-3:]=='.gz':
+        copy_from = f"PROGRAM 'gzip -dc {temp_location}'"
+    else:
+        copy_from = plpy.quote_literal(temp_location)
     with open(temp_location) as f:
         columns = '({})'.format(column_list) if column_list else ''
         res = plpy.execute(
             'COPY {} {} FROM {} {};'.format(
                 table_name,
                 columns,
-                plpy.quote_literal(temp_location),
+                copy_from,
                 options
             )
         )

@@ -52,8 +52,12 @@ deploy:
 lint:
     pre-commit run --all-files
 
+# Shared prerequisite for the recipes that run one-off containers: brings up the
+# postgres and s3 services plus the load_to_s3 loader through the `up` recipe.
+# Load any dependencies necessary for actions on the stack without running the webserver
+_deps:
+    @just up "postgres s3 load_to_s3"
+
 # Mount the tests directory and run a particular command
-@_mount-tests command: (up "postgres s3")
+@_mount-tests command: _deps
     # The test directory is mounted into the container only during testing
     docker-compose {{ DOCKER_FILES }} run \
         -v {{ justfile_directory() }}/tests:/usr/local/airflow/tests/ \
@@ -75,7 +79,7 @@ shell: up
     docker-compose {{ DOCKER_FILES }} exec {{ SERVICE }} /bin/bash
 
 # Launch an IPython REPL using the webserver image
-ipython: (up "postgres s3")
+ipython: _deps
     docker-compose {{ DOCKER_FILES }} run \
         --rm \
         -w /usr/local/airflow/openverse_catalog/dags \
@@ -84,7 +88,7 @@ ipython: (up "postgres s3")
         /usr/local/airflow/.local/bin/ipython
 
 # Run a given command using the webserver image
-run *args: (up "postgres s3")
+run *args: _deps
     docker-compose {{ DOCKER_FILES }} run --rm {{ SERVICE }} {{ args }}
 
 # Launch a pgcli shell on the postgres container (defaults to openledger) use "airflow" for airflow metastore

@@ -31,6 +31,7 @@
 STOCKSNAP_DEFAULT_PROVIDER = "stocksnap"
 WORDPRESS_DEFAULT_PROVIDER = "wordpress"
 FREESOUND_DEFAULT_PROVIDER = "freesound"
+INATURALIST_DEFAULT_PROVIDER = "inaturalist"
 
 # Finnish parameters
 FINNISH_SUB_PROVIDERS = {

@@ -0,0 +1,172 @@
+"""
+Provider:   iNaturalist
+
+Output:     TSV file containing the media metadata.
+
+Notes:      [The iNaturalist API is not intended for data scraping.]
+            (https://api.inaturalist.org/v1/docs/)
+
+            [But there is a full dump intended for sharing on S3.]
+            (https://github.com/inaturalist/inaturalist-open-data/tree/documentation/Metadata)
+
+            Because these are very large normalized tables, as opposed to more document
+            oriented API responses, we found that bringing the data into postgres first
+            was the most effective approach. [More detail in slack here.]
+            (https://wordpress.slack.com/archives/C02012JB00N/p1653145643080479?thread_ts=1653082292.714469&cid=C02012JB00N)
+
+            We use the table structure defined [here,]
+            (https://github.com/inaturalist/inaturalist-open-data/blob/main/Metadata/structure.sql)
+            except for adding ancestry tags to the taxa table.
+"""
+
+import os
+from pathlib import Path
+from textwrap import dedent
+from typing import Dict
+
+import pendulum
+from airflow.exceptions import AirflowSkipException
+from airflow.operators.python import PythonOperator
+from airflow.providers.amazon.aws.hooks.s3 import S3Hook
+from airflow.providers.postgres.hooks.postgres import PostgresHook
+from airflow.providers.postgres.operators.postgres import PostgresOperator
+from airflow.utils.task_group import TaskGroup
+from common.constants import POSTGRES_CONN_ID
+from common.licenses import LicenseInfo, get_license_info
+from common.loader import provider_details as prov
+from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester
+
+
+AWS_CONN_ID = os.getenv("AWS_CONN_ID", "test_conn_id")
+PROVIDER = prov.INATURALIST_DEFAULT_PROVIDER
+SCRIPT_DIR = Path(__file__).parents[1] / "provider_csv_load_scripts/inaturalist"
+SOURCE_FILE_NAMES = ["photos", "observations", "taxa", "observers"]
+
+
+class INaturalistDataIngester(ProviderDataIngester):
+
+    providers = {"image": prov.INATURALIST_DEFAULT_PROVIDER}
+
+    def __init__(self, *kwargs):
+        super(INaturalistDataIngester, self).__init__()
+        self.pg = PostgresHook(POSTGRES_CONN_ID)
+
+        # adjustments to buffer limits. TO DO: try to integrate this with the dev
+        # environment logic in the base class, rather than just over-writing it.
+        self.media_stores["image"].buffer_length = 10_000
+        self.batch_limit = 10_000
+        self.sql_template = dedent(
+            (SCRIPT_DIR / "05_export_to_json.template.sql").read_text()
+        )
+
+    def get_next_query_params(self, prev_query_params=None, **kwargs):
+        if prev_query_params is None:
+            return {"offset_num": 0}
+        else:
+            next_offset = prev_query_params["offset_num"] + self.batch_limit
+            return {"offset_num": next_offset}
+
+    def get_response_json(self, query_params: Dict):
+        """
+        Call the SQL to pull json from Postgres, where the raw data has been loaded.
+        """
+        sql_string = self.sql_template.format(
+            batch_limit=self.batch_limit, offset_num=query_params["offset_num"]
+        )
+        sql_result = self.pg.get_records(sql_string)
+        # Postgres returns a a list of tuples, even if it's one tuple with one item.
+        return sql_result[0][0]
+
+    def get_batch_data(self, response_json):
+        if response_json:
+            return response_json
+        return None
+
+    def get_record_data(self, data):
+        if data.get("foreign_identifier") is None:
+            return None
+        license_url = data.get("license_url")
+        license_info = get_license_info(license_url=license_url)
+        if license_info == LicenseInfo(None, None, None, None):
+            return None
+        record_data = {k: data[k] for k in data.keys() if k != "license_url"}
+        record_data["license_info"] = license_info
+        return record_data
+
+    def get_media_type(self, record):
+        """Return the media type for `record`; always "image" for this provider."""
+        # This provider only supports Images via S3, though they have some audio files
+        # on site and in the API.
+        return "image"
+
+    def endpoint(self):
+        """Not applicable: data comes from S3 files, so this always raises."""
+        raise NotImplementedError("Normalized TSV files from AWS S3 means no endpoint.")
+
+    @staticmethod
+    def compare_update_dates(
+        last_success: pendulum.DateTime | None, s3_keys: list, aws_conn_id=AWS_CONN_ID
+    ):
+        """
+        Decide whether the source files have changed since the last successful run.
+
+        Returns normally (so the DAG continues) when there was no previous success
+        or when at least one of `s3_keys` in the `inaturalist-open-data` bucket has
+        a `LastModified` later than `last_success`.
+
+        Raises AirflowSkipException when none of the files is newer.
+
+        NOTE(review): the calling operator supplies `last_success` through the
+        template "{{ prev_start_date_success }}". The `is None` check and the `<`
+        comparison below need a real datetime/None, not its string rendering --
+        confirm the DAG renders templates as native objects.
+        """
+        # if it was never run, assume the data is new
+        if last_success is None:
+            return
+        s3 = S3Hook(aws_conn_id=aws_conn_id)
+        s3_client = s3.get_client_type()
+        for key in s3_keys:
+            # this will error out if the files don't exist, and bubble up as an
+            # informative failure
+            last_modified = s3_client.head_object(
+                Bucket="inaturalist-open-data", Key=key
+            )["LastModified"]
+            # if any file has been updated, let's pull them all
+            if last_success < last_modified:
+                return
+        # If no files have been updated, skip the DAG
+        raise AirflowSkipException("Nothing new to ingest")
+
+    @staticmethod
+    def create_preingestion_tasks():
+        """
+        Build the task group that prepares Postgres before ingestion.
+
+        The group runs, in order: a check for updated source files, creation of
+        the `inaturalist` schema, and a nested group with one load task per entry
+        in SOURCE_FILE_NAMES. Returns the outer TaskGroup.
+        """
+
+        with TaskGroup(group_id="preingestion_tasks") as preingestion_tasks:
+
+            check_for_file_updates = PythonOperator(
+                task_id="check_for_file_updates",
+                python_callable=INaturalistDataIngester.compare_update_dates,
+                op_kwargs={
+                    # With the templated values ({{ x }}) airflow will fill it in
+                    "last_success": "{{ prev_start_date_success }}",
+                    "s3_keys": [
+                        f"{file_name}.csv.gz" for file_name in SOURCE_FILE_NAMES
+                    ],
+                },
+            )
+
+            create_inaturalist_schema = PostgresOperator(
+                task_id="create_inaturalist_schema",
+                postgres_conn_id=POSTGRES_CONN_ID,
+                sql=dedent((SCRIPT_DIR / "00_create_schema.sql").read_text()),
+            )
+
+            with TaskGroup(group_id="load_source_files") as load_source_files:
+                # Script names are the 1-based, zero-padded position in
+                # SOURCE_FILE_NAMES plus the source name, e.g. "01_photos.sql".
+                # No ordering is declared between the load tasks themselves.
+                for idx, source_name in enumerate(SOURCE_FILE_NAMES):
+                    PostgresOperator(
+                        task_id=f"load_{source_name}",
+                        postgres_conn_id=POSTGRES_CONN_ID,
+                        sql=dedent(
+                            (
+                                SCRIPT_DIR
+                                / (str(idx + 1).zfill(2) + f"_{source_name}.sql")
+                            ).read_text()
+                        ),
+                    )
+
+            (check_for_file_updates >> create_inaturalist_schema >> load_source_files)
+
+        return preingestion_tasks
+
+    @staticmethod
+    def create_postingestion_tasks():
+        drop_inaturalist_schema = PostgresOperator(
+            task_id="drop_inaturalist_schema",
+            postgres_conn_id=POSTGRES_CONN_ID,
+            sql="DROP SCHEMA IF EXISTS inaturalist CASCADE",
+        )
+        return drop_inaturalist_schema
diff --git a/openverse_catalog/dags/providers/provider_csv_load_scripts/inaturalist/00_create_schema.sql b/openverse_catalog/dags/providers/provider_csv_load_scripts/inaturalist/00_create_schema.sql
@@ -0,0 +1,4 @@
+-- Create the schema that holds the raw iNaturalist tables, unless it already exists.
+CREATE SCHEMA IF NOT EXISTS inaturalist;
+COMMIT;
+-- Read the schema back from the catalog; one row is returned when it exists.
+SELECT schema_name
+FROM information_schema.schemata WHERE schema_name = 'inaturalist';
diff --git a/openverse_catalog/dags/providers/provider_csv_load_scripts/inaturalist/01_photos.sql b/openverse_catalog/dags/providers/provider_csv_load_scripts/inaturalist/01_photos.sql
@@ -0,0 +1,42 @@
+/*
+-------------------------------------------------------------------------------
+PHOTOS
+-------------------------------------------------------------------------------
+--  Despite the 2022-05-30 data set having complete observer IDs, we do not use
+    an FK constraint on observer_id, in order to save load time.
+--  photo_id is not unique. There are ~130,000 photo_ids that appear more than
+    once, maybe because an earlier version of the photo was deleted (unclear),
+    but for now we assume that will be taken care of later in the processing.
+
+Taking DDL from
+https://github.com/inaturalist/inaturalist-open-data/blob/main/Metadata/structure.sql
+*/
+
+-- Start clean: remove any previous copy of the table and whatever depends on it.
+DROP TABLE IF EXISTS inaturalist.photos CASCADE;
+COMMIT;
+
+CREATE TABLE inaturalist.photos (
+    photo_uuid uuid NOT NULL,
+    photo_id integer NOT NULL,
+    observation_uuid uuid NOT NULL,
+    observer_id integer,
+    extension character varying(5),
+    license character varying(255),
+    width smallint,
+    height smallint,
+    position smallint
+);
+COMMIT;
+
+-- Bulk-load the gzipped, tab-delimited file with a header row from S3.
+-- Arguments, in order: target table, column list (empty here), COPY options,
+-- bucket, file path, region.
+-- QUOTE is set to the backspace character; presumably so that ordinary quote
+-- marks in the data are read as literal text -- TODO confirm.
+SELECT aws_s3.table_import_from_s3('inaturalist.photos',
+    '',
+    '(FORMAT ''csv'', DELIMITER E''\t'', HEADER, QUOTE E''\b'')',
+    'inaturalist-open-data',
+    'photos.csv.gz',
+    'us-east-1');
+
+-- Not unique, because photo id isn't unique, and it will take too long to check.
+-- btree because that is the only one that will support limit/offset without sorting.
+-- more here: https://www.postgresql.org/docs/current/indexes-ordering.html
+CREATE INDEX ON INATURALIST.PHOTOS USING btree (PHOTO_ID);
+
+-- Report how many rows ended up in the table.
+SELECT count(*) FROM inaturalist.photos;
diff --git a/openverse_catalog/dags/providers/provider_csv_load_scripts/inaturalist/02_observations.sql b/openverse_catalog/dags/providers/provider_csv_load_scripts/inaturalist/02_observations.sql
@@ -0,0 +1,38 @@
+/*
+-------------------------------------------------------------------------------
+OBSERVATIONS
+-------------------------------------------------------------------------------
+--  ~400,000 observations do not have a taxon_id that is in the taxa table.
+--  Their photos are not included in the final transformed view on the
+    assumption that photos are not useful to us without a title or tags
+
+Taking DDL from
+https://github.com/inaturalist/inaturalist-open-data/blob/main/Metadata/structure.sql
+*/
+
+-- Start clean: remove any previous copy of the table.
+DROP TABLE IF EXISTS inaturalist.observations;
+COMMIT;
+
+CREATE TABLE inaturalist.observations (
+    observation_uuid uuid,
+    observer_id integer,
+    latitude numeric(15, 10),
+    longitude numeric(15, 10),
+    positional_accuracy integer,
+    taxon_id integer,
+    quality_grade character varying(255),
+    observed_on date
+);
+COMMIT;
+
+-- Bulk-load the gzipped, tab-delimited file with a header row from S3.
+-- Arguments, in order: target table, column list (empty here), COPY options,
+-- bucket, file path, region.
+-- QUOTE is set to the backspace character; presumably so that ordinary quote
+-- marks in the data are read as literal text -- TODO confirm.
+SELECT aws_s3.table_import_from_s3('inaturalist.observations',
+    '',
+    '(FORMAT ''csv'', DELIMITER E''\t'', HEADER, QUOTE E''\b'')',
+    'inaturalist-open-data',
+    'observations.csv.gz',
+    'us-east-1');
+
+-- The primary key is added only after the load has finished.
+ALTER TABLE inaturalist.observations ADD PRIMARY KEY (observation_uuid);
+COMMIT;
+
+-- Report how many rows ended up in the table.
+SELECT count(*) FROM inaturalist.observations;