This repository has been archived by the owner on Aug 4, 2023. It is now read-only.

Add iNaturalist.org metadata #549

Merged · 98 commits · Aug 19, 2022
Changes from 17 commits

Commits (98)
0dc32e3
templated changes from existing script/readme
rwidom May 10, 2022
e44b2c8
Merge branch 'main' into issue/439-add-provider-iNaturalist
rwidom May 16, 2022
23419ba
first stab at transformation using pandas
rwidom May 22, 2022
2c98dd1
Merge branch 'main' for thumbnails etc into issue/439-add-provider-iN…
rwidom May 26, 2022
0c2f67c
add capability to load .gz files from s3
rwidom May 28, 2022
7f357c0
load data from S3 and store transformation logic
rwidom May 30, 2022
3cd9db3
Merge branch 'main' into issue/439-add-provider-iNaturalist
rwidom Jun 3, 2022
c7ebeba
Merge branch 'main' into issue/439-add-provider-iNaturalist
rwidom Jun 12, 2022
faf6716
clean up old file structure and files
rwidom Jun 12, 2022
e78f9a3
add test source files
rwidom Jun 13, 2022
1a29f00
cleanup file structure
rwidom Jun 15, 2022
b145238
get sample data into minio container
rwidom Jun 21, 2022
fc2dc54
add sql transformation steps
rwidom Jun 21, 2022
d4b2a56
hide minio testing side effects from git
rwidom Jun 21, 2022
15d2d65
add python wrappers for airflow and testing
rwidom Jun 21, 2022
59ece62
fyi on the source of the sample data
rwidom Jun 21, 2022
0c1f65c
Merge branch 'main' into issue/439-add-provider-iNaturalist
rwidom Jun 21, 2022
0c0fd9e
Minor tweaks to get this working locally
AetherUnbound Jun 22, 2022
067a16c
Update .gitignore
rwidom Jun 23, 2022
1053d3b
Linting
rwidom Jul 3, 2022
b1cc628
Merge branch 'main' into issue/439-add-provider-iNaturalist
rwidom Jul 8, 2022
697b72e
add separate s3 load step to dev docker setup
rwidom Jul 8, 2022
89ce375
airflow loads s3 & calls data ingestor to save tsv
rwidom Jul 9, 2022
ab9f549
make bucket loading piece readable
rwidom Jul 9, 2022
3cab54c
remove unnecessary gitignore change
rwidom Jul 9, 2022
900de4f
put back basic minio volume
rwidom Jul 9, 2022
f72c251
fix count typo in testing comment
rwidom Jul 9, 2022
b54c4c5
linting sql files from sql fluff
rwidom Jul 10, 2022
59cf0b8
fixing linting capitalization error
rwidom Jul 12, 2022
4449811
strengthen file references
rwidom Jul 13, 2022
76111cc
move pg hook to init to reduce overhead
rwidom Jul 13, 2022
e4b61c4
Aggregate dependencies into reusable just recipe, add load_to_s3
AetherUnbound Jul 14, 2022
7fd1102
Add check file exists on S3 to DAG
rwidom Jul 14, 2022
bd1bd5f
Merge branch 'issue/439-add-provider-iNaturalist' of https://github.c…
rwidom Jul 14, 2022
2509db2
fix ingester class initialization override
rwidom Jul 15, 2022
ea335ba
stop overwriting pg temp files
rwidom Jul 15, 2022
10cc426
fix typo in order of callables
rwidom Jul 15, 2022
dcba976
dag runs with python pagination, though tests fail
rwidom Jul 15, 2022
115c7ae
sql for python pagination
rwidom Jul 15, 2022
93d576f
create and manage larger test data files
rwidom Jul 17, 2022
064e223
performant SQL to export JSON
rwidom Jul 17, 2022
464ffe8
landing url not langing url
rwidom Jul 17, 2022
b97b8c7
temp file name with timestamp for postgres
rwidom Jul 17, 2022
afb9cb3
inaturalist provider ingester + testing
rwidom Jul 17, 2022
0b38630
removed likely to error command
rwidom Jul 17, 2022
ca7564b
committing workflow for review, not complete
rwidom Jul 17, 2022
9581c12
use batch_limit
rwidom Jul 19, 2022
cff5c5d
Merge branch 'main' into issue/439-add-provider-iNaturalist
rwidom Jul 20, 2022
2b5c53e
complete the workflow cycle
rwidom Jul 21, 2022
bfe7b31
make preingestion factory logic more concise
rwidom Jul 21, 2022
b15d989
check s3 file update date to run dag daily
rwidom Jul 21, 2022
bf452b4
testing with resource files instead of minio
rwidom Jul 23, 2022
7b0854a
more concise batch data testing for none
rwidom Jul 23, 2022
fe3d0c2
monthly run, checking if files have been updated
rwidom Jul 23, 2022
5b0356c
linting
rwidom Jul 23, 2022
52f3b13
Merge branch 'main' into issue/439-add-provider-iNaturalist
rwidom Jul 23, 2022
f515bda
pass tags as a list
rwidom Jul 24, 2022
20b6218
list reformat linting
rwidom Jul 24, 2022
e2c1c9b
more readable bash command
rwidom Jul 27, 2022
1a1d940
inaturalist --> iNaturalist in class name
rwidom Jul 27, 2022
b2e1177
old_query_params --> prev_query_params
rwidom Jul 27, 2022
42d58e9
improved description for sample data file manager
rwidom Jul 27, 2022
95ac706
parametrize tests for missing data
rwidom Jul 27, 2022
e73c5ea
more concise / non-redundant get usage
rwidom Jul 27, 2022
2ac5dac
Merge branch 'main' into issue/439-add-provider-iNaturalist
rwidom Jul 27, 2022
216614e
Merge branch 'main' into issue/439-add-provider-iNaturalist
rwidom Jul 28, 2022
337c1d7
streamline checking for new data
rwidom Jul 28, 2022
212f91d
remove unnecessary logging and imports
rwidom Jul 28, 2022
4894f72
Merge 'main' -> issue/439-add-provider-iNaturalist
rwidom Jul 28, 2022
43fe04f
iNaturalistDataIngester -> INaturalistDataIngester
rwidom Jul 28, 2022
87515e0
don't need to make foreign ID a string up front
rwidom Jul 28, 2022
a263f31
clarify comment on observer FK
rwidom Jul 28, 2022
dd690d7
more readable use of sql template
rwidom Jul 28, 2022
788f69e
streamline transformations in postgres
rwidom Jul 29, 2022
989ce98
add test case with no ancestry info / tags
rwidom Jul 29, 2022
5f25ca9
Merge branch 'main' into issue/439-add-provider-iNaturalist
rwidom Jul 29, 2022
3bb7f65
linting test db response
rwidom Jul 29, 2022
772eb0b
remove env check because file won't be in prod
rwidom Jul 30, 2022
1db92b0
Add SQL tests
rwidom Aug 1, 2022
a32f1d6
remove universal tags
rwidom Aug 1, 2022
6c78765
adjusted db response without "life" tag
rwidom Aug 1, 2022
1bbcb6e
lint db response
rwidom Aug 1, 2022
b71fbdf
remove life from hard coded test results
rwidom Aug 1, 2022
4e20e32
drop source tables and schema after ingestion
rwidom Aug 1, 2022
e21fecb
nicer feedback on initial load test with pytest
rwidom Aug 2, 2022
0100a7e
try adding default image category
rwidom Aug 2, 2022
9eed1f9
Print logs on failure
AetherUnbound Aug 3, 2022
dce23fb
Revert "Print logs on failure"
AetherUnbound Aug 3, 2022
2a4074e
post-ingestion task comment
rwidom Aug 3, 2022
a6ff6df
clarify and simplify test
rwidom Aug 4, 2022
ce2df38
simplify sql names and remove dedent
rwidom Aug 9, 2022
dc6887c
clarify duplicate photo id status
rwidom Aug 12, 2022
ec53e94
lint fix
rwidom Aug 12, 2022
c039754
documentation for pre- and post-ingestion task params
rwidom Aug 12, 2022
f21af29
Merge branch 'main' into issue/439-add-provider-iNaturalist
rwidom Aug 13, 2022
6282c17
Update DAGs.md
AetherUnbound Aug 16, 2022
27506d8
add issue references to comments
rwidom Aug 19, 2022
44418e9
fix comment typo
rwidom Aug 19, 2022
7 changes: 7 additions & 0 deletions .gitignore
@@ -144,3 +144,10 @@ si_samples_*

# Vim detritus
*.swp

# minio testing detritus
/tests/s3-data/.minio.sys/
/tests/s3-data/commonsmapper/
/tests/s3-data/commonsmapper-v2/
/tests/s3-data/openverse-airflow-logs/
/tests/s3-data/openverse-storage/
17 changes: 15 additions & 2 deletions docker-compose.override.yml
@@ -27,8 +27,11 @@ services:
MINIO_ROOT_USER: ${AWS_ACCESS_KEY}
MINIO_ROOT_PASSWORD: ${AWS_SECRET_KEY}
# Comma separated list of buckets to create on startup
BUCKETS_TO_CREATE: ${OPENVERSE_BUCKET},openverse-airflow-logs,commonsmapper-v2,commonsmapper
BUCKETS_TO_CREATE: ${OPENVERSE_BUCKET},openverse-airflow-logs,commonsmapper-v2,commonsmapper,inaturalist-open-data
# Create the buckets on every container startup
# Is there something about creating the buckets in the entrypoint that might make them
# visible as buckets, as opposed to syncing them over from the volume? Doesn't seem likely,
# so maybe this could be dropped in favor of fixed empty subdirectories?
# Note: $0 is included in the exec because "/bin/bash -c" swallows the first
# argument, so it must be re-added at the beginning of the exec call
entrypoint: >-
@@ -39,7 +42,17 @@
exec $$0 \"$$@\""
command: minio server /data --address :5000 --console-address :5001
volumes:
- minio:/data
# Any buckets used for testing provider data imported from s3 should be subdirectories under
# /tests/s3-data/
# The trick to the issues here (https://wordpress.slack.com/archives/C02012JB00N/p1655118749223179)
# was to have a single volume, and to represent each bucket as a subdirectory under that.
# BUT, the test runs still can't see the inaturalist-open-data bucket, even though I have
# verified through docker that the files exist on the container.
# TO DO: The openverse user has root access to all buckets on this minio instance, which
# makes sense for openverse-owned buckets, but not for read-only buckets owned by providers.
# Maybe we add a minio/mc + minio/minio pair with more realistic permissions for buckets
# containing sample provider data?
- ./tests/s3-data:/data:rw
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:5010/minio/health/live"]
interval: 30s
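One way to check what the local minio instance actually exposes (re the bucket-visibility question in the volumes comment above) is a quick boto3 listing. This is a sketch only: the endpoint port is an assumption based on this compose file and may need adjusting to the host port mapping.

    import os

    import boto3

    # Assumes the minio API is reachable from the host on port 5000, as in the
    # "minio server /data --address :5000" command above; adjust if your port
    # mapping differs. Credentials come from the same env vars the compose file uses.
    s3 = boto3.client(
        "s3",
        endpoint_url="http://localhost:5000",
        aws_access_key_id=os.environ["AWS_ACCESS_KEY"],
        aws_secret_access_key=os.environ["AWS_SECRET_KEY"],
    )
    # List every bucket minio reports, then the objects in the provider bucket;
    # this call errors if inaturalist-open-data is not visible as a bucket.
    print([b["Name"] for b in s3.list_buckets()["Buckets"]])
    for obj in s3.list_objects_v2(Bucket="inaturalist-open-data").get("Contents", []):
        print(obj["Key"], obj["Size"])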
6 changes: 5 additions & 1 deletion docker/local_postgres/0002_aws_s3_mock.sql
@@ -22,13 +22,17 @@ AS $$
).Object(bucket, file_path)
temp_location = '/tmp/postgres_loading.tsv'
s3_obj.download_file(temp_location)
if file_path[-3:]=='.gz':
copy_from = "PROGRAM 'gzip -dc "+temp_location+"'"
else:
copy_from = plpy.quote_literal(temp_location)
with open(temp_location) as f:
columns = '({})'.format(column_list) if column_list else ''
res = plpy.execute(
'COPY {} {} FROM {} {};'.format(
table_name,
columns,
plpy.quote_literal(temp_location),
copy_from,
options
)
)
1 change: 1 addition & 0 deletions openverse_catalog/dags/common/loader/provider_details.py
@@ -31,6 +31,7 @@
STOCKSNAP_DEFAULT_PROVIDER = "stocksnap"
WORDPRESS_DEFAULT_PROVIDER = "wordpress"
FREESOUND_DEFAULT_PROVIDER = "freesound"
INATURALIST_DEFAULT_PROVIDER = "inaturalist"

# Finnish parameters
FINNISH_SUB_PROVIDERS = {
26 changes: 26 additions & 0 deletions openverse_catalog/dags/providers/inaturalist_workflow.py
@@ -0,0 +1,26 @@
"""
This file configures the Apache Airflow DAG to (re)ingest Inaturalist data.
"""
# airflow DAG (necessary for Airflow to find this file)
from datetime import datetime
import logging

from providers.provider_api_scripts import inaturalist
from common.provider_dag_factory import create_provider_api_workflow


logging.basicConfig(
    format='%(asctime)s: [%(levelname)s - DAG Loader] %(message)s',
    level=logging.DEBUG)
logger = logging.getLogger(__name__)

DAG_ID = "inaturalist_workflow"

globals()[DAG_ID] = create_provider_api_workflow(
    DAG_ID,
    inaturalist.main,
    start_date=datetime(1970, 1, 1),
    max_active_tasks=1,
    schedule_string='@monthly',
    dated=False,
)
111 changes: 111 additions & 0 deletions openverse_catalog/dags/providers/provider_api_scripts/inaturalist.py
@@ -0,0 +1,111 @@
## There are a bunch of bigger picture notes below, but right now, this is failing
## in the dev environment in that it can't access the sample files in minio.
## All of the other s3 tests run fine in my dev environment though.

"""
Content Provider: Inaturalist

ETL Process: With Inaturalist, for reasons described below, we aren't really doing ETL, but ELT.
Part of the challenge is to figure out how much to try to fit an ELT process into
code developed for ETL. The compromise here is a lot of SQL wrapped in Python.

Another approach to consider would be an open source tool developed specifically for
SQL transformations -- dbt. Reading this article -- https://docs.getdbt.com/blog/dbt-airflow-spiritual-alignment
-- made me wonder if a proof of concept with Inaturalist data might be worthwhile.
dbt core is fully open source, and assuming that there will be more ELT providers,
it could be worthwhile.

Output: This had been "TSV file containing the media metadata." but I'm not 100% sure that
that makes sense given the whole ELT vs ETL thing.

Notes: The inaturalist API is not intended for data scraping.
https://api.inaturalist.org/v1/docs/
But there is a full dump intended for sharing on S3.
https://github.com/inaturalist/inaturalist-open-data/tree/documentation/Metadata
Because these are very large normalized tables, as opposed to more document oriented API
responses, we found that bringing the data into postgres first was the most effective approach.
More detail in slack here:
https://wordpress.slack.com/archives/C02012JB00N/p1653145643080479?thread_ts=1653082292.714469&cid=C02012JB00N
This uses the structure defined here, except for adding ancestry tags to the taxa table:
https://github.com/inaturalist/inaturalist-open-data/blob/main/Metadata/structure.sql

TO DO: There is nothing here that actually updates the images table.
Need to figure out a stand-in for metrics from the saved json counter

"""
import json
import os
import logging
from pathlib import Path
from urllib.parse import urlparse

from textwrap import dedent
import psycopg2

from common.loader import provider_details as prov
from common.storage.image import ImageStore


logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s: %(message)s',
    level=logging.INFO
)
logger = logging.getLogger(__name__)

PROVIDER = prov.INATURALIST_DEFAULT_PROVIDER
SCRIPT_DIR = '/usr/local/airflow/openverse_catalog/dags/providers/provider_csv_load_scripts/inaturalist/'

# This is set up here as a single-threaded linear process, to be more consistent with the
# structure of the provider dag factory, but if I were to start from scratch with airflow,
# each of these would be a task, and the dag would be 00 >> [01, 02, 03, 04] >> 05.
# In dbt, the schema would be created separately, and steps 1-5 would each be a "model"
# and dbt would navigate the dependencies more or less automagically.
Contributor:
Good call! You could definitely manage this using @AetherUnbound's suggestion to take the DAG and build a TaskGroup that can be set as an upstream task dependency.
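A rough sketch of that TaskGroup idea, assuming Airflow 2's TaskGroup and PostgresOperator and a DAG whose template_searchpath points at the SQL directory; the connection id and task names here are illustrative assumptions, not part of this PR:

    from airflow.providers.postgres.operators.postgres import PostgresOperator
    from airflow.utils.task_group import TaskGroup


    def create_inaturalist_load_task_group(dag):
        # Group the schema creation, the four table loads, and the final TSV step
        # so the whole group can be set as an upstream dependency in the provider DAG.
        with TaskGroup(group_id="load_inaturalist", dag=dag) as load_group:
            create_schema = PostgresOperator(
                task_id="create_schema",
                postgres_conn_id="postgres_openledger_testing",
                sql="00_create_schema.sql",
                dag=dag,
            )
            load_tables = [
                PostgresOperator(
                    task_id=file_name.replace(".sql", ""),
                    postgres_conn_id="postgres_openledger_testing",
                    sql=file_name,
                    dag=dag,
                )
                for file_name in [
                    "01_photos.sql",
                    "02_observations.sql",
                    "03_taxa.sql",
                    "04_observers.sql",
                ]
            ]
            final_tsv = PostgresOperator(
                task_id="final_tsv",
                postgres_conn_id="postgres_openledger_testing",
                sql="05_final_tsv.sql",
                dag=dag,
            )
            # 00 runs first, the four loads can run in parallel, then 05 joins them.
            create_schema >> load_tables >> final_tsv
        return load_group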

Collaborator (Author):
Is it possible that if I did everything with Airflow hooks the boto issue I'm seeing would go away? I vaguely remember that airflow also has an s3 --> postgres method somewhere.
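A hedged sketch of routing the load through Airflow hooks instead of raw boto3, using S3Hook to fetch the dump and PostgresHook.copy_expert to COPY it into the staging table; the connection ids, key names, and decompression step are illustrative assumptions rather than this PR's code:

    import gzip
    import shutil

    from airflow.providers.amazon.aws.hooks.s3 import S3Hook
    from airflow.providers.postgres.hooks.postgres import PostgresHook


    def load_gz_from_s3_to_postgres(bucket="inaturalist-open-data", key="photos.csv.gz"):
        # download_file returns the path of a local temporary copy of the object
        local_gz = S3Hook(aws_conn_id="aws_default").download_file(key=key, bucket_name=bucket)
        local_tsv = local_gz + ".tsv"
        with gzip.open(local_gz, "rb") as source, open(local_tsv, "wb") as target:
            shutil.copyfileobj(source, target)
        pg = PostgresHook(postgres_conn_id="postgres_openledger_testing")
        # copy_expert streams the local file through COPY ... FROM STDIN,
        # so it does not need server-side file access or superuser rights.
        pg.copy_expert(
            "COPY inaturalist.photos FROM STDIN "
            "WITH (FORMAT csv, DELIMITER E'\\t', HEADER, QUOTE E'\\b')",
            local_tsv,
        )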

LOAD_PROCESS = [
    '00_create_schema.sql',
    '01_photos.sql',
    '02_observations.sql',
    '03_taxa.sql',
    '04_observers.sql',
    '05_final_tsv.sql'
]
CONNECTION_ID = os.getenv("AIRFLOW_CONN_POSTGRES_OPENLEDGER_TESTING")


def run_sql_file(file_name, file_path=SCRIPT_DIR, conn_id=CONNECTION_ID):
"""
The process is really written in SQL so this script just enables logging
and monitoring jobs, but this is the basic function to run the SQL files
for each step.
"""
logger.info(f"Running {file_name} using DB connection {conn_id}")
result = 'SQL failed. See log for details.'
try:
assert file_name[-4:]=='.sql'
assert os.path.exists(file_path + file_name)
db = psycopg2.connect(conn_id)
cursor = db.cursor()
query = dedent(open(file_path + file_name, 'r').read())
cursor.execute(query)
result = cursor.fetchall()
db.commit()
logger.info("Success!")
except Exception as e:
logger.warning(f"SQL step failed due to {e}")
return result


def main():
"""
This is really just looping through the SQL steps defined above, with some additional logging.
"""
logger.info("Begin: Inaturalist script")

for f in LOAD_PROCESS:
image_count = run_sql_file(f)
logger.info(f"Results: {str(image_count)}")
logger.info(f"Total images pulled: {image_count}")
logger.info('Terminated!')


if __name__ == '__main__':
    main()
2 changes: 2 additions & 0 deletions openverse_catalog/dags/providers/provider_csv_load_scripts/inaturalist/00_create_schema.sql
@@ -0,0 +1,2 @@
CREATE SCHEMA IF NOT EXISTS inaturalist;
select schema_name from information_schema.schemata where schema_name='inaturalist';
39 changes: 39 additions & 0 deletions openverse_catalog/dags/providers/provider_csv_load_scripts/inaturalist/01_photos.sql
@@ -0,0 +1,39 @@
/*
********************************************************************************
PHOTOS
********************************************************************************
-- not using an FK constraint on observer_id to save load time, but the 5/30/2022 dataset does have complete observer ids
-- not using an FK constraint on observation_uuid to save load time, but the 5/30/2022 dataset does have complete observation uuids
-- photo_id is not unique. There are ~130,000 photo_ids that appear more than once, maybe because an earlier version of
   the photo was deleted (?). It's unclear, but for now I'm going to assume that they will be taken care of later in the
   processing of these data. It does mean that we can't add a unique index, and things will go slower when selecting on photo_id.
   The documentation suggests indexing on photo UUID, but the AWS files of the actual photos are stored under photo_id.

Taking DDL from https://github.com/inaturalist/inaturalist-open-data/blob/main/Metadata/structure.sql
*/

DROP TABLE IF EXISTS inaturalist.photos;
commit;

CREATE TABLE inaturalist.photos (
photo_uuid uuid NOT NULL,
photo_id integer NOT NULL,
observation_uuid uuid NOT NULL,
observer_id integer,
extension character varying(5),
license character varying(255),
width smallint,
height smallint,
position smallint
);

select aws_s3.table_import_from_s3('inaturalist.photos',
'',
'(FORMAT ''csv'', DELIMITER E''\t'', HEADER, QUOTE E''\b'')',
'inaturalist-open-data',
'photos.csv.gz',
'us-east-1');

CREATE INDEX index_inaturalist_photo_id ON inaturalist.photos (photo_id);

select count(*) from inaturalist.photos;
35 changes: 35 additions & 0 deletions openverse_catalog/dags/providers/provider_csv_load_scripts/inaturalist/02_observations.sql
@@ -0,0 +1,35 @@
/*
********************************************************************************
OBSERVATIONS
********************************************************************************
-- ~400,000 observations do not have a taxon_id that is in the taxa table.
-- Their photos are not included in the final transformed view on the assumption
that photos are not useful to us without a title or tags

Taking DDL from https://github.com/inaturalist/inaturalist-open-data/blob/main/Metadata/structure.sql
*/

DROP TABLE IF EXISTS inaturalist.observations;
commit;

CREATE TABLE inaturalist.observations (
observation_uuid uuid,
observer_id integer,
latitude numeric(15,10),
longitude numeric(15,10),
positional_accuracy integer,
taxon_id integer,
quality_grade character varying(255),
observed_on date
);

select aws_s3.table_import_from_s3('inaturalist.observations',
'',
'(FORMAT ''csv'', DELIMITER E''\t'', HEADER, QUOTE E''\b'')',
'inaturalist-open-data',
'observations.csv.gz',
'us-east-1');

ALTER TABLE inaturalist.observations ADD PRIMARY KEY (observation_uuid);

select count(*) from inaturalist.observations;
58 changes: 58 additions & 0 deletions openverse_catalog/dags/providers/provider_csv_load_scripts/inaturalist/03_taxa.sql
@@ -0,0 +1,58 @@
/*
********************************************************************************
TAXA
********************************************************************************

Taking DDL from https://github.com/inaturalist/inaturalist-open-data/blob/main/Metadata/structure.sql
Plus adding a field for ancestry tags.
*/

DROP TABLE IF EXISTS inaturalist.taxa;
commit;

CREATE TABLE inaturalist.taxa (
taxon_id integer,
ancestry character varying(255),
rank_level double precision,
rank character varying(255),
name character varying(255),
active boolean,
tags text
);

-- Load from S3
select aws_s3.table_import_from_s3('inaturalist.taxa',
'taxon_id, ancestry, rank_level, rank, name, active',
'(FORMAT ''csv'', DELIMITER E''\t'', HEADER, QUOTE E''\b'')',
'inaturalist-open-data',
'taxa.csv.gz',
'us-east-1');

-- doing this after the load to help performance, but will need a way to
-- handle non-uniqueness if it comes up
ALTER TABLE inaturalist.taxa ADD PRIMARY KEY (taxon_id);

-- Aggregate ancestry names as tags
create temporary table unnest_ancestry as
(
SELECT
unnest(string_to_array(ancestry, '/'))::int as linked_taxon_id,
taxon_id
FROM inaturalist.taxa
);

create temporary table taxa_tags as
(
select u.taxon_id, STRING_AGG(taxa.name, '; ') as tags
from unnest_ancestry as u
join inaturalist.taxa on u.linked_taxon_id = taxa.taxon_id
where taxa.rank not in ('kingdom', 'stateofmatter')
group by u.taxon_id
);

update inaturalist.taxa
set tags = taxa_tags.tags
from taxa_tags -- temporary table created above, not in the inaturalist schema
where taxa_tags.taxon_id = taxa.taxon_id;

select count(*) from inaturalist.taxa;
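The aggregation above turns each taxon's ancestry string into a tags list; a small hypothetical illustration (made-up taxon ids and names) of what that produces:

    # Hypothetical rows: taxon_id -> (name, rank)
    taxa = {
        1: ("Life", "stateofmatter"),
        2: ("Animalia", "kingdom"),
        3: ("Chordata", "phylum"),
        4: ("Aves", "class"),
    }
    ancestry = "1/2/3/4"  # ancestry string stored on some descendant taxon

    # Split the ancestry, look up each ancestor's name, and drop the
    # 'kingdom' and 'stateofmatter' rows, exactly as the SQL does.
    tags = "; ".join(
        taxa[int(t)][0]
        for t in ancestry.split("/")
        if taxa[int(t)][1] not in ("kingdom", "stateofmatter")
    )
    print(tags)  # -> "Chordata; Aves"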
28 changes: 28 additions & 0 deletions openverse_catalog/dags/providers/provider_csv_load_scripts/inaturalist/04_observers.sql
@@ -0,0 +1,28 @@
/*
********************************************************************************
OBSERVERS
********************************************************************************

Taking DDL from https://github.com/inaturalist/inaturalist-open-data/blob/main/Metadata/structure.sql
*/

DROP TABLE IF EXISTS inaturalist.observers;
commit;

CREATE TABLE inaturalist.observers (
observer_id integer,
login character varying(255),
name character varying(255)
);

select aws_s3.table_import_from_s3('inaturalist.observers',
'',
'(FORMAT ''csv'', DELIMITER E''\t'', HEADER, QUOTE E''\b'')',
'inaturalist-open-data',
'observers.csv.gz',
'us-east-1');

-- doing this after the load to help performance, but will need a way to handle non-uniqueness
ALTER TABLE inaturalist.observers ADD PRIMARY KEY (observer_id);

select count(*) from inaturalist.observers;