Commit 9b4cfbd

Pushing all changes up so things are synced.
1 parent 2a44d4a commit 9b4cfbd

25 files changed (+2782 −146 lines)

.gitignore

Lines changed: 6 additions & 0 deletions
@@ -26,3 +26,9 @@ helpers/localDup.sh
 helpers/archives/
 
 *.gz
+
+.vscode/
+
+helpers/figshareUpload/settings.yaml
+
+helpers/figshareUpload/lib/__pycache__/

datachecks/clean_empty_strings.py

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
"""Check for non-breaking spaces in text fields.

This issue arose as part of some text-searching that a
user was doing. The code here connects to a PostgreSQL
database, finds all text-type columns outside the system
schemas, reports which of them contain non-breaking or
zero-width space characters, and replaces those characters
with plain spaces.
"""
import json
import psycopg2
from psycopg2 import sql

print("\nRunning database tests.")
with open('../connect_remote.json', encoding='UTF-8') as f:
    data = json.load(f)

conn = psycopg2.connect(**data)
conn.autocommit = True
cur = conn.cursor()

# List all text columns in user schemas.
TEXT_COLS = """
select col.table_schema,
       col.table_name,
       col.ordinal_position as column_id,
       col.column_name,
       col.data_type,
       col.character_maximum_length as maximum_length
from information_schema.columns col
join information_schema.tables tab on tab.table_schema = col.table_schema
                                  and tab.table_name = col.table_name
                                  and tab.table_type = 'BASE TABLE'
where col.data_type in ('character varying', 'character',
                        'text', '"char"', 'name')
  and col.table_schema not in ('information_schema', 'pg_catalog',
                               'public', 'tmp', 'pglogical')
order by col.table_schema,
         col.table_name,
         col.ordinal_position;"""

cur.execute(TEXT_COLS)
tables = cur.fetchall()

# Match any value containing a Unicode non-breaking or zero-width space.
EMPTY_SPACE = """
SELECT {}
FROM {}
WHERE {} ~ '.*[\u00A0\u1680\u180E\u2000-\u200B\u202F\u205F\u3000\uFEFF].*'"""

runcounter = []

for row in tables:
    tableobj = {'schema': row[0], 'table': row[1], 'column': row[3]}
    cur.execute(
        sql.SQL(EMPTY_SPACE).format(sql.Identifier(row[3]),
                                    sql.Identifier(row[0], row[1]),
                                    sql.Identifier(row[3])))
    tableobj.update({'rows': cur.fetchall()})
    runcounter.append(tableobj)

# Keep only the columns where at least one row matched.
FIELDS = list(filter(lambda x: len(x['rows']) > 0, runcounter))

print(f"{len(FIELDS)} columns contain irregular space characters.")

# The character class mirrors the one used to find the rows, and the
# 'g' flag makes regexp_replace swap every occurrence in a value,
# not just the first.
UPDATE_QUERY = """
UPDATE {}
SET {} = regexp_replace({},
                        '[\u00A0\u1680\u180E\u2000-\u200B\u202F\u205F\u3000\uFEFF]',
                        ' ', 'g')
WHERE {} ~ '.*[\u00A0\u1680\u180E\u2000-\u200B\u202F\u205F\u3000\uFEFF].*'"""

for row in FIELDS:
    cur.execute(
        sql.SQL(UPDATE_QUERY).format(sql.Identifier(row['schema'], row['table']),
                                     sql.Identifier(row['column']),
                                     sql.Identifier(row['column']),
                                     sql.Identifier(row['column'])))

cur.close()
conn.close()
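
A quick way to sanity-check the replacement pattern before pointing the script at real tables is to apply it to a literal in psql. This is a minimal sketch, not part of the commit; the U& string just builds a test value containing a non-breaking space (U+00A0) and a thin space (U+2009), and assumes the default standard_conforming_strings setting:

-- Both irregular spaces should come back as plain spaces.
SELECT regexp_replace(U&'foo\00A0bar\2009baz',
                      '[\u00A0\u1680\u180E\u2000-\u200B\u202F\u205F\u3000\uFEFF]',
                      ' ', 'g') AS cleaned;
-- cleaned: 'foo bar baz'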

function/ap/dailyquerytable.sql

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
CREATE FUNCTION ap.dailyquerytable(_interval VARCHAR)
RETURNS TABLE (siteid INT,
               sitename VARCHAR,
               datasetid INT,
               chronologyid INT,
               altitude FLOAT,
               datasettype VARCHAR,
               databaseid INT,
               collectionunitid INT,
               colltype VARCHAR,
               depenvt VARCHAR,
               geog GEOGRAPHY,
               older FLOAT,
               younger FLOAT,
               agetype VARCHAR,
               publications INT[],
               taxa INT[],
               keywords INT[],
               contacts INT[],
               collectionunit JSONB,
               geopol INT[])
AS $$
WITH allids AS (
    SELECT st.siteid,
           unnest(array_append(gp.geoout, gp.geoin::int)) AS geopol
    FROM ndb.sites AS st
    INNER JOIN ndb.sitegeopolitical AS sgp ON st.siteid = sgp.siteid
    INNER JOIN ndb.geopoliticalunits AS gpu ON gpu.geopoliticalid = sgp.geopoliticalid
    INNER JOIN ndb.geopaths AS gp ON gp.geoin = sgp.geopoliticalid
),
sgp AS (
    SELECT siteid, array_agg(DISTINCT geopol) AS geopol
    FROM allids
    GROUP BY siteid
)
SELECT st.siteid,
       st.sitename,
       ds.datasetid,
       chron.chronologyid,
       st.altitude,
       dst.datasettype,
       dsdb.databaseid,
       cu.collectionunitid,
       cut.colltype,
       dvt.depenvt,
       st.geog,
       arg.older,
       arg.younger,
       agetypes.agetype,
       array_remove(array_agg(DISTINCT dspb.publicationid), NULL) AS publications,
       array_remove(array_agg(DISTINCT var.taxonid), NULL) AS taxa,
       array_remove(array_agg(DISTINCT smpkw.keywordid), NULL) AS keywords,
       array_remove(array_agg(DISTINCT dpi.contactid) || array_agg(DISTINCT sma.contactid), NULL) AS contacts,
       jsonb_build_object('collectionunitid', cu.collectionunitid,
                          'collectionunit', cu.collunitname,
                          'handle', cu.handle,
                          'collectionunittype', cut.colltype,
                          'datasets', json_agg(DISTINCT jsonb_build_object('datasetid', ds.datasetid,
                                                                           'datasettype', dst.datasettype))) AS collectionunit,
       sgp.geopol
FROM ndb.sites AS st
LEFT OUTER JOIN ndb.collectionunits AS cu ON cu.siteid = st.siteid
LEFT OUTER JOIN ndb.collectiontypes AS cut ON cut.colltypeid = cu.colltypeid
LEFT OUTER JOIN ndb.datasets AS ds ON ds.collectionunitid = cu.collectionunitid
LEFT OUTER JOIN ndb.depenvttypes AS dvt ON dvt.depenvtid = cu.depenvtid
LEFT OUTER JOIN ndb.datasetpis AS dpi ON dpi.datasetid = ds.datasetid
LEFT OUTER JOIN ndb.datasettypes AS dst ON dst.datasettypeid = ds.datasettypeid
LEFT OUTER JOIN ndb.datasetdatabases AS dsdb ON ds.datasetid = dsdb.datasetid
LEFT OUTER JOIN ndb.datasetpublications AS dspb ON dspb.datasetid = ds.datasetid
LEFT OUTER JOIN ndb.chronologies AS chron ON chron.collectionunitid = ds.collectionunitid
LEFT OUTER JOIN ndb.dsageranges AS arg ON ds.datasetid = arg.datasetid AND chron.agetypeid = arg.agetypeid
LEFT OUTER JOIN ndb.agetypes AS agetypes ON agetypes.agetypeid = arg.agetypeid
LEFT OUTER JOIN ndb.samples AS smp ON smp.datasetid = ds.datasetid
LEFT OUTER JOIN ndb.sampleanalysts AS sma ON sma.sampleid = smp.sampleid
LEFT OUTER JOIN ndb.samplekeywords AS smpkw ON smpkw.sampleid = smp.sampleid
LEFT OUTER JOIN ndb.data AS dt ON dt.sampleid = smp.sampleid
LEFT OUTER JOIN ndb.variables AS var ON var.variableid = dt.variableid
LEFT OUTER JOIN sgp AS sgp ON st.siteid = sgp.siteid
WHERE ds.recdatemodified > current_date - (_interval || 'day')::INTERVAL OR
      smp.recdatemodified > current_date - (_interval || 'day')::INTERVAL OR
      st.recdatemodified > current_date - (_interval || 'day')::INTERVAL
GROUP BY st.siteid,
         cu.collectionunitid,
         st.sitename,
         ds.datasetid,
         cut.colltype,
         chron.chronologyid,
         dsdb.databaseid,
         st.altitude,
         dst.datasettype,
         st.geog,
         arg.older,
         arg.younger,
         agetypes.agetype,
         sgp.geopol,
         dvt.depenvt
$$ LANGUAGE sql;
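
Once the function is installed, a caller passes the look-back window in days as a string. A hypothetical invocation (a sketch, not part of the commit) might look like this:

-- Sites, datasets, and chronologies touched in the last 7 days.
SELECT siteid, sitename, datasetid, datasettype
FROM ap.dailyquerytable('7');

The (_interval || 'day')::INTERVAL expression concatenates the argument with the unit, so '7' becomes the string '7day', which PostgreSQL's interval parser accepts as seven days.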

function/ap/dailysummaries.sql

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
CREATE OR REPLACE FUNCTION ap.dailysummaries(_interval VARCHAR DEFAULT '1')
RETURNS TABLE (dbdate DATE, sites BIGINT, datasets BIGINT, publications BIGINT, observations BIGINT)
AS $$
SELECT DISTINCT date_trunc('day', ds.recdatecreated)::date AS dbdate,
       COUNT(DISTINCT st.siteid) AS sites,
       COUNT(DISTINCT ds.datasetid) AS datasets,
       COUNT(DISTINCT pu.publicationid) AS publications,
       COUNT(DISTINCT dt.dataid) AS observations
FROM ndb.sites AS st
INNER JOIN ndb.collectionunits AS cu ON cu.siteid = st.siteid
INNER JOIN ndb.datasets AS ds ON ds.collectionunitid = cu.collectionunitid
INNER JOIN ndb.datasetpublications AS dspu ON dspu.datasetid = ds.datasetid
INNER JOIN ndb.publications AS pu ON pu.publicationid = dspu.publicationid
INNER JOIN ndb.analysisunits AS au ON au.collectionunitid = cu.collectionunitid
INNER JOIN ndb.samples AS smp ON smp.analysisunitid = au.analysisunitid
INNER JOIN ndb.data AS dt ON dt.sampleid = smp.sampleid
WHERE ds.recdatecreated > current_date - (_interval || 'day')::INTERVAL
GROUP BY date_trunc('day', ds.recdatecreated)
$$ LANGUAGE sql;
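
A usage sketch (not in the commit): because _interval defaults to '1', calling the function with no argument summarizes the past day, while an explicit argument widens the window:

-- Daily counts of new sites, datasets, publications, and observations
-- created over the last 30 days.
SELECT * FROM ap.dailysummaries('30');

-- Same summary for just the past day, using the default.
SELECT * FROM ap.dailysummaries();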
