Commit 9b4cfbd

Pushing all changes up so things are synced.
1 parent 2a44d4a commit 9b4cfbd

25 files changed (+2782 −146 lines)

.gitignore

Lines changed: 6 additions & 0 deletions
@@ -26,3 +26,9 @@ helpers/localDup.sh
 helpers/archives/
 
 *.gz
+
+.vscode/
+
+helpers/figshareUpload/settings.yaml
+
+helpers/figshareUpload/lib/__pycache__/

datachecks/clean_empty_strings.py

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
"""Check for non-breaking spaces in text fields.

This issue arose as part of some text-searching that a
user was doing. The code here connects to a PostgreSQL
database, finds all text-type columns outside the system
schemas, reports which of them contain non-breaking or
zero-width space characters, and replaces those characters
with plain spaces.
"""
import json
import psycopg2
from psycopg2 import sql

print("\nRunning database tests.")
with open('../connect_remote.json', encoding='UTF-8') as f:
    data = json.load(f)

conn = psycopg2.connect(**data)
conn.autocommit = True
cur = conn.cursor()

# List all text columns in user schemas.
TEXT_COLS = """
select col.table_schema,
       col.table_name,
       col.ordinal_position as column_id,
       col.column_name,
       col.data_type,
       col.character_maximum_length as maximum_length
from information_schema.columns col
join information_schema.tables tab on tab.table_schema = col.table_schema
                                  and tab.table_name = col.table_name
                                  and tab.table_type = 'BASE TABLE'
where col.data_type in ('character varying', 'character',
                        'text', '"char"', 'name')
  and col.table_schema not in ('information_schema', 'pg_catalog',
                               'public', 'tmp', 'pglogical')
order by col.table_schema,
         col.table_name,
         col.ordinal_position;"""

cur.execute(TEXT_COLS)
tables = cur.fetchall()

# Match any value containing a Unicode non-breaking or zero-width space.
EMPTY_SPACE = """
SELECT {}
FROM {}
WHERE {} ~ '.*[\u00A0\u1680\u180E\u2000-\u200B\u202F\u205F\u3000\uFEFF].*'"""

runcounter = []

for row in tables:
    tableobj = {'schema': row[0], 'table': row[1], 'column': row[3]}
    cur.execute(
        sql.SQL(EMPTY_SPACE).format(sql.Identifier(row[3]),
                                    sql.Identifier(row[0], row[1]),
                                    sql.Identifier(row[3])))
    tableobj.update({'rows': cur.fetchall()})
    runcounter.append(tableobj)

# Keep only the columns where at least one row matched.
FIELDS = list(filter(lambda x: len(x['rows']) > 0, runcounter))

print(f"{len(FIELDS)} columns contain irregular space characters.")

# The character class mirrors the one used to find the rows, and the
# 'g' flag makes regexp_replace swap every occurrence in a value,
# not just the first.
UPDATE_QUERY = """
UPDATE {}
SET {} = regexp_replace({},
                        '[\u00A0\u1680\u180E\u2000-\u200B\u202F\u205F\u3000\uFEFF]',
                        ' ', 'g')
WHERE {} ~ '.*[\u00A0\u1680\u180E\u2000-\u200B\u202F\u205F\u3000\uFEFF].*'"""

for row in FIELDS:
    cur.execute(
        sql.SQL(UPDATE_QUERY).format(sql.Identifier(row['schema'], row['table']),
                                     sql.Identifier(row['column']),
                                     sql.Identifier(row['column']),
                                     sql.Identifier(row['column'])))

cur.close()
conn.close()
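
A quick way to sanity-check the replacement pattern before pointing the script at real tables is to apply it to a literal in psql. This is a minimal sketch, not part of the commit; the U& string just builds a test value containing a non-breaking space (U+00A0) and a thin space (U+2009), and assumes the default standard_conforming_strings setting:

-- Both irregular spaces should come back as plain spaces.
SELECT regexp_replace(U&'foo\00A0bar\2009baz',
                      '[\u00A0\u1680\u180E\u2000-\u200B\u202F\u205F\u3000\uFEFF]',
                      ' ', 'g') AS cleaned;
-- cleaned: 'foo bar baz'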

function/ap/dailyquerytable.sql

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
CREATE FUNCTION ap.dailyquerytable(_interval VARCHAR)
RETURNS TABLE (siteid INT,
               sitename VARCHAR,
               datasetid INT,
               chronologyid INT,
               altitude FLOAT,
               datasettype VARCHAR,
               databaseid INT,
               collectionunitid INT,
               colltype VARCHAR,
               depenvt VARCHAR,
               geog GEOGRAPHY,
               older FLOAT,
               younger FLOAT,
               agetype VARCHAR,
               publications INT[],
               taxa INT[],
               keywords INT[],
               contacts INT[],
               collectionunit JSONB,
               geopol INT[])
AS $$
WITH allids AS (
    SELECT st.siteid,
           unnest(array_append(gp.geoout, gp.geoin::int)) AS geopol
    FROM ndb.sites AS st
    INNER JOIN ndb.sitegeopolitical AS sgp ON st.siteid = sgp.siteid
    INNER JOIN ndb.geopoliticalunits AS gpu ON gpu.geopoliticalid = sgp.geopoliticalid
    INNER JOIN ndb.geopaths AS gp ON gp.geoin = sgp.geopoliticalid
),
sgp AS (
    SELECT siteid, array_agg(DISTINCT geopol) AS geopol
    FROM allids
    GROUP BY siteid
)
SELECT st.siteid,
       st.sitename,
       ds.datasetid,
       chron.chronologyid,
       st.altitude,
       dst.datasettype,
       dsdb.databaseid,
       cu.collectionunitid,
       cut.colltype,
       dvt.depenvt,
       st.geog,
       arg.older,
       arg.younger,
       agetypes.agetype,
       array_remove(array_agg(DISTINCT dspb.publicationid), NULL) AS publications,
       array_remove(array_agg(DISTINCT var.taxonid), NULL) AS taxa,
       array_remove(array_agg(DISTINCT smpkw.keywordid), NULL) AS keywords,
       array_remove(array_agg(DISTINCT dpi.contactid) || array_agg(DISTINCT sma.contactid), NULL) AS contacts,
       jsonb_build_object('collectionunitid', cu.collectionunitid,
                          'collectionunit', cu.collunitname,
                          'handle', cu.handle,
                          'collectionunittype', cut.colltype,
                          'datasets', json_agg(DISTINCT jsonb_build_object('datasetid', ds.datasetid,
                                                                           'datasettype', dst.datasettype))) AS collectionunit,
       sgp.geopol
FROM ndb.sites AS st
LEFT OUTER JOIN ndb.collectionunits AS cu ON cu.siteid = st.siteid
LEFT OUTER JOIN ndb.collectiontypes AS cut ON cut.colltypeid = cu.colltypeid
LEFT OUTER JOIN ndb.datasets AS ds ON ds.collectionunitid = cu.collectionunitid
LEFT OUTER JOIN ndb.depenvttypes AS dvt ON dvt.depenvtid = cu.depenvtid
LEFT OUTER JOIN ndb.datasetpis AS dpi ON dpi.datasetid = ds.datasetid
LEFT OUTER JOIN ndb.datasettypes AS dst ON dst.datasettypeid = ds.datasettypeid
LEFT OUTER JOIN ndb.datasetdatabases AS dsdb ON ds.datasetid = dsdb.datasetid
LEFT OUTER JOIN ndb.datasetpublications AS dspb ON dspb.datasetid = ds.datasetid
LEFT OUTER JOIN ndb.chronologies AS chron ON chron.collectionunitid = ds.collectionunitid
LEFT OUTER JOIN ndb.dsageranges AS arg ON ds.datasetid = arg.datasetid AND chron.agetypeid = arg.agetypeid
LEFT OUTER JOIN ndb.agetypes AS agetypes ON agetypes.agetypeid = arg.agetypeid
LEFT OUTER JOIN ndb.samples AS smp ON smp.datasetid = ds.datasetid
LEFT OUTER JOIN ndb.sampleanalysts AS sma ON sma.sampleid = smp.sampleid
LEFT OUTER JOIN ndb.samplekeywords AS smpkw ON smpkw.sampleid = smp.sampleid
LEFT OUTER JOIN ndb.data AS dt ON dt.sampleid = smp.sampleid
LEFT OUTER JOIN ndb.variables AS var ON var.variableid = dt.variableid
LEFT OUTER JOIN sgp AS sgp ON st.siteid = sgp.siteid
WHERE ds.recdatemodified > current_date - (_interval || 'day')::INTERVAL OR
      smp.recdatemodified > current_date - (_interval || 'day')::INTERVAL OR
      st.recdatemodified > current_date - (_interval || 'day')::INTERVAL
GROUP BY st.siteid,
         cu.collectionunitid,
         st.sitename,
         ds.datasetid,
         cut.colltype,
         chron.chronologyid,
         dsdb.databaseid,
         st.altitude,
         dst.datasettype,
         st.geog,
         arg.older,
         arg.younger,
         agetypes.agetype,
         sgp.geopol,
         dvt.depenvt
$$ LANGUAGE sql;
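
Once the function is installed, a caller passes the look-back window in days as a string. A hypothetical invocation (a sketch, not part of the commit) might look like this:

-- Sites, datasets, and chronologies touched in the last 7 days.
SELECT siteid, sitename, datasetid, datasettype
FROM ap.dailyquerytable('7');

The (_interval || 'day')::INTERVAL expression concatenates the argument with the unit, so '7' becomes the string '7day', which PostgreSQL's interval parser accepts as seven days.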

function/ap/dailysummaries.sql

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
CREATE OR REPLACE FUNCTION ap.dailysummaries(_interval VARCHAR DEFAULT '1')
RETURNS TABLE (dbdate DATE, sites BIGINT, datasets BIGINT, publications BIGINT, observations BIGINT)
AS $$
SELECT DISTINCT date_trunc('day', ds.recdatecreated)::date AS dbdate,
       COUNT(DISTINCT st.siteid) AS sites,
       COUNT(DISTINCT ds.datasetid) AS datasets,
       COUNT(DISTINCT pu.publicationid) AS publications,
       COUNT(DISTINCT dt.dataid) AS observations
FROM ndb.sites AS st
INNER JOIN ndb.collectionunits AS cu ON cu.siteid = st.siteid
INNER JOIN ndb.datasets AS ds ON ds.collectionunitid = cu.collectionunitid
INNER JOIN ndb.datasetpublications AS dspu ON dspu.datasetid = ds.datasetid
INNER JOIN ndb.publications AS pu ON pu.publicationid = dspu.publicationid
INNER JOIN ndb.analysisunits AS au ON au.collectionunitid = cu.collectionunitid
INNER JOIN ndb.samples AS smp ON smp.analysisunitid = au.analysisunitid
INNER JOIN ndb.data AS dt ON dt.sampleid = smp.sampleid
WHERE ds.recdatecreated > current_date - (_interval || 'day')::INTERVAL
GROUP BY date_trunc('day', ds.recdatecreated)
$$ LANGUAGE sql;
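
A usage sketch (not in the commit): because _interval defaults to '1', calling the function with no argument summarizes the past day, while an explicit argument widens the window:

-- Daily counts of new sites, datasets, publications, and observations
-- created over the last 30 days.
SELECT * FROM ap.dailysummaries('30');

-- Same summary for just the past day, using the default.
SELECT * FROM ap.dailysummaries();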
