Skip to content

Commit

Permalink
Templated queries working on sql creation, require check on documenta…
Browse files Browse the repository at this point in the history
…tion
  • Loading branch information
cameronneylon committed Aug 5, 2022
1 parent 32ead0e commit 84abbd3
Show file tree
Hide file tree
Showing 39 changed files with 4,579 additions and 63 deletions.
30 changes: 21 additions & 9 deletions observatory/reports/provndoc_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@

from precipy.analytics_function import AnalyticsFunction


# from .report_data_processing.sql import load_sql_to_string
from report_data_processing.sql import load_sql_to_string


def process_sql_to_queries(af: AnalyticsFunction,
Expand All @@ -16,16 +15,29 @@ def process_sql_to_queries(af: AnalyticsFunction,
sql_processed_dir: Union[Path, str]):
if not rerun:
return
sql_templates = sorted(Path(sql_template_dir).glob('*.sql_templates'))
sql_templates = sorted(Path(sql_template_dir).glob('*.sql'))
jinja_templates = sorted(Path(sql_template_dir).glob('*.jinja2'))

for template in sql_templates:
query = load_sql_to_string(template.name,
parameters=parameters,
directory=sql_template_dir)
filepath = Path(sql_processed_dir) / template
with open(filepath) as f:
f.write(query)
try:
query = load_sql_to_string(template.name,
parameters=parameters,
directory=sql_template_dir)
filepath = Path(sql_processed_dir) / template.name
with open(filepath, 'w') as f:
f.write(query)
except KeyError: # Case for templates that need to be processed for each year
for year in parameters.get('years'):
parameters.update(dict(
year=year
))
query = load_sql_to_string(template.name,
parameters=parameters,
directory=sql_template_dir)
filepath = Path(sql_processed_dir) / f'{template.stem}_{year}{template.suffix}'
with open(filepath, 'w') as f:
f.write(query)
parameters.pop('year')

# af.add_existing_file(filepath)

Expand Down
23 changes: 19 additions & 4 deletions process.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,27 @@
TEMPDIR = Path('tempdir')


def process_sql_templates_to_queries(af: AnalyticsFunction,
                                     rerun: bool = RERUN):
    """
    Process the SQL templates into query files.

    Thin wrapper around ``provndoc_utils.process_sql_to_queries`` that supplies
    the module-level ``SQL_TEMPLATE_PARAMETERS``, ``SQL_TEMPLATES_DIRECTORY`` and
    ``SQL_PROCESSED_DIRECTORY``.

    :param af: analytics function object, forwarded unchanged
    :param rerun: forwarded unchanged as the ``rerun`` argument
    """

    # The template parameters come from SQL_TEMPLATE_PARAMETERS; no local
    # parameter dict is built here.
    provndoc_utils.process_sql_to_queries(af,
                                          SQL_TEMPLATE_PARAMETERS,
                                          rerun,
                                          SQL_TEMPLATES_DIRECTORY,
                                          SQL_PROCESSED_DIRECTORY)

# Provenance/documentation step: calls provndoc_utils.process_sql_to_queries and
# then provndoc_utils.build_sql_dag on SQL_PROCESSED_DIRECTORY.
def provenance_n_documentation(af: AnalyticsFunction,
rerun: bool = RERUN):

# NOTE(review): this call passes four positional arguments, whereas
# process_sql_templates_to_queries passes five to the same function
# (with SQL_TEMPLATE_PARAMETERS second) - confirm this call still matches
# the process_sql_to_queries signature.
provndoc_utils.process_sql_to_queries(af,rerun, SQL_TEMPLATES_DIRECTORY, SQL_PROCESSED_DIRECTORY)
# af and rerun are forwarded unchanged.
provndoc_utils.build_sql_dag(af, rerun, SQL_PROCESSED_DIRECTORY)


def calculate_citation_diversity(af: AnalyticsFunction,
rerun: bool = RERUN,
verbose: bool = True):
def create_global_citation_diversity_table(af: AnalyticsFunction,
rerun: bool = RERUN,
verbose: bool = True):
"""
Run global_citations_query.sql_templates to generate article level citation diversity data
"""
Expand Down Expand Up @@ -1024,3 +1035,7 @@ def create_figure3b(af: AnalyticsFunction):
fig.write_image('report_graphs/figure3/figure3b.png', scale=FIG_SCALE, width=630, height=700)
af.add_existing_file('report_graphs/figure3/figure3b.png')
print('... completed')


# Script entry point: runs only the SQL template processing step, with rerun
# forced to True.
# NOTE(review): af is given the placeholder string 'mock' instead of an
# AnalyticsFunction - presumably the callee never uses af; confirm.
if __name__ == '__main__':
process_sql_templates_to_queries(af='mock', rerun=True)
3 changes: 2 additions & 1 deletion report_data_processing/parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# which citation diversity metric to run
METRICS = ['GiniSim', 'Shannon']

# which data year to run, i.e., 2010 to 2019
# which data years to run. The end of the range is *not* included, so e.g. for 2010 to 2019 use range(2010, 2020)
YEARS = list(range(2010, 2020))

# measures of central location
Expand Down Expand Up @@ -102,6 +102,7 @@
CITATION_DIVERSITY_TABLE = 'coki-scratch-space.karl.citation_diversity_global'

SQL_TEMPLATE_PARAMETERS = dict(
years=YEARS,
first_year=YEARS[0],
last_year=YEARS[-1],
doi_table=DOI_TABLE,
Expand Down
2 changes: 1 addition & 1 deletion report_data_processing/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def load_sql_to_string(filepath: Union[str, Path],
if directory:
filepath = Path(directory) / filepath

assert ((filepath.suffix == '.sql_templates') or (filepath.suffix == '.jinja2'))
assert ((filepath.suffix == '.sql') or (filepath.suffix == '.jinja2'))

with open(filepath, 'r') as f:
sql_string = f.read()
Expand Down
132 changes: 132 additions & 0 deletions report_data_processing/sql_processed/cit_div_vs_cit_count.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/*
## Summary
Percentiles (0, 25, 50, 75 and 100) of each citation diversity metric, per year and citation count.
## Description
For every (year, CitationCount) pair, computes PERCENTILE_CONT at 0, 0.25, 0.5, 0.75 and 1
of the GiniSim and Shannon columns for citing institutions, countries, subregions and regions.
Only rows with CitationCount >= 2 and a non-null is_oa are included. The output has one row per
(year, CitationCount), ordered by year and then CitationCount.
## Contacts
karl.huang@curtin.edu.au
## Requires
table bigquery://coki-scratch-space.karl.citation_diversity_global
## Creates
file cit_div_vs_cit_count.csv
*/

WITH
-- data_perc: one row per filtered source row, each carrying the percentiles of its
-- (CitationCount, year) partition, computed with window functions.
data_perc AS (
SELECT
year,
CitationCount,

-- GiniSim metric: citing institutions, countries, subregions, regions
PERCENTILE_CONT(CitingInstitutions_GiniSim,0) OVER(PARTITION BY CitationCount, year) AS CitingInstitutions_GiniSim_perc0,
PERCENTILE_CONT(CitingInstitutions_GiniSim,0.25) OVER(PARTITION BY CitationCount, year) AS CitingInstitutions_GiniSim_perc25,
PERCENTILE_CONT(CitingInstitutions_GiniSim,0.5) OVER(PARTITION BY CitationCount, year) AS CitingInstitutions_GiniSim_perc50,
PERCENTILE_CONT(CitingInstitutions_GiniSim,0.75) OVER(PARTITION BY CitationCount, year) AS CitingInstitutions_GiniSim_perc75,
PERCENTILE_CONT(CitingInstitutions_GiniSim,1) OVER(PARTITION BY CitationCount, year) AS CitingInstitutions_GiniSim_perc100,

PERCENTILE_CONT(CitingCountries_GiniSim,0) OVER(PARTITION BY CitationCount, year) AS CitingCountries_GiniSim_perc0,
PERCENTILE_CONT(CitingCountries_GiniSim,0.25) OVER(PARTITION BY CitationCount, year) AS CitingCountries_GiniSim_perc25,
PERCENTILE_CONT(CitingCountries_GiniSim,0.5) OVER(PARTITION BY CitationCount, year) AS CitingCountries_GiniSim_perc50,
PERCENTILE_CONT(CitingCountries_GiniSim,0.75) OVER(PARTITION BY CitationCount, year) AS CitingCountries_GiniSim_perc75,
PERCENTILE_CONT(CitingCountries_GiniSim,1) OVER(PARTITION BY CitationCount, year) AS CitingCountries_GiniSim_perc100,

PERCENTILE_CONT(CitingSubregions_GiniSim,0) OVER(PARTITION BY CitationCount, year) AS CitingSubregions_GiniSim_perc0,
PERCENTILE_CONT(CitingSubregions_GiniSim,0.25) OVER(PARTITION BY CitationCount, year) AS CitingSubregions_GiniSim_perc25,
PERCENTILE_CONT(CitingSubregions_GiniSim,0.5) OVER(PARTITION BY CitationCount, year) AS CitingSubregions_GiniSim_perc50,
PERCENTILE_CONT(CitingSubregions_GiniSim,0.75) OVER(PARTITION BY CitationCount, year) AS CitingSubregions_GiniSim_perc75,
PERCENTILE_CONT(CitingSubregions_GiniSim,1) OVER(PARTITION BY CitationCount, year) AS CitingSubregions_GiniSim_perc100,

PERCENTILE_CONT(CitingRegions_GiniSim,0) OVER(PARTITION BY CitationCount, year) AS CitingRegions_GiniSim_perc0,
PERCENTILE_CONT(CitingRegions_GiniSim,0.25) OVER(PARTITION BY CitationCount, year) AS CitingRegions_GiniSim_perc25,
PERCENTILE_CONT(CitingRegions_GiniSim,0.5) OVER(PARTITION BY CitationCount, year) AS CitingRegions_GiniSim_perc50,
PERCENTILE_CONT(CitingRegions_GiniSim,0.75) OVER(PARTITION BY CitationCount, year) AS CitingRegions_GiniSim_perc75,
PERCENTILE_CONT(CitingRegions_GiniSim,1) OVER(PARTITION BY CitationCount, year) AS CitingRegions_GiniSim_perc100,

-- Shannon metric: same four groupings
PERCENTILE_CONT(CitingInstitutions_Shannon,0) OVER(PARTITION BY CitationCount, year) AS CitingInstitutions_Shannon_perc0,
PERCENTILE_CONT(CitingInstitutions_Shannon,0.25) OVER(PARTITION BY CitationCount, year) AS CitingInstitutions_Shannon_perc25,
PERCENTILE_CONT(CitingInstitutions_Shannon,0.5) OVER(PARTITION BY CitationCount, year) AS CitingInstitutions_Shannon_perc50,
PERCENTILE_CONT(CitingInstitutions_Shannon,0.75) OVER(PARTITION BY CitationCount, year) AS CitingInstitutions_Shannon_perc75,
PERCENTILE_CONT(CitingInstitutions_Shannon,1) OVER(PARTITION BY CitationCount, year) AS CitingInstitutions_Shannon_perc100,

PERCENTILE_CONT(CitingCountries_Shannon,0) OVER(PARTITION BY CitationCount, year) AS CitingCountries_Shannon_perc0,
PERCENTILE_CONT(CitingCountries_Shannon,0.25) OVER(PARTITION BY CitationCount, year) AS CitingCountries_Shannon_perc25,
PERCENTILE_CONT(CitingCountries_Shannon,0.5) OVER(PARTITION BY CitationCount, year) AS CitingCountries_Shannon_perc50,
PERCENTILE_CONT(CitingCountries_Shannon,0.75) OVER(PARTITION BY CitationCount, year) AS CitingCountries_Shannon_perc75,
PERCENTILE_CONT(CitingCountries_Shannon,1) OVER(PARTITION BY CitationCount, year) AS CitingCountries_Shannon_perc100,

PERCENTILE_CONT(CitingSubregions_Shannon,0) OVER(PARTITION BY CitationCount, year) AS CitingSubregions_Shannon_perc0,
PERCENTILE_CONT(CitingSubregions_Shannon,0.25) OVER(PARTITION BY CitationCount, year) AS CitingSubregions_Shannon_perc25,
PERCENTILE_CONT(CitingSubregions_Shannon,0.5) OVER(PARTITION BY CitationCount, year) AS CitingSubregions_Shannon_perc50,
PERCENTILE_CONT(CitingSubregions_Shannon,0.75) OVER(PARTITION BY CitationCount, year) AS CitingSubregions_Shannon_perc75,
PERCENTILE_CONT(CitingSubregions_Shannon,1) OVER(PARTITION BY CitationCount, year) AS CitingSubregions_Shannon_perc100,

PERCENTILE_CONT(CitingRegions_Shannon,0) OVER(PARTITION BY CitationCount, year) AS CitingRegions_Shannon_perc0,
PERCENTILE_CONT(CitingRegions_Shannon,0.25) OVER(PARTITION BY CitationCount, year) AS CitingRegions_Shannon_perc25,
PERCENTILE_CONT(CitingRegions_Shannon,0.5) OVER(PARTITION BY CitationCount, year) AS CitingRegions_Shannon_perc50,
PERCENTILE_CONT(CitingRegions_Shannon,0.75) OVER(PARTITION BY CitationCount, year) AS CitingRegions_Shannon_perc75,
PERCENTILE_CONT(CitingRegions_Shannon,1) OVER(PARTITION BY CitationCount, year) AS CitingRegions_Shannon_perc100
FROM
`coki-scratch-space.karl.citation_diversity_global`
WHERE
-- keep only rows with at least two citations and a known is_oa value
(CitationCount >= 2) AND (is_oa IS NOT NULL)
)
-- Every percentile column is constant within a (year, CitationCount) partition, so
-- ANY_VALUE with the GROUP BY below collapses the repeated rows to one row per group.
SELECT
year,
CitationCount,

ANY_VALUE(CitingInstitutions_GiniSim_perc0) AS CitingInstitutions_GiniSim_perc0,
ANY_VALUE(CitingInstitutions_GiniSim_perc25) AS CitingInstitutions_GiniSim_perc25,
ANY_VALUE(CitingInstitutions_GiniSim_perc50) AS CitingInstitutions_GiniSim_perc50,
ANY_VALUE(CitingInstitutions_GiniSim_perc75) AS CitingInstitutions_GiniSim_perc75,
ANY_VALUE(CitingInstitutions_GiniSim_perc100) AS CitingInstitutions_GiniSim_perc100,

ANY_VALUE(CitingCountries_GiniSim_perc0) AS CitingCountries_GiniSim_perc0,
ANY_VALUE(CitingCountries_GiniSim_perc25) AS CitingCountries_GiniSim_perc25,
ANY_VALUE(CitingCountries_GiniSim_perc50) AS CitingCountries_GiniSim_perc50,
ANY_VALUE(CitingCountries_GiniSim_perc75) AS CitingCountries_GiniSim_perc75,
ANY_VALUE(CitingCountries_GiniSim_perc100) AS CitingCountries_GiniSim_perc100,

ANY_VALUE(CitingSubregions_GiniSim_perc0) AS CitingSubregions_GiniSim_perc0,
ANY_VALUE(CitingSubregions_GiniSim_perc25) AS CitingSubregions_GiniSim_perc25,
ANY_VALUE(CitingSubregions_GiniSim_perc50) AS CitingSubregions_GiniSim_perc50,
ANY_VALUE(CitingSubregions_GiniSim_perc75) AS CitingSubregions_GiniSim_perc75,
ANY_VALUE(CitingSubregions_GiniSim_perc100) AS CitingSubregions_GiniSim_perc100,

ANY_VALUE(CitingRegions_GiniSim_perc0) AS CitingRegions_GiniSim_perc0,
ANY_VALUE(CitingRegions_GiniSim_perc25) AS CitingRegions_GiniSim_perc25,
ANY_VALUE(CitingRegions_GiniSim_perc50) AS CitingRegions_GiniSim_perc50,
ANY_VALUE(CitingRegions_GiniSim_perc75) AS CitingRegions_GiniSim_perc75,
ANY_VALUE(CitingRegions_GiniSim_perc100) AS CitingRegions_GiniSim_perc100,

ANY_VALUE(CitingInstitutions_Shannon_perc0) AS CitingInstitutions_Shannon_perc0,
ANY_VALUE(CitingInstitutions_Shannon_perc25) AS CitingInstitutions_Shannon_perc25,
ANY_VALUE(CitingInstitutions_Shannon_perc50) AS CitingInstitutions_Shannon_perc50,
ANY_VALUE(CitingInstitutions_Shannon_perc75) AS CitingInstitutions_Shannon_perc75,
ANY_VALUE(CitingInstitutions_Shannon_perc100) AS CitingInstitutions_Shannon_perc100,

ANY_VALUE(CitingCountries_Shannon_perc0) AS CitingCountries_Shannon_perc0,
ANY_VALUE(CitingCountries_Shannon_perc25) AS CitingCountries_Shannon_perc25,
ANY_VALUE(CitingCountries_Shannon_perc50) AS CitingCountries_Shannon_perc50,
ANY_VALUE(CitingCountries_Shannon_perc75) AS CitingCountries_Shannon_perc75,
ANY_VALUE(CitingCountries_Shannon_perc100) AS CitingCountries_Shannon_perc100,

ANY_VALUE(CitingSubregions_Shannon_perc0) AS CitingSubregions_Shannon_perc0,
ANY_VALUE(CitingSubregions_Shannon_perc25) AS CitingSubregions_Shannon_perc25,
ANY_VALUE(CitingSubregions_Shannon_perc50) AS CitingSubregions_Shannon_perc50,
ANY_VALUE(CitingSubregions_Shannon_perc75) AS CitingSubregions_Shannon_perc75,
ANY_VALUE(CitingSubregions_Shannon_perc100) AS CitingSubregions_Shannon_perc100,

ANY_VALUE(CitingRegions_Shannon_perc0) AS CitingRegions_Shannon_perc0,
ANY_VALUE(CitingRegions_Shannon_perc25) AS CitingRegions_Shannon_perc25,
ANY_VALUE(CitingRegions_Shannon_perc50) AS CitingRegions_Shannon_perc50,
ANY_VALUE(CitingRegions_Shannon_perc75) AS CitingRegions_Shannon_perc75,
ANY_VALUE(CitingRegions_Shannon_perc100) AS CitingRegions_Shannon_perc100

FROM
data_perc
GROUP BY year, CitationCount
ORDER BY year, CitationCount
Loading

0 comments on commit 84abbd3

Please sign in to comment.