Skip to content

Commit

Permalink
updated sql files with summaries and descriptions; updated summary st…
Browse files Browse the repository at this point in the history
…ats by regions and subregions queries to create unnested tables and removed redundant variables; updated functions for graphs comparing citations between regions and subregions; updated figure 3 functions to cater for the above changes.
  • Loading branch information
huangkarl committed Aug 17, 2022
1 parent 28c8859 commit bace3e2
Show file tree
Hide file tree
Showing 26 changed files with 6,204 additions and 1,242 deletions.
326 changes: 121 additions & 205 deletions process.py

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -12,46 +12,33 @@ karl.huang@curtin.edu.au
table bigquery://coki-scratch-space.karl.citation_diversity_global
## Creates
file summary_stats_by_region_atleast2cit.json
file summary_stats_by_region_atleast2cit.csv
*/

WITH
datatemp1 AS (
SELECT
doi,
region.name AS region_cited,
year,
is_oa,
region.name AS region,
CitingRegions_name,
CitingRegions_table,
CitingRegions_count_uniq,
CitingRegions_GiniSim,
CitingRegions_Shannon,
PERCENTILE_CONT(CitingRegions_count_uniq,0.5) OVER(PARTITION BY region.name, year, is_oa) AS CitingRegions_count_uniq_median,
PERCENTILE_CONT(CitingRegions_GiniSim,0.5) OVER(PARTITION BY region.name, year, is_oa) AS CitingRegions_GiniSim_median,
PERCENTILE_CONT(CitingRegions_Shannon,0.5) OVER(PARTITION BY region.name, year, is_oa) AS CitingRegions_Shannon_median
FROM `coki-scratch-space.karl.citation_diversity_global`, UNNEST(regions) AS region
WHERE CitationCount >= 2
ARRAY_CONCAT_AGG(CitingRegions_table) AS CitingRegions_table_temp,
COUNT(doi) AS count_doi
FROM `coki-scratch-space.citation_diversity_analysis.citation_diversity_global`, UNNEST(regions) AS region
WHERE (CitationCount >= 2) AND (region.name IS NOT NULL) AND (year IS NOT NULL) AND (is_oa IS NOT NULL)
GROUP BY region.name, year, is_oa
),
datatemp2 AS (
SELECT
region,
year,
is_oa,
ARRAY_CONCAT_AGG(CitingRegions_table) AS CitingRegions_table_temp,
AVG(CitingRegions_count_uniq) AS count_uniq_mean,
ANY_VALUE(CitingRegions_count_uniq_median) AS count_uniq_median,
AVG(CitingRegions_GiniSim) AS GiniSim_mean,
ANY_VALUE(CitingRegions_GiniSim_median) AS GiniSim_median,
AVG(CitingRegions_Shannon) AS Shannon_mean,
ANY_VALUE(CitingRegions_Shannon_median) AS Shannon_median,
COUNT(doi) AS count_doi
FROM datatemp1
WHERE (region IS NOT NULL) AND (year IS NOT NULL) AND (is_oa IS NOT NULL)
GROUP BY region, year, is_oa
* EXCEPT(CitingRegions_table_temp),
ARRAY(SELECT AS STRUCT name, SUM(count) AS total FROM UNNEST(CitingRegions_table_temp) AS X GROUP BY name) AS CitingRegions_table_all
FROM datatemp1
)
SELECT
* EXCEPT(CitingRegions_table_temp),
ARRAY(SELECT AS STRUCT name, SUM(count) AS total FROM UNNEST(CitingRegions_table_temp) AS X GROUP BY name) AS CitingRegions_table_all
FROM datatemp2
region_cited,
year,
is_oa,
count_doi,
X.name AS region_citing,
X.total AS region_citing_count
FROM datatemp2, UNNEST(CitingRegions_table_all) AS X
Original file line number Diff line number Diff line change
Expand Up @@ -12,45 +12,32 @@ karl.huang@curtin.edu.au
table bigquery://coki-scratch-space.karl.citation_diversity_global
## Creates
file summary_stats_by_subregion_atleast2cit.json
file summary_stats_by_subregion_atleast2cit.csv
*/
WITH
datatemp1 AS (
SELECT
doi,
subregion.name AS subregion_cited,
year,
is_oa,
subregion.name AS subregion,
CitingSubregions_name,
CitingSubregions_table,
CitingSubregions_count_uniq,
CitingSubregions_GiniSim,
CitingSubregions_Shannon,
PERCENTILE_CONT(CitingSubregions_count_uniq,0.5) OVER(PARTITION BY subregion.name, year, is_oa) AS CitingSubregions_count_uniq_median,
PERCENTILE_CONT(CitingSubregions_GiniSim,0.5) OVER(PARTITION BY subregion.name, year, is_oa) AS CitingSubregions_GiniSim_median,
PERCENTILE_CONT(CitingSubregions_Shannon,0.5) OVER(PARTITION BY subregion.name, year, is_oa) AS CitingSubregions_Shannon_median
FROM `coki-scratch-space.karl.citation_diversity_global`, UNNEST(subregions) AS subregion
WHERE CitationCount >= 2
ARRAY_CONCAT_AGG(CitingSubregions_table) AS CitingSubregions_table_temp,
COUNT(doi) AS count_doi
FROM `coki-scratch-space.citation_diversity_analysis.citation_diversity_global`, UNNEST(subregions) AS subregion
WHERE (CitationCount >= 2) AND (subregion.name IS NOT NULL) AND (year IS NOT NULL) AND (is_oa IS NOT NULL)
GROUP BY subregion.name, year, is_oa
),
datatemp2 AS (
SELECT
subregion,
year,
is_oa,
ARRAY_CONCAT_AGG(CitingSubregions_table) AS CitingSubregions_table_temp,
AVG(CitingSubregions_count_uniq) AS count_uniq_mean,
ANY_VALUE(CitingSubregions_count_uniq_median) AS count_uniq_median,
AVG(CitingSubregions_GiniSim) AS GiniSim_mean,
ANY_VALUE(CitingSubregions_GiniSim_median) AS GiniSim_median,
AVG(CitingSubregions_Shannon) AS Shannon_mean,
ANY_VALUE(CitingSubregions_Shannon_median) AS Shannon_median,
COUNT(doi) AS count_doi
FROM datatemp1
WHERE (subregion IS NOT NULL) AND (year IS NOT NULL) AND (is_oa IS NOT NULL)
GROUP BY subregion, year, is_oa
* EXCEPT(CitingSubregions_table_temp),
ARRAY(SELECT AS STRUCT name, SUM(count) AS total FROM UNNEST(CitingSubregions_table_temp) AS X GROUP BY name) AS CitingSubregions_table_all
FROM datatemp1
)
SELECT
* EXCEPT(CitingSubregions_table_temp),
ARRAY(SELECT AS STRUCT name, SUM(count) AS total FROM UNNEST(CitingSubregions_table_temp) AS X GROUP BY name) AS CitingSubregions_table_all
FROM datatemp2
subregion_cited,
year,
is_oa,
count_doi,
X.name AS subregion_citing,
X.total AS subregion_citing_count
FROM datatemp2, UNNEST(CitingSubregions_table_all) AS X
30 changes: 26 additions & 4 deletions report_data_processing/sql_templates/cit_div_by_field.sql
Original file line number Diff line number Diff line change
@@ -1,3 +1,25 @@
/*
## Summary
Generates yearly summary diversity scores for each field of study
## Description
Creates a table that lists, for each field and each publication year, the median and mean:
- numbers of unique citing groups;
- GiniSim scores by citing groups;
- Shannon scores by citing groups;
as per OA category.
## Contacts
karl.huang@curtin.edu.au
## Requires
table bigquery://{citation_diversity_table}
## Creates
file cit_div_by_field.csv
*/

WITH
data_noa AS (
SELECT
Expand Down Expand Up @@ -68,7 +90,7 @@ WITH
PERCENTILE_CONT(CitingFields_count_uniq,0.5) OVER(PARTITION BY temp_field.DisplayName, year) AS CitingFields_count_uniq_perc50,
PERCENTILE_CONT(CitingFields_GiniSim,0.5) OVER(PARTITION BY temp_field.DisplayName, year) AS CitingFields_GiniSim_perc50,
PERCENTILE_CONT(CitingFields_Shannon,0.5) OVER(PARTITION BY temp_field.DisplayName, year) AS CitingFields_Shannon_perc50
FROM (SELECT * FROM `coki-scratch-space.citation_diversity_analysis.citation_diversity_global` WHERE (CitationCount >= 2) AND (is_oa IS false)),
FROM (SELECT * FROM `{citation_diversity_table}` WHERE (CitationCount >= 2) AND (is_oa IS false)),
UNNEST(fields) AS temp_field
)
GROUP BY field, year
Expand Down Expand Up @@ -142,7 +164,7 @@ WITH
PERCENTILE_CONT(CitingFields_count_uniq,0.5) OVER(PARTITION BY temp_field.DisplayName, year) AS CitingFields_count_uniq_perc50,
PERCENTILE_CONT(CitingFields_GiniSim,0.5) OVER(PARTITION BY temp_field.DisplayName, year) AS CitingFields_GiniSim_perc50,
PERCENTILE_CONT(CitingFields_Shannon,0.5) OVER(PARTITION BY temp_field.DisplayName, year) AS CitingFields_Shannon_perc50
FROM (SELECT * FROM `coki-scratch-space.citation_diversity_analysis.citation_diversity_global` WHERE (CitationCount >= 2) AND (is_oa IS true)),
FROM (SELECT * FROM `{citation_diversity_table}` WHERE (CitationCount >= 2) AND (is_oa IS true)),
UNNEST(fields) AS temp_field
)
GROUP BY field, year
Expand Down Expand Up @@ -216,7 +238,7 @@ WITH
PERCENTILE_CONT(CitingFields_count_uniq,0.5) OVER(PARTITION BY temp_field.DisplayName, year) AS CitingFields_count_uniq_perc50,
PERCENTILE_CONT(CitingFields_GiniSim,0.5) OVER(PARTITION BY temp_field.DisplayName, year) AS CitingFields_GiniSim_perc50,
PERCENTILE_CONT(CitingFields_Shannon,0.5) OVER(PARTITION BY temp_field.DisplayName, year) AS CitingFields_Shannon_perc50
FROM (SELECT * FROM `coki-scratch-space.citation_diversity_analysis.citation_diversity_global` WHERE (CitationCount >= 2) AND (gold IS true)),
FROM (SELECT * FROM `{citation_diversity_table}` WHERE (CitationCount >= 2) AND (gold IS true)),
UNNEST(fields) AS temp_field
)
GROUP BY field, year
Expand Down Expand Up @@ -290,7 +312,7 @@ WITH
PERCENTILE_CONT(CitingFields_count_uniq,0.5) OVER(PARTITION BY temp_field.DisplayName, year) AS CitingFields_count_uniq_perc50,
PERCENTILE_CONT(CitingFields_GiniSim,0.5) OVER(PARTITION BY temp_field.DisplayName, year) AS CitingFields_GiniSim_perc50,
PERCENTILE_CONT(CitingFields_Shannon,0.5) OVER(PARTITION BY temp_field.DisplayName, year) AS CitingFields_Shannon_perc50
FROM (SELECT * FROM `coki-scratch-space.citation_diversity_analysis.citation_diversity_global` WHERE (CitationCount >= 2) AND (green IS true)),
FROM (SELECT * FROM `{citation_diversity_table}` WHERE (CitationCount >= 2) AND (green IS true)),
UNNEST(fields) AS temp_field
)
GROUP BY field, year
Expand Down
6 changes: 4 additions & 2 deletions report_data_processing/sql_templates/cit_div_vs_cit_count.sql
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
/*
## Summary
NEEDS A SUMMARY HERE
Generates annually the quartile diversity scores as per citation count
## Description
Creates a table that lists, for each publication year and each citation count, the quartiles in:
- GiniSim scores by citing groups;
- Shannon scores by citing groups.
## Contacts
karl.huang@curtin.edu.au
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
/*
## Summary
Creates the main DOI level citation diversity table to be deployed to BigQuery
## Description
Using the DOI table in Academic Observatory and MAG reference table to link all citing papers to each DOI.
Then calculate various diversity statistics/summaries for each DOI.
## Contacts
karl.huang@curtin.edu.au
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
/*
## Summary
NEW SUMMARY DESCRIPTION
Generates samples of outputs across OA and non-OA papers and across citation groups for a given year
## Description
Creates a table, for a given publication year, that contains samples of DOIs, with their corresponding:
- numbers of unique citing groups;
- GiniSim scores by citing groups;
- Shannon scores by citing groups.
The DOIs are randomly sampled for each citation group, with 2000 OA outputs and 2000 non-OA outputs per citation group.
## Contacts
karl.huang@curtin.edu.au
Expand Down
8 changes: 6 additions & 2 deletions report_data_processing/sql_templates/samples_by_oa.sql
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
/*
## Summary
NEW SUMMARY DESCRIPTION
Generates samples of outputs across OA categories for a given year
## Description
Creates a table, for a given publication year, that contains samples of DOIs, with their corresponding:
- numbers of unique citing groups;
- GiniSim scores by citing groups;
- Shannon scores by citing groups.
The DOIs are randomly sampled for each OA category, with 10,000 outputs per category.
## Contacts
karl.huang@curtin.edu.au
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
/*
## Summary
CHANGE THIS
Generates annual citation counts between regions
## Description
Creates a table that lists for each cited region and publication year:
- DOI counts for OA and non-OA outputs;
- Number of citations from all citing region to OA papers affiliated to the cited region;
- Number of citations from all citing region to non-OA papers affiliated to the cited region.
## Contacts
karl.huang@curtin.edu.au
Expand All @@ -12,46 +15,33 @@ karl.huang@curtin.edu.au
table bigquery://{citation_diversity_table}
## Creates
file summary_stats_by_region_atleast2cit.json
file summary_stats_by_region_atleast2cit.csv
*/

WITH
datatemp1 AS (
SELECT
doi,
region.name AS region_cited,
year,
is_oa,
region.name AS region,
CitingRegions_name,
CitingRegions_table,
CitingRegions_count_uniq,
CitingRegions_GiniSim,
CitingRegions_Shannon,
PERCENTILE_CONT(CitingRegions_count_uniq,0.5) OVER(PARTITION BY region.name, year, is_oa) AS CitingRegions_count_uniq_median,
PERCENTILE_CONT(CitingRegions_GiniSim,0.5) OVER(PARTITION BY region.name, year, is_oa) AS CitingRegions_GiniSim_median,
PERCENTILE_CONT(CitingRegions_Shannon,0.5) OVER(PARTITION BY region.name, year, is_oa) AS CitingRegions_Shannon_median
ARRAY_CONCAT_AGG(CitingRegions_table) AS CitingRegions_table_temp,
COUNT(doi) AS count_doi
FROM `{citation_diversity_table}`, UNNEST(regions) AS region
WHERE CitationCount >= 2
WHERE (CitationCount >= 2) AND (region.name IS NOT NULL) AND (year IS NOT NULL) AND (is_oa IS NOT NULL)
GROUP BY region.name, year, is_oa
),
datatemp2 AS (
SELECT
region,
year,
is_oa,
ARRAY_CONCAT_AGG(CitingRegions_table) AS CitingRegions_table_temp,
AVG(CitingRegions_count_uniq) AS count_uniq_mean,
ANY_VALUE(CitingRegions_count_uniq_median) AS count_uniq_median,
AVG(CitingRegions_GiniSim) AS GiniSim_mean,
ANY_VALUE(CitingRegions_GiniSim_median) AS GiniSim_median,
AVG(CitingRegions_Shannon) AS Shannon_mean,
ANY_VALUE(CitingRegions_Shannon_median) AS Shannon_median,
COUNT(doi) AS count_doi
FROM datatemp1
WHERE (region IS NOT NULL) AND (year IS NOT NULL) AND (is_oa IS NOT NULL)
GROUP BY region, year, is_oa
* EXCEPT(CitingRegions_table_temp),
ARRAY(SELECT AS STRUCT name, SUM(count) AS total FROM UNNEST(CitingRegions_table_temp) AS X GROUP BY name) AS CitingRegions_table_all
FROM datatemp1
)
SELECT
* EXCEPT(CitingRegions_table_temp),
ARRAY(SELECT AS STRUCT name, SUM(count) AS total FROM UNNEST(CitingRegions_table_temp) AS X GROUP BY name) AS CitingRegions_table_all
FROM datatemp2
region_cited,
year,
is_oa,
count_doi,
X.name AS region_citing,
X.total AS region_citing_count
FROM datatemp2, UNNEST(CitingRegions_table_all) AS X
Loading

0 comments on commit bace3e2

Please sign in to comment.