Skip to content

Commit

Permalink
Clean up and improve get studies sql statements (#113)
Browse files Browse the repository at this point in the history
* refactor: Replace textual sql w/ orm'd queries

* refactor: Use model classes directly in sql

purely for tidiness

* refactor: Use newer api for any() construct

* tests: Add one more test case for get studies

* docs: Remove code todo
  • Loading branch information
bdewilde authored May 16, 2024
1 parent bba11db commit 9edc2db
Show file tree
Hide file tree
Showing 2 changed files with 197 additions and 118 deletions.
314 changes: 196 additions & 118 deletions colandr/apis/resources/studies.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,9 +139,6 @@ def put(self, args, id):
return StudySchema().dump(study)


# TODO: port these text subqueries over to sqlalchemy orm equivalents


@ns.route("")
@ns.doc(
summary="get collections of matching studies",
Expand Down Expand Up @@ -305,149 +302,230 @@ def get(
if fields and "id" not in fields:
fields.append("id")

stmt = sa.select(models.Study).where(models.Study.review_id == review_id)
from ...models import Screening, Study

stmt = sa.select(Study).where(Study.review_id == review_id)

if dedupe_status is not None:
stmt = stmt.where(models.Study.dedupe_status == dedupe_status)
stmt = stmt.where(Study.dedupe_status == dedupe_status)

if citation_status is not None:
if citation_status in {"conflict", "excluded", "included"}:
stmt = stmt.where(models.Study.citation_status == citation_status)
stmt = stmt.where(Study.citation_status == citation_status)
elif citation_status == "pending":
substmt = """
SELECT t.id
FROM (
SELECT
studies.id,
studies.dedupe_status,
studies.citation_status,
screenings_.user_ids
FROM studies
LEFT JOIN (
SELECT
study_id,
ARRAY_AGG(user_id) AS user_ids
FROM screenings
WHERE stage = 'citation'
GROUP BY study_id
) AS screenings_ ON studies.id = screenings_.study_id
) AS t
WHERE
t.dedupe_status = 'not_duplicate' -- this is necessary!
AND t.citation_status NOT IN ('excluded', 'included', 'conflict')
AND (t.citation_status = 'not_screened' OR NOT {user_id} = ANY(t.user_ids))
""".format(user_id=current_user.id)
stmt = stmt.where(models.Study.id.in_(sa.text(substmt)))
user_screened_sq = (
sa.select(Screening.study_id).where(
Screening.stage == "citation",
Screening.user_id == current_user.id,
)
).subquery("user_screened")
study_ids_sq = (
sa.select(Study.id)
.where(
Study.dedupe_status == "not_duplicate",
Study.citation_status.not_in(
["included", "excluded", "conflict"]
),
)
.join(
user_screened_sq,
Study.id == user_screened_sq.c.study_id,
isouter=True,
)
).where(
sa.or_(
Study.citation_status == "not_screened",
user_screened_sq.c.study_id == None,
)
)
stmt = stmt.where(Study.id.in_(study_ids_sq))
# old, less performant, textual equivalent
# substmt = """
# SELECT t.id
# FROM (
# SELECT
# studies.id,
# studies.dedupe_status,
# studies.citation_status,
# screenings_.user_ids
# FROM studies
# LEFT JOIN (
# SELECT
# study_id,
# ARRAY_AGG(user_id) AS user_ids
# FROM screenings
# WHERE stage = 'citation'
# GROUP BY study_id
# ) AS screenings_ ON studies.id = screenings_.study_id
# ) AS t
# WHERE
# t.dedupe_status = 'not_duplicate' -- this is necessary!
# AND t.citation_status NOT IN ('excluded', 'included', 'conflict')
# AND (t.citation_status = 'not_screened' OR NOT {user_id} = ANY(t.user_ids))
# """.format(user_id=current_user.id)
# stmt = stmt.where(models.Study.id.in_(sa.text(substmt)))
elif citation_status == "awaiting_coscreener":
substmt = """
SELECT t.id
FROM (
SELECT
studies.id,
studies.citation_status,
screenings.user_ids
FROM studies
LEFT JOIN (
SELECT
study_id,
ARRAY_AGG(user_id) AS user_ids
FROM screenings
WHERE stage = 'citation'
GROUP BY study_id
) AS screenings_ ON studies.id = screenings_.study_id
) AS t
WHERE
t.citation_status = 'screened_once'
AND {user_id} = ANY(t.user_ids)
""".format(user_id=current_user.id)
stmt = stmt.where(models.Study.id.in_(sa.text(substmt)))
user_screened_sq = (
sa.select(Screening.study_id).where(
Screening.stage == "citation",
Screening.user_id == current_user.id,
)
).subquery("user_screened")
study_ids_sq = (
sa.select(Study.id)
.where(Study.citation_status == "screened_once")
.join(
user_screened_sq,
Study.id == user_screened_sq.c.study_id,
isouter=True,
)
).where(user_screened_sq.c.study_id != None)
stmt = stmt.where(Study.id.in_(study_ids_sq))
# old, less performant, textual equivalent
# substmt = """
# SELECT t.id
# FROM (
# SELECT
# studies.id,
# studies.citation_status,
# screenings_.user_ids
# FROM studies
# LEFT JOIN (
# SELECT
# study_id,
# ARRAY_AGG(user_id) AS user_ids
# FROM screenings
# WHERE stage = 'citation'
# GROUP BY study_id
# ) AS screenings_ ON studies.id = screenings_.study_id
# ) AS t
# WHERE
# t.citation_status = 'screened_once'
# AND {user_id} = ANY(t.user_ids)
# """.format(user_id=current_user.id)
# stmt = stmt.where(models.Study.id.in_(sa.text(substmt)))

if fulltext_status is not None:
if fulltext_status in {"conflict", "excluded", "included"}:
stmt = stmt.where(models.Study.fulltext_status == fulltext_status)
stmt = stmt.where(Study.fulltext_status == fulltext_status)
elif fulltext_status == "pending":
substmt = """
SELECT id
FROM (
SELECT
studies.id,
studies.citation_status,
studies.fulltext_status,
screenings_.user_ids
FROM studies
LEFT JOIN (
SELECT
study_id,
ARRAY_AGG(user_id) AS user_ids
FROM screenings
WHERE stage = 'fulltext'
GROUP BY study_id
) AS screenings_ ON studies.id = screenings_.study_id
) AS t
WHERE
citation_status = 'included' -- this is necessary!
AND fulltext_status NOT IN ('excluded', 'included', 'conflict')
AND (fulltext_status = 'not_screened' OR NOT {user_id} = ANY(user_ids))
""".format(user_id=current_user.id)
stmt = stmt.where(models.Study.id.in_(sa.text(substmt)))
user_screened_sq = (
sa.select(Screening.study_id).where(
Screening.stage == "fulltext",
Screening.user_id == current_user.id,
)
).subquery("user_screened")
study_ids_sq = (
sa.select(Study.id)
.where(
Study.citation_status == "included",
Study.fulltext_status.not_in(
["included", "excluded", "conflict"]
),
)
.join(
user_screened_sq,
Study.id == user_screened_sq.c.study_id,
isouter=True,
)
).where(
sa.or_(
Study.fulltext_status == "not_screened",
user_screened_sq.c.study_id == None,
)
)
stmt = stmt.where(Study.id.in_(study_ids_sq))
# old, less performant, textual equivalent
# substmt = """
# SELECT id
# FROM (
# SELECT
# studies.id,
# studies.citation_status,
# studies.fulltext_status,
# screenings_.user_ids
# FROM studies
# LEFT JOIN (
# SELECT
# study_id,
# ARRAY_AGG(user_id) AS user_ids
# FROM screenings
# WHERE stage = 'fulltext'
# GROUP BY study_id
# ) AS screenings_ ON studies.id = screenings_.study_id
# ) AS t
# WHERE
# citation_status = 'included' -- this is necessary!
# AND fulltext_status NOT IN ('excluded', 'included', 'conflict')
# AND (fulltext_status = 'not_screened' OR NOT {user_id} = ANY(user_ids))
# """.format(user_id=current_user.id)
# stmt = stmt.where(models.Study.id.in_(sa.text(substmt)))
elif fulltext_status == "awaiting_coscreener":
substmt = """
SELECT t.id
FROM (
SELECT
studies.id,
studies.fulltext_status,
screenings_.user_ids
FROM studies
LEFT JOIN (
SELECT
study_id,
ARRAY_AGG(user_id) AS user_ids
FROM screenings
WHERE stage = 'fulltext'
GROUP BY study_id
) AS screenings_ ON studies.id = screenings_.study_id
) AS t
WHERE
t.fulltext_status = 'screened_once'
AND {user_id} = ANY(t.user_ids)
""".format(user_id=current_user.id)
stmt = stmt.where(models.Study.id.in_(sa.text(substmt)))
user_screened_sq = (
sa.select(Screening.study_id).where(
Screening.stage == "fulltext",
Screening.user_id == current_user.id,
)
).subquery("user_screened")
study_ids_sq = (
sa.select(Study.id)
.where(Study.fulltext_status == "screened_once")
.join(
user_screened_sq,
Study.id == user_screened_sq.c.study_id,
isouter=True,
)
).where(user_screened_sq.c.study_id != None)
stmt = stmt.where(Study.id.in_(study_ids_sq))
# old, less performant, textual equivalent
# substmt = """
# SELECT t.id
# FROM (
# SELECT
# studies.id,
# studies.fulltext_status,
# screenings_.user_ids
# FROM studies
# LEFT JOIN (
# SELECT
# study_id,
# ARRAY_AGG(user_id) AS user_ids
# FROM screenings
# WHERE stage = 'fulltext'
# GROUP BY study_id
# ) AS screenings_ ON studies.id = screenings_.study_id
# ) AS t
# WHERE
# t.fulltext_status = 'screened_once'
# AND {user_id} = ANY(t.user_ids)
# """.format(user_id=current_user.id)
# stmt = stmt.where(models.Study.id.in_(sa.text(substmt)))

if data_extraction_status is not None:
if data_extraction_status == "not_started":
stmt = stmt.where(
models.Study.data_extraction_status == data_extraction_status
).where(
models.Study.fulltext_status == "included"
) # this is necessary!
Study.fulltext_status == "included", # this is necessary!
Study.data_extraction_status == data_extraction_status,
)
else:
stmt = stmt.where(
models.Study.data_extraction_status == data_extraction_status
Study.data_extraction_status == data_extraction_status
)

if num_citation_reviewers is not None:
stmt = stmt.where(
models.Study.num_citation_reviewers == num_citation_reviewers
)
stmt = stmt.where(Study.num_citation_reviewers == num_citation_reviewers)
if num_fulltext_reviewers is not None:
stmt = stmt.where(
models.Study.num_fulltext_reviewers == num_fulltext_reviewers
)
stmt = stmt.where(Study.num_fulltext_reviewers == num_fulltext_reviewers)

if tag:
stmt = stmt.where(models.Study.tags.any(tag, operator=operators.eq))
stmt = stmt.where(Study.tags.any_() == tag)

if tsquery and order_by != "relevance": # HACK...
stmt = stmt.where(models.Study.citation_text_content.match(tsquery))
stmt = stmt.where(Study.citation_text_content.match(tsquery))

# order, offset, and limit
if order_by == "recency":
order_by = (
sa.desc(models.Study.id)
if order_dir == "DESC"
else sa.asc(models.Study.id)
)
order_by = sa.desc(Study.id) if order_dir == "DESC" else sa.asc(Study.id)
stmt = stmt.order_by(order_by)
stmt = stmt.offset(page * per_page).limit(per_page)
return StudySchema(many=True, only=fields).dump(
Expand All @@ -456,7 +534,7 @@ def get(

elif order_by == "relevance":
if tsquery:
stmt = stmt.where(models.Study.citation_text_content.match(tsquery))
stmt = stmt.where(Study.citation_text_content.match(tsquery))

# get results and corresponding relevance scores
stmt = stmt.order_by(db.func.random()).limit(1000)
Expand Down
1 change: 1 addition & 0 deletions tests/api/test_studies.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ class TestStudiesResource:
(1, {"review_id": 1, "citation_status": "included"}, [1, 2]),
(1, {"review_id": 1, "citation_status": "excluded"}, [3]),
(1, {"review_id": 1, "fulltext_status": "included"}, [1]),
(2, {"review_id": 1, "citation_status": "awaiting_coscreener"}, []),
(3, {"review_id": 2, "fulltext_status": "pending"}, [4]),
(1, {"review_id": 1, "num_citation_reviewers": 1}, [1, 2, 3]),
(1, {"review_id": 1, "num_citation_reviewers": 2}, []),
Expand Down

0 comments on commit 9edc2db

Please sign in to comment.