Clean up and improve get studies sql statements (#113)

* refactor: Replace textual sql w/ orm'd queries * refactor: Use model classes directly in sql purely for tidiness * refactor: Use newer api for any() construct * tests: Add one more test case for get studies * docs: Remove code todo
datakind · May 16, 2024 · 9edc2db · 9edc2db
1 parent bba11db
commit 9edc2db
Show file tree

Hide file tree

Showing 2 changed files with 197 additions and 118 deletions.
diff --git a/colandr/apis/resources/studies.py b/colandr/apis/resources/studies.py
@@ -139,9 +139,6 @@ def put(self, args, id):
         return StudySchema().dump(study)
 
 
-# TODO: port these text subqueries over to sqlalchemy orm equivalents
-
-
 @ns.route("")
 @ns.doc(
     summary="get collections of matching studies",
@@ -305,149 +302,230 @@ def get(
         if fields and "id" not in fields:
             fields.append("id")
 
-        stmt = sa.select(models.Study).where(models.Study.review_id == review_id)
+        from ...models import Screening, Study
+
+        stmt = sa.select(Study).where(Study.review_id == review_id)
 
         if dedupe_status is not None:
-            stmt = stmt.where(models.Study.dedupe_status == dedupe_status)
+            stmt = stmt.where(Study.dedupe_status == dedupe_status)
 
         if citation_status is not None:
             if citation_status in {"conflict", "excluded", "included"}:
-                stmt = stmt.where(models.Study.citation_status == citation_status)
+                stmt = stmt.where(Study.citation_status == citation_status)
             elif citation_status == "pending":
-                substmt = """
-                    SELECT t.id
-                    FROM (
-                        SELECT
-                            studies.id,
-                            studies.dedupe_status,
-                            studies.citation_status,
-                            screenings_.user_ids
-                        FROM studies
-                        LEFT JOIN (
-                            SELECT
-                                study_id,
-                                ARRAY_AGG(user_id) AS user_ids
-                            FROM screenings
-                            WHERE stage = 'citation'
-                            GROUP BY study_id
-                        ) AS screenings_ ON studies.id = screenings_.study_id
-                    ) AS t
-                    WHERE
-                        t.dedupe_status = 'not_duplicate' -- this is necessary!
-                        AND t.citation_status NOT IN ('excluded', 'included', 'conflict')
-                        AND (t.citation_status = 'not_screened' OR NOT {user_id} = ANY(t.user_ids))
-                    """.format(user_id=current_user.id)
-                stmt = stmt.where(models.Study.id.in_(sa.text(substmt)))
+                user_screened_sq = (
+                    sa.select(Screening.study_id).where(
+                        Screening.stage == "citation",
+                        Screening.user_id == current_user.id,
+                    )
+                ).subquery("user_screened")
+                study_ids_sq = (
+                    sa.select(Study.id)
+                    .where(
+                        Study.dedupe_status == "not_duplicate",
+                        Study.citation_status.not_in(
+                            ["included", "excluded", "conflict"]
+                        ),
+                    )
+                    .join(
+                        user_screened_sq,
+                        Study.id == user_screened_sq.c.study_id,
+                        isouter=True,
+                    )
+                ).where(
+                    sa.or_(
+                        Study.citation_status == "not_screened",
+                        user_screened_sq.c.study_id == None,
+                    )
+                )
+                stmt = stmt.where(Study.id.in_(study_ids_sq))
+                # old, less performant, textual equivalent
+                # substmt = """
+                #     SELECT t.id
+                #     FROM (
+                #         SELECT
+                #             studies.id,
+                #             studies.dedupe_status,
+                #             studies.citation_status,
+                #             screenings_.user_ids
+                #         FROM studies
+                #         LEFT JOIN (
+                #             SELECT
+                #                 study_id,
+                #                 ARRAY_AGG(user_id) AS user_ids
+                #             FROM screenings
+                #             WHERE stage = 'citation'
+                #             GROUP BY study_id
+                #         ) AS screenings_ ON studies.id = screenings_.study_id
+                #     ) AS t
+                #     WHERE
+                #         t.dedupe_status = 'not_duplicate' -- this is necessary!
+                #         AND t.citation_status NOT IN ('excluded', 'included', 'conflict')
+                #         AND (t.citation_status = 'not_screened' OR NOT {user_id} = ANY(t.user_ids))
+                #     """.format(user_id=current_user.id)
+                # stmt = stmt.where(models.Study.id.in_(sa.text(substmt)))
             elif citation_status == "awaiting_coscreener":
-                substmt = """
-                    SELECT t.id
-                    FROM (
-                        SELECT
-                            studies.id,
-                            studies.citation_status,
-                            screenings.user_ids
-                        FROM studies
-                        LEFT JOIN (
-                            SELECT
-                                study_id,
-                                ARRAY_AGG(user_id) AS user_ids
-                            FROM screenings
-                            WHERE stage = 'citation'
-                            GROUP BY study_id
-                        ) AS screenings_ ON studies.id = screenings_.study_id
-                    ) AS t
-                    WHERE
-                        t.citation_status = 'screened_once'
-                        AND {user_id} = ANY(t.user_ids)
-                    """.format(user_id=current_user.id)
-                stmt = stmt.where(models.Study.id.in_(sa.text(substmt)))
+                user_screened_sq = (
+                    sa.select(Screening.study_id).where(
+                        Screening.stage == "citation",
+                        Screening.user_id == current_user.id,
+                    )
+                ).subquery("user_screened")
+                study_ids_sq = (
+                    sa.select(Study.id)
+                    .where(Study.citation_status == "screened_once")
+                    .join(
+                        user_screened_sq,
+                        Study.id == user_screened_sq.c.study_id,
+                        isouter=True,
+                    )
+                ).where(user_screened_sq.c.study_id != None)
+                stmt = stmt.where(Study.id.in_(study_ids_sq))
+                # old, less performant, textual equivalent
+                # substmt = """
+                #     SELECT t.id
+                #     FROM (
+                #         SELECT
+                #             studies.id,
+                #             studies.citation_status,
+                #             screenings_.user_ids
+                #         FROM studies
+                #         LEFT JOIN (
+                #             SELECT
+                #                 study_id,
+                #                 ARRAY_AGG(user_id) AS user_ids
+                #             FROM screenings
+                #             WHERE stage = 'citation'
+                #             GROUP BY study_id
+                #         ) AS screenings_ ON studies.id = screenings_.study_id
+                #     ) AS t
+                #     WHERE
+                #         t.citation_status = 'screened_once'
+                #         AND {user_id} = ANY(t.user_ids)
+                #     """.format(user_id=current_user.id)
+                # stmt = stmt.where(models.Study.id.in_(sa.text(substmt)))
 
         if fulltext_status is not None:
             if fulltext_status in {"conflict", "excluded", "included"}:
-                stmt = stmt.where(models.Study.fulltext_status == fulltext_status)
+                stmt = stmt.where(Study.fulltext_status == fulltext_status)
             elif fulltext_status == "pending":
-                substmt = """
-                    SELECT id
-                    FROM (
-                        SELECT
-                            studies.id,
-                            studies.citation_status,
-                            studies.fulltext_status,
-                            screenings_.user_ids
-                        FROM studies
-                        LEFT JOIN (
-                            SELECT
-                                study_id,
-                                ARRAY_AGG(user_id) AS user_ids
-                            FROM screenings
-                            WHERE stage = 'fulltext'
-                            GROUP BY study_id
-                        ) AS screenings_ ON studies.id = screenings_.study_id
-                    ) AS t
-                    WHERE
-                        citation_status = 'included' -- this is necessary!
-                        AND fulltext_status NOT IN ('excluded', 'included', 'conflict')
-                        AND (fulltext_status = 'not_screened' OR NOT {user_id} = ANY(user_ids))
-                    """.format(user_id=current_user.id)
-                stmt = stmt.where(models.Study.id.in_(sa.text(substmt)))
+                user_screened_sq = (
+                    sa.select(Screening.study_id).where(
+                        Screening.stage == "fulltext",
+                        Screening.user_id == current_user.id,
+                    )
+                ).subquery("user_screened")
+                study_ids_sq = (
+                    sa.select(Study.id)
+                    .where(
+                        Study.citation_status == "included",
+                        Study.fulltext_status.not_in(
+                            ["included", "excluded", "conflict"]
+                        ),
+                    )
+                    .join(
+                        user_screened_sq,
+                        Study.id == user_screened_sq.c.study_id,
+                        isouter=True,
+                    )
+                ).where(
+                    sa.or_(
+                        Study.fulltext_status == "not_screened",
+                        user_screened_sq.c.study_id == None,
+                    )
+                )
+                stmt = stmt.where(Study.id.in_(study_ids_sq))
+                # old, less performant, textual equivalent
+                # substmt = """
+                #     SELECT id
+                #     FROM (
+                #         SELECT
+                #             studies.id,
+                #             studies.citation_status,
+                #             studies.fulltext_status,
+                #             screenings_.user_ids
+                #         FROM studies
+                #         LEFT JOIN (
+                #             SELECT
+                #                 study_id,
+                #                 ARRAY_AGG(user_id) AS user_ids
+                #             FROM screenings
+                #             WHERE stage = 'fulltext'
+                #             GROUP BY study_id
+                #         ) AS screenings_ ON studies.id = screenings_.study_id
+                #     ) AS t
+                #     WHERE
+                #         citation_status = 'included' -- this is necessary!
+                #         AND fulltext_status NOT IN ('excluded', 'included', 'conflict')
+                #         AND (fulltext_status = 'not_screened' OR NOT {user_id} = ANY(user_ids))
+                #     """.format(user_id=current_user.id)
+                # stmt = stmt.where(models.Study.id.in_(sa.text(substmt)))
             elif fulltext_status == "awaiting_coscreener":
-                substmt = """
-                    SELECT t.id
-                    FROM (
-                        SELECT
-                            studies.id,
-                            studies.fulltext_status,
-                            screenings_.user_ids
-                        FROM studies
-                        LEFT JOIN (
-                            SELECT
-                                study_id,
-                                ARRAY_AGG(user_id) AS user_ids
-                            FROM screenings
-                            WHERE stage = 'fulltext'
-                            GROUP BY study_id
-                        ) AS screenings_ ON studies.id = screenings_.study_id
-                    ) AS t
-                    WHERE
-                        t.fulltext_status = 'screened_once'
-                        AND {user_id} = ANY(t.user_ids)
-                    """.format(user_id=current_user.id)
-                stmt = stmt.where(models.Study.id.in_(sa.text(substmt)))
+                user_screened_sq = (
+                    sa.select(Screening.study_id).where(
+                        Screening.stage == "fulltext",
+                        Screening.user_id == current_user.id,
+                    )
+                ).subquery("user_screened")
+                study_ids_sq = (
+                    sa.select(Study.id)
+                    .where(Study.fulltext_status == "screened_once")
+                    .join(
+                        user_screened_sq,
+                        Study.id == user_screened_sq.c.study_id,
+                        isouter=True,
+                    )
+                ).where(user_screened_sq.c.study_id != None)
+                stmt = stmt.where(Study.id.in_(study_ids_sq))
+                # old, less performant, textual equivalent
+                # substmt = """
+                #     SELECT t.id
+                #     FROM (
+                #         SELECT
+                #             studies.id,
+                #             studies.fulltext_status,
+                #             screenings_.user_ids
+                #         FROM studies
+                #         LEFT JOIN (
+                #             SELECT
+                #                 study_id,
+                #                 ARRAY_AGG(user_id) AS user_ids
+                #             FROM screenings
+                #             WHERE stage = 'fulltext'
+                #             GROUP BY study_id
+                #         ) AS screenings_ ON studies.id = screenings_.study_id
+                #     ) AS t
+                #     WHERE
+                #         t.fulltext_status = 'screened_once'
+                #         AND {user_id} = ANY(t.user_ids)
+                #     """.format(user_id=current_user.id)
+                # stmt = stmt.where(models.Study.id.in_(sa.text(substmt)))
 
         if data_extraction_status is not None:
             if data_extraction_status == "not_started":
                 stmt = stmt.where(
-                    models.Study.data_extraction_status == data_extraction_status
-                ).where(
-                    models.Study.fulltext_status == "included"
-                )  # this is necessary!
+                    Study.fulltext_status == "included",  # this is necessary!
+                    Study.data_extraction_status == data_extraction_status,
+                )
             else:
                 stmt = stmt.where(
-                    models.Study.data_extraction_status == data_extraction_status
+                    Study.data_extraction_status == data_extraction_status
                 )
 
         if num_citation_reviewers is not None:
-            stmt = stmt.where(
-                models.Study.num_citation_reviewers == num_citation_reviewers
-            )
+            stmt = stmt.where(Study.num_citation_reviewers == num_citation_reviewers)
         if num_fulltext_reviewers is not None:
-            stmt = stmt.where(
-                models.Study.num_fulltext_reviewers == num_fulltext_reviewers
-            )
+            stmt = stmt.where(Study.num_fulltext_reviewers == num_fulltext_reviewers)
 
         if tag:
-            stmt = stmt.where(models.Study.tags.any(tag, operator=operators.eq))
+            stmt = stmt.where(Study.tags.any_() == tag)
 
         if tsquery and order_by != "relevance":  # HACK...
-            stmt = stmt.where(models.Study.citation_text_content.match(tsquery))
+            stmt = stmt.where(Study.citation_text_content.match(tsquery))
 
         # order, offset, and limit
         if order_by == "recency":
-            order_by = (
-                sa.desc(models.Study.id)
-                if order_dir == "DESC"
-                else sa.asc(models.Study.id)
-            )
+            order_by = sa.desc(Study.id) if order_dir == "DESC" else sa.asc(Study.id)
             stmt = stmt.order_by(order_by)
             stmt = stmt.offset(page * per_page).limit(per_page)
             return StudySchema(many=True, only=fields).dump(
@@ -456,7 +534,7 @@ def get(
 
         elif order_by == "relevance":
             if tsquery:
-                stmt = stmt.where(models.Study.citation_text_content.match(tsquery))
+                stmt = stmt.where(Study.citation_text_content.match(tsquery))
 
             # get results and corresponding relevance scores
             stmt = stmt.order_by(db.func.random()).limit(1000)

diff --git a/tests/api/test_studies.py b/tests/api/test_studies.py
@@ -158,6 +158,7 @@ class TestStudiesResource:
             (1, {"review_id": 1, "citation_status": "included"}, [1, 2]),
             (1, {"review_id": 1, "citation_status": "excluded"}, [3]),
             (1, {"review_id": 1, "fulltext_status": "included"}, [1]),
+            (2, {"review_id": 1, "citation_status": "awaiting_coscreener"}, []),
             (3, {"review_id": 2, "fulltext_status": "pending"}, [4]),
             (1, {"review_id": 1, "num_citation_reviewers": 1}, [1, 2, 3]),
             (1, {"review_id": 1, "num_citation_reviewers": 2}, []),