Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GH-1172] Start System Test for sarscov2_illumina_all Automation #334

Merged
merged 18 commits into from
Mar 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 12 additions & 7 deletions api/src/wfl/environment.clj
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
#(-> "https://clio.gotc-dev.broadinstitute.org")
"WFL_COOKIE_SECRET"
#(-> "secret/dsde/gotc/dev/zero" vault-secrets :cookie_secret)
"WFL_TDR_URL"
#(-> "https://jade.datarepo-dev.broadinstitute.org/")
"WFL_OAUTH2_CLIENT_ID"
#(-> "secret/dsde/gotc/dev/zero" vault-secrets :oauth2_client_id)
"WFL_POSTGRES_PASSWORD"
Expand All @@ -45,19 +47,22 @@
#(-> "jdbc:postgresql:wfl")
"WFL_POSTGRES_USERNAME"
#(-> nil)
;; -- test parameters below this line --
"WFL_FIRECLOUD_URL"
#(-> "https://api.firecloud.org/")

;; -- variables used in test code below this line --
"WFL_CROMWELL_URL"
#(-> "https://cromwell-gotc-auth.gotc-dev.broadinstitute.org")
"WFL_DATA_REPO_SA"
"WFL_TDR_DEFAULT_PROFILE"
#(-> "390e7a85-d47f-4531-b612-165fc977d3bd")
"WFL_TDR_SA"
#(-> "jade-k8-sa@broad-jade-dev.iam.gserviceaccount.com")
"WFL_FIRECLOUD_URL"
#(-> "https://api.firecloud.org/")
"WFL_TERRA_DATA_REPO_URL"
#(-> "https://jade.datarepo-dev.broadinstitute.org/")
"WFL_WFL_URL"
#(-> "https://dev-wfl.gotc-dev.broadinstitute.org")})

(def ^:private __getenv (memoize #(or (System/getenv %) ((defaults %)))))
(def ^:private __getenv
(memoize #(or (System/getenv %) (when-let [init (defaults %)] (init)))))
tbl3rd marked this conversation as resolved.
Show resolved Hide resolved

(def testing
"Override the environment used by `getenv` for testing. DO NOT USE THIS.
Use `wfl.tools.fixtures/with-temporary-environment` instead."
Expand Down
3 changes: 1 addition & 2 deletions api/src/wfl/service/datarepo.clj
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
(java.util.concurrent TimeUnit)))

(defn ^:private datarepo-url [& parts]
(let [url (util/slashify (env/getenv "WFL_TERRA_DATA_REPO_URL"))]
(let [url (util/slashify (env/getenv "WFL_TDR_URL"))]
(apply str url parts)))

(def ^:private repository
Expand Down Expand Up @@ -73,7 +73,6 @@
"ingest"
dataset-id
{:format "json"
:ignore_unknown_values true
:load_tag "string"
:max_bad_records 0
:path path
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@
{
"name": "terra_project",
"datatype": "string"
},
{
"name": "extra",
"datatype": "string"
}
],
"primaryKey": [
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
;; Mapping from dataset column names to sarscov2_illumina_full pipeline inputs,
;; consumed by `wfl.tools.datasets/rename-gather`:
;;   - a plain string names a pipeline input to look up,
;;   - a leading `$` marks the rest of the value as a literal,
;;   - the nested map under :extra is gathered recursively and serialized
;;     as a JSON string.
{:flowcell_tgz "flowcell_tgz",
 :reference_fasta "reference_fasta",
 :amplicon_bed_prefix "amplicon_bed_prefix",
 :biosample_attributes "biosample_attributes",
 :instrument_model "instrument_model",
 :min_genome_bases "min_genome_bases",
 :max_vadr_alerts "max_vadr_alerts",
 :sra_title "sra_title",
 :workspace_name "$SARSCoV2-Illumina-Full",
 :terra_project "$wfl-dev",
 :extra {:demux_deplete.spikein_db "demux_deplete.spikein_db",
         :demux_deplete.samplesheets "demux_deplete.samplesheets",
         :demux_deplete.sample_rename_map "demux_deplete.sample_rename_map",
         :demux_deplete.bwaDbs "demux_deplete.bwaDbs",
         :demux_deplete.blastDbs "demux_deplete.blastDbs",
         :gisaid_meta_prep.username "gisaid_meta_prep.username",
         :gisaid_meta_prep.submitting_lab_addr "gisaid_meta_prep.submitting_lab_addr",
         :gisaid_meta_prep.submitting_lab_name "gisaid_meta_prep.submitting_lab_name",
         :package_genbank_ftp_submission.spuid_namespace "package_genbank_ftp_submission.spuid_namespace",
         :package_genbank_ftp_submission.author_template_sbt "package_genbank_ftp_submission.author_template_sbt",
         :package_genbank_ftp_submission.account_name "package_genbank_ftp_submission.account_name"}}
26 changes: 14 additions & 12 deletions api/test/resources/workflows/sarscov2_illumina_full/inputs.edn
Original file line number Diff line number Diff line change
@@ -1,21 +1,23 @@
{:package_genbank_ftp_submission.account_name "broad_gcid-srv",
{:demux_deplete.spikein_db "gs://broad-gotc-dev-wfl-ptc-test-inputs/sarscov2_illumina_full/inputs/ercc_sdsi_spike-ins-20210123.fasta",
:demux_deplete.sample_rename_map "gs://broad-gotc-dev-wfl-ptc-test-inputs/sarscov2_illumina_full/inputs/sample_rename_map-2021.01.01.txt",
:demux_deplete.bwaDbs ["gs://pathogen-public-dbs/v0/hg19.bwa_idx.tar.zst"]
:demux_deplete.blastDbs ["gs://pathogen-public-dbs/v0/GRCh37.68_ncRNA.fasta.zst",
"gs://pathogen-public-dbs/v0/hybsel_probe_adapters.fasta"],
:demux_deplete.samplesheets ["gs://broad-gotc-dev-wfl-ptc-test-inputs/sarscov2_illumina_full/inputs/SeqPrep_Batch26_27-combined-samplesheet_extra.tsv"
"gs://broad-gotc-dev-wfl-ptc-test-inputs/sarscov2_illumina_full/inputs/SeqPrep_Batch26_27-combined-samplesheet_extra.tsv"],
:gisaid_meta_prep.submitting_lab_name "Infectious Disease Program, Broad Institute of Harvard and MIT",
:gisaid_meta_prep.submitting_lab_addr "75 Ames St, Cambridge, MA 02142, USA",
:gisaid_meta_prep.username "dpark",
:package_genbank_ftp_submission.account_name "broad_gcid-srv",
:package_genbank_ftp_submission.author_template_sbt "gs://broad-gotc-dev-wfl-ptc-test-inputs/sarscov2_illumina_full/inputs/authors-broad_mgh_dph.sbt",
:package_genbank_ftp_submission.spuid_namespace "Broad_GCID",
:flowcell_tgz "gs://broad-gotc-dev-wfl-ptc-test-inputs/sarscov2_illumina_full/inputs/201025_SL-NVQ_0282_BHTHC3DRXX.tar.gz",
:gisaid_meta_prep.submitting_lab_addr "75 Ames St, Cambridge, MA 02142, USA",
:terra_project "wfl-dev",
:package_genbank_ftp_submission.spuid_namespace "Broad_GCID",
:demux_deplete.sample_rename_map "gs://fc-ac04311b-a7a7-4613-b9e9-2c7e1ed3fa72/input_files/sample_rename_map-2021.01.01.txt",
:amplicon_bed_prefix "gs://pathogen-public-dbs/v1/amplicon_primers-",
:gisaid_meta_prep.username "dpark",

:biosample_attributes ["gs://broad-gotc-dev-wfl-ptc-test-inputs/sarscov2_illumina_full/inputs/biosample-20210103-attributes.tsv"],
:demux_deplete.samplesheets ["gs://broad-gotc-dev-wfl-ptc-test-inputs/sarscov2_illumina_full/inputs/SeqPrep_Batch26_27-combined-samplesheet_extra.tsv"
"gs://broad-gotc-dev-wfl-ptc-test-inputs/sarscov2_illumina_full/inputs/SeqPrep_Batch26_27-combined-samplesheet_extra.tsv"],
:demux_deplete.blastDbs ["gs://pathogen-public-dbs/v0/GRCh37.68_ncRNA.fasta.zst"
"gs://pathogen-public-dbs/v0/hybsel_probe_adapters.fasta"],

:instrument_model "Illumina NovaSeq 6000",
:demux_deplete.bwaDbs ["gs://pathogen-public-dbs/v0/hg19.bwa_idx.tar.zst"],
:gisaid_meta_prep.submitting_lab_name "Infectious Disease Program, Broad Institute of Harvard and MIT",
:demux_deplete.spikein_db "gs://broad-gotc-dev-wfl-ptc-test-inputs/sarscov2_illumina_full/inputs/ercc_sdsi_spike-ins-20210123.fasta",
:reference_fasta "gs://broad-gotc-dev-wfl-ptc-test-inputs/sarscov2_illumina_full/inputs/ref-sarscov2-NC_045512.2.fasta",
:sra_title "Metagenomic RNA-Seq of SARS-CoV-2 from patients",
:workspace_name "SARSCoV2-Illumina-Full"}
72 changes: 19 additions & 53 deletions api/test/wfl/integration/datarepo_test.clj
Original file line number Diff line number Diff line change
@@ -1,37 +1,26 @@
(ns wfl.integration.datarepo-test
(:require [clojure.data.json :as json]
[clojure.string :as str]
[clojure.test :refer [deftest is testing]]
[wfl.environment :as env]
[wfl.service.datarepo :as datarepo]
[wfl.service.google.storage :as gcs]
[wfl.tools.datasets :as datasets]
[wfl.tools.fixtures :as fixtures]
[wfl.tools.workflows :as workflows]
[wfl.util :as util])
(:import [java.util UUID]))

;; UUIDs known to the Data Repo.
;;
(def dataset "f359303e-15d7-4cd8-a4c7-c50499c90252")
(def profile "390e7a85-d47f-4531-b612-165fc977d3bd")

(defn ^:private make-dataset-request [dataset-basename]
(-> (str "test/resources/datasets/" dataset-basename)
slurp
json/read-str
;; give it a unique name to avoid collisions with other tests
(update "name" #(str % (-> (UUID/randomUUID) (str/replace "-" ""))))
(update "defaultProfileId" (constantly profile))))

(deftest test-create-dataset
;; To test that your dataset json file is valid, add its path to the list!
(doseq [definition ["assemble-refbased-outputs.json"
"sarscov2-illumina-full-inputs.json"
"sarscov2-illumina-full-outputs.json"]]
(testing (str "creating dataset " (util/basename definition))
(fixtures/with-temporary-dataset (make-dataset-request definition)
#(let [dataset (datarepo/dataset %)]
(is (= % (:id dataset))))))))
(let [tdr-profile (env/getenv "WFL_TDR_DEFAULT_PROFILE")]
(doseq [definition ["assemble-refbased-outputs.json"
"sarscov2-illumina-full-inputs.json"
"sarscov2-illumina-full-outputs.json"]]
(testing (str "creating dataset " (util/basename definition))
(fixtures/with-temporary-dataset
(datasets/unique-dataset-request tdr-profile definition)
#(let [dataset (datarepo/dataset %)]
(is (= % (:id dataset)))))))))

(defn ^:private replace-urls-with-file-ids
[file->fileid type value]
Expand All @@ -42,52 +31,29 @@
(throw (ex-info "Unknown type" {:type type :value value}))))
(workflows/traverse type value)))

(defn ^:private ingest-files [workflow-id dataset-id profile-id bkt-obj-pairs]
(letfn [(target-name [obj] (str/join "/" ["" workflow-id obj]))
(mk-url [bkt obj] (format "gs://%s/%s" bkt obj))
(ingest-batch [batch]
(->> (for [[bkt obj] batch] [(mk-url bkt obj) (target-name obj)])
(datarepo/bulk-ingest dataset-id profile-id)))]
;; TDR limits size of bulk ingest request to 1000 files. Muscles says
;; requests at this limit are "probably fine" and testing with large
;; numbers of files (and size thereof) supports this. If this becomes
;; a serious bottleneck, suggest we benchmark and adjust the `split-at`
;; value input accordingly.
(->> bkt-obj-pairs
(split-at 1000)
(mapv ingest-batch)
(mapcat #(-> % datarepo/poll-job :loadFileResults)))))

(deftest test-ingest-workflow-outputs
(let [dataset-json "assemble-refbased-outputs.json"
pipeline-outputs (workflows/read-resource "assemble_refbased/outputs")
outputs-type (-> "assemble_refbased/description"
workflows/read-resource
:outputs
workflows/make-object-type)
rename-gather identity ;; collect and map outputs onto dataset names
table-name "assemble_refbased_outputs"
workflow-id (UUID/randomUUID)
tdr-sa (env/getenv "WFL_DATA_REPO_SA")]
tdr-profile (env/getenv "WFL_TDR_DEFAULT_PROFILE")
tdr-sa (env/getenv "WFL_TDR_SA")]
(fixtures/with-fixtures
[(fixtures/with-temporary-cloud-storage-folder fixtures/gcs-test-bucket)
(fixtures/with-temporary-dataset (make-dataset-request dataset-json))]
[(fixtures/with-temporary-cloud-storage-folder
"broad-gotc-dev-wfl-ptc-test-inputs")
(fixtures/with-temporary-dataset
(datasets/unique-dataset-request tdr-profile dataset-json))]
(fn [[url dataset-id]]
(let [bkt-obj-pairs (map
gcs/parse-gs-url
(set (workflows/get-files outputs-type pipeline-outputs)))
table-url (str url "table.json")]
(run!
(partial gcs/add-storage-object-viewer tdr-sa)
(into #{} (map first bkt-obj-pairs)))
(-> (->> (ingest-files workflow-id dataset-id profile bkt-obj-pairs)
(map #(mapv % [:sourcePath :fileId]))
(into {}))
(let [table-url (str url "table.json")]
(-> (->> (workflows/get-files outputs-type pipeline-outputs)
(datasets/ingest-files tdr-profile dataset-id workflow-id))
(replace-urls-with-file-ids outputs-type pipeline-outputs)
rename-gather
(json/write-str :escape-slash false)
(gcs/upload-content table-url))
(gcs/add-object-reader tdr-sa table-url)
(let [{:keys [bad_row_count row_count]}
(datarepo/poll-job
(datarepo/ingest-table dataset-id table-url table-name))]
Expand Down
83 changes: 83 additions & 0 deletions api/test/wfl/system/automation_test.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
(ns wfl.system.automation-test
(:require [clojure.data.json :as json]
[clojure.test :refer [deftest is]]
[wfl.environment :as env]
[wfl.service.datarepo :as datarepo]
[wfl.service.google.storage :as storage]
[wfl.tools.datasets :as datasets]
[wfl.tools.fixtures :as fixtures]
[wfl.tools.workflows :as workflows])
(:import (java.util UUID)))

(defn ^:private replace-urls-with-file-ids
  "Walk `value` (a WDL value of `type`), replacing every \"File\" leaf with
   the result of `(file->fileid leaf)`. Primitive leaves pass through
   unchanged; any other WDL type throws."
  [file->fileid type value]
  (letfn [(swap-files [type value]
            (case type
              ("Boolean" "Float" "Int" "Number" "String") value
              "File" (file->fileid value)
              (throw (ex-info "Unknown type" {:type type :value value}))))]
    (workflows/traverse swap-files type value)))

(deftest test-automate-sarscov2-illumina-full
  ;; End-to-end simulation of the sarscov2_illumina_full automation:
  ;; ingest pipeline inputs into a "source" TDR dataset, then (until
  ;; workflow-launcher integration lands) fake the workflow run by
  ;; ingesting canned outputs into a "sink" TDR dataset.
  (let [tdr-profile (env/getenv "WFL_TDR_DEFAULT_PROFILE")]
    (fixtures/with-fixtures
      ;; temp GCS folder for staged table JSON + two throwaway datasets
      [(fixtures/with-temporary-cloud-storage-folder
         "broad-gotc-dev-wfl-ptc-test-inputs")
       (fixtures/with-temporary-dataset
         (datasets/unique-dataset-request
          tdr-profile
          "sarscov2-illumina-full-inputs.json"))
       (fixtures/with-temporary-dataset
         (datasets/unique-dataset-request
          tdr-profile
          "sarscov2-illumina-full-outputs.json"))]
      (fn [[temp source sink]]
        ;; TODO: create + start the workload
        ;; upload a sample
        (let [inputs       (workflows/read-resource "sarscov2_illumina_full/inputs")
              inputs-type  (-> "sarscov2_illumina_full/description"
                               workflows/read-resource
                               :inputs
                               workflows/make-object-type)
              table-name   "sarscov2_illumina_full_inputs"
              unique-prefix (UUID/randomUUID)
              table-url    (str temp "inputs.json")
              ;; This defines how we'll convert the inputs of the pipeline into
              ;; a form that can be ingested as a new row in the dataset.
              ;; I think a user would specify something like this in the initial
              ;; workload request, one mapping for dataset to inputs and one for
              ;; outputs to dataset.
              from-inputs  (workflows/read-resource "sarscov2_illumina_full/dataset-from-inputs")]
          ;; Ingest the input Files, swap urls for file-ids, rename per the
          ;; mapping, then stage the row as JSON and ingest it as a table row.
          (-> (->> (workflows/get-files inputs-type inputs)
                   (datasets/ingest-files tdr-profile source unique-prefix))
              (replace-urls-with-file-ids inputs-type inputs)
              (datasets/rename-gather from-inputs)
              (json/write-str :escape-slash false)
              (storage/upload-content table-url))
          (let [{:keys [bad_row_count row_count]}
                (datarepo/poll-job
                 (datarepo/ingest-table source table-url table-name))]
            (is (= 1 row_count))
            (is (= 0 bad_row_count))))
        ;; At this point, workflow-launcher should run the workflow. The code
        ;; below simulates this effect.
        (let [outputs      (workflows/read-resource "sarscov2_illumina_full/outputs")
              outputs-type (-> "sarscov2_illumina_full/description"
                               workflows/read-resource
                               :outputs
                               workflows/make-object-type)
              table-name   "sarscov2_illumina_full_outputs"
              unique-prefix (UUID/randomUUID)
              table-url    (str temp "outputs.json")]
          ;; Same ingest dance for the outputs, but no rename mapping yet.
          (-> (->> (workflows/get-files outputs-type outputs)
                   (datasets/ingest-files tdr-profile sink unique-prefix))
              (replace-urls-with-file-ids outputs-type outputs)
              (json/write-str :escape-slash false)
              (storage/upload-content table-url))
          (let [{:keys [bad_row_count row_count]}
                (datarepo/poll-job
                 (datarepo/ingest-table sink table-url table-name))]
            (is (= 1 row_count))
            (is (= 0 bad_row_count))))
        ;; TODO: verify the outputs have been written to TDR
        ))))
51 changes: 51 additions & 0 deletions api/test/wfl/tools/datasets.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
(ns wfl.tools.datasets
(:require [clojure.string :as str]
[clojure.data.json :as json]
[wfl.service.datarepo :as datarepo]
[wfl.service.google.storage :as storage])
(:import (java.util UUID)))

(defn unique-dataset-request
  "Create a dataset request for a uniquely-named dataset defined by the json
   file `dataset-basename` and `tdr-profile`."
  [tdr-profile dataset-basename]
  (let [suffix  (str/replace (str (UUID/randomUUID)) "-" "")
        request (-> (str "test/resources/datasets/" dataset-basename)
                    slurp
                    json/read-str)]
    ;; suffix the name to avoid collisions with other tests
    (-> request
        (update "name" str suffix)
        (assoc "defaultProfileId" tdr-profile))))
ehigham marked this conversation as resolved.
Show resolved Hide resolved

;; TDR limits the size of a bulk ingest request to 1000 files. Muscles says
;; requests at this limit are "probably fine" and testing with large
;; numbers of files (and size thereof) supports this. If this becomes
;; a serious bottleneck, suggest we benchmark and adjust the batch size
;; below accordingly.
(defn ingest-files
  "Ingest `files` into a TDR dataset with `dataset-id` under `prefix` and
   return a map from url to ingested file-id."
  [tdr-profile dataset-id prefix files]
  (letfn [(target-name [url]
            ;; drop the bucket and nest the object path under `prefix`
            (let [[_ obj] (storage/parse-gs-url url)]
              (str/join "/" ["" prefix obj])))
          (ingest-batch [batch]
            (->> (for [url batch] [url (target-name url)])
                 (datarepo/bulk-ingest dataset-id tdr-profile)))]
    ;; NOTE: `split-at` only splits once, so with >2000 files the second
    ;; batch would exceed TDR's 1000-file limit; `partition-all` chunks
    ;; EVERY batch to at most 1000 files.
    (->> (partition-all 1000 files)
         (mapv ingest-batch)
         (mapcat #(-> % datarepo/poll-job :loadFileResults))
         (map (juxt :sourcePath :fileId))
         (into {}))))
tbl3rd marked this conversation as resolved.
Show resolved Hide resolved

(defn rename-gather
  "Transform the `values` using the transformation defined in `mapping`.
   In `mapping`, a plain string names a key to look up in `values`, a
   leading `$` marks the rest of the string as a literal, a map is gathered
   recursively and serialized as JSON, and a collection transforms each
   element."
  [values mapping]
  (letfn [;; guard with string? -- `str/starts-with?` throws on maps/vectors,
          ;; which previously made the map?/coll? branches unreachable.
          (literal? [x] (and (string? x) (str/starts-with? x "$")))
          (go! [v]
            (cond (literal? v) (subs v 1)
                  (string? v)  (get values (keyword v))
                  (map? v)     (json/write-str (rename-gather values v)
                                               :escape-slash false)
                  (coll? v)    (keep go! v)
                  :else        (throw (ex-info "Unknown operation"
                                               {:operation v}))))]
    (into {} (for [[k v] mapping] [k (go! v)]))))
tbl3rd marked this conversation as resolved.
Show resolved Hide resolved
4 changes: 2 additions & 2 deletions api/test/wfl/tools/workflows.clj
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,6 @@
type object)))

(defn get-files
  "Return the unique set of objects in `value` of WDL type `File`."
  ;; The docstring must precede the params vector; placed after it (as
  ;; before), Clojure treats the string as a discarded body expression and
  ;; `(doc get-files)` shows nothing.
  [type value]
  ;; `traverse` maps each File leaf to a singleton vector, all else to [].
  (letfn [(f [type object] (if (= "File" type) [object] []))]
    (set (flatten (vals (traverse f type value))))))
ehigham marked this conversation as resolved.
Show resolved Hide resolved
Loading