From 3cfa0fafb26f456396bc1b8ab09224cf5afba175 Mon Sep 17 00:00:00 2001 From: Dave Shiga Date: Fri, 9 Mar 2018 14:50:03 -0500 Subject: [PATCH] Make it possible to handle different metadata versions using the same version of pipeline-tools code. Switch to pytest and add a metadata schema integration test. Add some support for v5 metadata. (596) --- .travis.yml | 5 +- README.md | 23 ++++ adapter_pipelines/Optimus/adapter.wdl | 26 ++-- .../ss2_single_sample/adapter.wdl | 20 +-- pipeline_tools/README.rst | 6 +- pipeline_tools/dcp_utils.py | 15 ++- pipeline_tools/input_utils.py | 87 ++++++++++++ .../tests/data/metadata/v4/optimus_assay.json | 72 ++++++++++ .../data/metadata/v4/optimus_manifest.json | 126 ++++++++++++++++++ .../tests/data/metadata/v4/ss2_assay.json | 49 +++++++ .../tests/data/metadata/v4/ss2_manifest.json | 74 ++++++++++ .../tests/data/metadata/v5/ss2_files.json | 55 ++++++++ .../data/metadata/v5/ss2_manifest_stub.json | 22 +++ .../get_latest_schema_example_version.py | 48 +++++++ pipeline_tools/tests/test_dcp_utils.py | 53 ++++---- pipeline_tools/tests/test_input_utils.py | 121 +++++++++++++++++ test-requirements.txt | 1 + test.sh | 40 ++++++ 18 files changed, 788 insertions(+), 55 deletions(-) create mode 100644 pipeline_tools/input_utils.py create mode 100644 pipeline_tools/tests/data/metadata/v4/optimus_assay.json create mode 100644 pipeline_tools/tests/data/metadata/v4/optimus_manifest.json create mode 100644 pipeline_tools/tests/data/metadata/v4/ss2_assay.json create mode 100644 pipeline_tools/tests/data/metadata/v4/ss2_manifest.json create mode 100644 pipeline_tools/tests/data/metadata/v5/ss2_files.json create mode 100644 pipeline_tools/tests/data/metadata/v5/ss2_manifest_stub.json create mode 100644 pipeline_tools/tests/get_latest_schema_example_version.py create mode 100644 pipeline_tools/tests/test_input_utils.py create mode 100644 test.sh diff --git a/.travis.yml b/.travis.yml index b85f2344..43d475f2 100644 --- a/.travis.yml +++ b/.travis.yml 
@@ -3,7 +3,10 @@ python: - '2.7' - '3.6' install: pip install -r requirements.txt -r test-requirements.txt -script: python -m unittest discover -v +env: +- TEST_SUITE=unit +- TEST_SUITE=latest_schema +script: bash test.sh $TEST_SUITE notifications: slack: on_success: change diff --git a/README.md b/README.md index 67545c0f..e73807bd 100644 --- a/README.md +++ b/README.md @@ -3,3 +3,26 @@ This repo contains Python code and pipelines for interacting with the Human Cell Atlas Data Coordination Platform. They are used by the Secondary Analysis Service. The pipelines wrap analysis pipelines from the Skylab repo and provide some glue to interface with the DCP. The adapter pipelines take bundle ids as inputs, query the Data Storage Service to find the input files needed by the analysis pipelines, then run the analysis pipelines and submit the results to the Ingest Service. This helps us keep the analysis pipelines themselves free of dependencies on the DCP. + +## Run tests + +### Create a virtual environment + +``` +virtualenv pipeline-tools-test-env +source pipeline-tools-test-env/bin/activate +pip install -r test-requirements.txt +``` + +### Run unit tests + +``` +bash test.sh +``` + +### Run schema tests + +``` +export TEST_SUITE=latest_schema +bash test.sh +``` diff --git a/adapter_pipelines/Optimus/adapter.wdl b/adapter_pipelines/Optimus/adapter.wdl index 23899c6c..7c44dae5 100644 --- a/adapter_pipelines/Optimus/adapter.wdl +++ b/adapter_pipelines/Optimus/adapter.wdl @@ -12,6 +12,7 @@ task GetInputs { command <<< python <>> runtime { - docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.1.9" + docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.1.11" } output { String sample_id = read_string("inputs.tsv") @@ -88,7 +88,7 @@ task inputs_for_submit { >>> runtime { - docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.1.9" + docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.1.11" } output { @@ -127,7 
+127,7 @@ task outputs_for_submit { >>> runtime { - docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.1.9" + docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.1.11" } output { diff --git a/adapter_pipelines/ss2_single_sample/adapter.wdl b/adapter_pipelines/ss2_single_sample/adapter.wdl index 3842a67a..98261569 100644 --- a/adapter_pipelines/ss2_single_sample/adapter.wdl +++ b/adapter_pipelines/ss2_single_sample/adapter.wdl @@ -12,6 +12,7 @@ task GetInputs { command <<< python <>> runtime { - docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.1.9" + docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:v0.1.11" } output { Object inputs = read_object("inputs.tsv") diff --git a/pipeline_tools/README.rst b/pipeline_tools/README.rst index 07354171..2cd8d11b 100644 --- a/pipeline_tools/README.rst +++ b/pipeline_tools/README.rst @@ -41,8 +41,12 @@ To run unit tests, first create a virtual environment with the requirements:: Then, run unit tests from the root of the pipeline-tools repo like this:: - python -m unittest discover -v + bash test.sh +To run schema integration tests, do:: + + export TEST_SUITE="latest_schema" + bash test.sh create_analysis_json.py ======================= diff --git a/pipeline_tools/dcp_utils.py b/pipeline_tools/dcp_utils.py index bf05785c..fd1c4f17 100644 --- a/pipeline_tools/dcp_utils.py +++ b/pipeline_tools/dcp_utils.py @@ -19,7 +19,7 @@ def get_file_by_uuid(file_id, dss_url): return response.json() -def get_manifest_files(bundle_uuid, bundle_version, dss_url, timeout_seconds, retry_seconds): +def get_manifest(bundle_uuid, bundle_version, dss_url, timeout_seconds, retry_seconds): """ Retrieve manifest.json file for a given bundle uuid and version. 
:param str bundle_uuid: Bundle unique id @@ -48,19 +48,30 @@ def get_manifest_files(bundle_uuid, bundle_version, dss_url, timeout_seconds, re current = time.time() manifest = response.json() + return manifest + + +def get_manifest_file_dicts(manifest): bundle = manifest['bundle'] name_to_meta = {} url_to_name = {} for f in bundle['files']: name_to_meta[f['name']] = f url_to_name[f['url']] = f['name'] - return { 'name_to_meta': name_to_meta, 'url_to_name': url_to_name } +def get_file_uuid(manifest_file_dicts, file_name): + return manifest_file_dicts['name_to_meta'][file_name]['uuid'] + + +def get_file_url(manifest_file_dicts, file_name): + return manifest_file_dicts['name_to_meta'][file_name]['url'] + + def get_auth_token(url="https://danielvaughan.eu.auth0.com/oauth/token", client_id="Zdsog4nDAnhQ99yiKwMQWAPc2qUDlR99", client_secret="t-OAE-GQk_nZZtWn-QQezJxDsLXmU7VSzlAh9cKW5vb87i90qlXGTvVNAjfT9weF", diff --git a/pipeline_tools/input_utils.py b/pipeline_tools/input_utils.py new file mode 100644 index 00000000..5adec6c9 --- /dev/null +++ b/pipeline_tools/input_utils.py @@ -0,0 +1,87 @@ +from pipeline_tools import dcp_utils + + +def get_sample_id(metadata, version='4'): + """Return the sample id from the given metadata""" + if version == '4': + return _get_sample_id_v4(metadata) + else: + raise NotImplementedError('Only implemented for v4 metadata') + + +def _get_sample_id_v4(assay_json): + """Return sample id from assay json""" + return assay_json["has_input"] + + +def get_input_metadata_file_uuid(manifest_files, version='4'): + """Get the uuid of the file containing metadata about pipeline input files, + e.g. 
assay.json in v4""" + if version == '5': + return _get_input_metadata_file_uuid_v5(manifest_files) + elif version == '4': + return _get_input_metadata_file_uuid_v4(manifest_files) + else: + raise NotImplementedError('Only implemented for v4 and v5 metadata') + + +def _get_input_metadata_file_uuid_v5(manifest_files): + """Get the uuid of the files.json file""" + return dcp_utils.get_file_uuid(manifest_files, 'files.json') + + +def _get_input_metadata_file_uuid_v4(manifest_files): + """Get the uuid of the assay.json file""" + return dcp_utils.get_file_uuid(manifest_files, 'assay.json') + + +def get_smart_seq_2_fastq_names(metadata, version='4'): + """Get the fastq file names from the given metadata""" + if version == '5': + return _get_smart_seq_2_fastq_names_v5(metadata) + elif version == '4': + return _get_smart_seq_2_fastq_names_v4(metadata) + else: + raise NotImplementedError('Only implemented for v4 and v5 metadata') + + +def _get_smart_seq_2_fastq_names_v5(files_json): + """Return fastq file names from files json""" + index_to_name = {} + for f in files_json['files']: + index = f['content']['read_index'] + file_name = f['content']['file_core']['file_name'] + index_to_name[index] = file_name + return index_to_name['read1'], index_to_name['read2'] + + +def _get_smart_seq_2_fastq_names_v4(assay_json): + """Return fastq file names from assay json""" + fastq_1_name = assay_json["content"]["seq"]["lanes"][0]["r1"] + fastq_2_name = assay_json["content"]["seq"]["lanes"][0]["r2"] + return fastq_1_name, fastq_2_name + + +def get_optimus_lanes(metadata_json, version='4'): + """Get the lane metadata""" + if version == '4': + return _get_optimus_lanes_v4(metadata_json) + else: + raise NotImplementedError('Only implemented for v4 metadata') + + +def _get_optimus_lanes_v4(assay_json): + """Return the lane metadata from the assay json""" + lanes = assay_json['content']['seq']['lanes'] + return lanes + + +def get_optimus_inputs(lanes, manifest_files): + """Return three lists of 
urls, representing fastqs for r1, r2, and i1, respectively. + In each list, the first item is for the first lane, the second item is for the second lane, etc. + """ + r1 = [manifest_files['name_to_meta'][lane['r1']]['url'] for lane in lanes] + r2 = [manifest_files['name_to_meta'][lane['r2']]['url'] for lane in lanes] + i1 = [manifest_files['name_to_meta'][lane['i1']]['url'] for lane in lanes] + + return r1, r2, i1 diff --git a/pipeline_tools/tests/data/metadata/v4/optimus_assay.json b/pipeline_tools/tests/data/metadata/v4/optimus_assay.json new file mode 100644 index 00000000..fcf996e1 --- /dev/null +++ b/pipeline_tools/tests/data/metadata/v4/optimus_assay.json @@ -0,0 +1,72 @@ +{ + "content": { + "single_cell": { + "cell_handling": "10x_v2", + "cell_barcode": { + "read": "Read 1", + "size": 16, + "white_list_file": "pbmc8k_S1_L007_R1_001.fastq.gz,pbmc8k_S1_L008_R1_001.fastq.gz", + "offset": 0 + } + }, + "core": { + "type": "assay", + "schema_url": "https://raw.githubusercontent.com/HumanCellAtlas/metadata-schema/4.6.1/json_schema/assay.json", + "schema_version": "4.6.1" + }, + "rna": { + "end_bias": "three_prime_end", + "primer": "poly-dT", + "strand": "both", + "library_construction": "10x_v2" + }, + "assay_id": "c349cce6-6d63-4976-832e-3c27ca1399ac", + "seq": { + "paired_ends": true, + "lanes": [ + { + "i1": "pbmc8k_S1_L007_I1_001.fastq.gz", + "number": 7, + "r2": "pbmc8k_S1_L007_R2_001.fastq.gz", + "r1": "pbmc8k_S1_L007_R1_001.fastq.gz" + }, + { + "i1": "pbmc8k_S1_L008_I1_001.fastq.gz", + "number": 8, + "r2": "pbmc8k_S1_L008_R2_001.fastq.gz", + "r1": "pbmc8k_S1_L008_R1_001.fastq.gz" + } + ], + "instrument_platform": "Illumina", + "molecule": "polyA RNA", + "instrument_model": "HiSeq 4000", + "umi_barcode": { + "read": "Read 1", + "offset": 16, + "size": 10 + } + } + }, + "core": { + "type": "assay_bundle", + "schema_url": "https://raw.githubusercontent.com/HumanCellAtlas/metadata-schema/4.6.1/json_schema/assay_bundle.json", + "schema_version": "4.6.1" + }, + 
"has_output": [ + "c34f9bda-1621-4596-b93f-797552368282", + "ed7d5ab4-8589-4e50-bb6c-5d4b459b183c", + "9a4a1656-faab-448e-9717-3fb16843a314", + "b7e2cfc0-8d3f-40b4-adf2-3c44112259dc", + "072461ba-e1da-40e2-aa5d-626eedad7fef", + "58ea2f4b-c4af-4b1b-8b6a-484d46d37de5" + ], + "hca_ingest": { + "accession": "", + "submissionDate": "2018-01-16T16:23:53.023Z", + "lastModifiedUser": "anonymousUser", + "updateDate": "2018-01-16T16:24:04.590Z", + "document_id": "01425de2-dcd2-479c-899a-b84763767e74", + "user": "anonymousUser" + }, + "has_input": "42a6269e-8bc7-47ac-806b-3a53f8ba2a6f" +} diff --git a/pipeline_tools/tests/data/metadata/v4/optimus_manifest.json b/pipeline_tools/tests/data/metadata/v4/optimus_manifest.json new file mode 100644 index 00000000..94dec119 --- /dev/null +++ b/pipeline_tools/tests/data/metadata/v4/optimus_manifest.json @@ -0,0 +1,126 @@ +{ + "bundle": { + "creator_uid": 8008, + "files": [ + { + "content-type": "application/json; dcp-type=\"metadata/project\"", + "crc32c": "c4094a0d", + "indexed": true, + "name": "project.json", + "s3_etag": "cea24b9a97cf6bb012db6eb0c9be8ff9", + "sha1": "e0f2604cb5afffabe3102b4c43a8a6b106fac4de", + "sha256": "76af9a2e20cd09cc35f519bf0f4ef943bfaeff03f9da4e5464b4e5bdddac64c1", + "size": 1803, + "url": "gs://org-humancellatlas-dss-staging/blobs/76af9a2e20cd09cc35f519bf0f4ef943bfaeff03f9da4e5464b4e5bdddac64c1.e0f2604cb5afffabe3102b4c43a8a6b106fac4de.cea24b9a97cf6bb012db6eb0c9be8ff9.c4094a0d", + "uuid": "a5687913-f4a8-475d-8233-3dde25c1b973", + "version": "2018-01-16T163033.305694Z" + }, + { + "content-type": "application/json; dcp-type=\"metadata/sample\"", + "crc32c": "06370f5f", + "indexed": true, + "name": "sample.json", + "s3_etag": "e29ca3705beeb5099435f0c77f0629d7", + "sha1": "c9e694d347574497c4c43ddf73543c1bd75bca06", + "sha256": "83d400d66c7f95168c0d611a2ecf05359c287f35ea5eb2443b222e458af3baaf", + "size": 4206, + "url": 
"gs://org-humancellatlas-dss-staging/blobs/83d400d66c7f95168c0d611a2ecf05359c287f35ea5eb2443b222e458af3baaf.c9e694d347574497c4c43ddf73543c1bd75bca06.e29ca3705beeb5099435f0c77f0629d7.06370f5f", + "uuid": "d6e3fa37-78d6-41a2-ab39-4ddfc46eb936", + "version": "2018-01-16T163035.085120Z" + }, + { + "content-type": "application/gzip; dcp-type=data", + "crc32c": "c0df5d83", + "indexed": false, + "name": "pbmc8k_S1_L007_R1_001.fastq.gz", + "s3_etag": "b9e913c2541c78e1d3d97ba2e454dbd4", + "sha1": "142b148727007dd08e7c9673c1aecd4f9d47e332", + "sha256": "48dbfa31eb65b0892d21961fbd3bea4307a87490ba497807d0d0694a4178188a", + "size": 215640, + "url": "gs://foo/L7_R1.fastq.gz", + "uuid": "9a4a1656-faab-448e-9717-3fb16843a314", + "version": "2018-01-16T163036.863838Z" + }, + { + "content-type": "application/gzip; dcp-type=data", + "crc32c": "cd965118", + "indexed": false, + "name": "pbmc8k_S1_L007_R2_001.fastq.gz", + "s3_etag": "a276a07b3a8f92b44e2d2fc697397029", + "sha1": "fbb43f8954afc122b49822048fb1b190b5f6ab51", + "sha256": "9e8437502e642ac5e285a5bb767143fd5a32fdcb2c7776d2ef1b3dd684e20d9a", + "size": 758632, + "url": "gs://foo/L7_R2.fastq.gz", + "uuid": "072461ba-e1da-40e2-aa5d-626eedad7fef", + "version": "2018-01-16T163038.065174Z" + }, + { + "content-type": "application/gzip; dcp-type=data", + "crc32c": "2145fec1", + "indexed": false, + "name": "pbmc8k_S1_L007_I1_001.fastq.gz", + "s3_etag": "ccc460a58ce1ed4070d678e84b46a370", + "sha1": "7f5a26af32e36db2960767304cb79b901306597a", + "sha256": "231c9108a264ab8eb9ca42c40d8696234d5bd2fec72a79da5a96fe964cb21768", + "size": 62760, + "url": "gs://foo/L7_I1.fastq.gz", + "uuid": "58ea2f4b-c4af-4b1b-8b6a-484d46d37de5", + "version": "2018-01-16T163039.186523Z" + }, + { + "content-type": "application/gzip; dcp-type=data", + "crc32c": "e5d6dc51", + "indexed": false, + "name": "pbmc8k_S1_L008_R1_001.fastq.gz", + "s3_etag": "b720df1c197045e50b6ef1c6e92b043f", + "sha1": "c1ac97f4b1f28843f8f35658e8ce3925fa38be5d", + "sha256": 
"e77c8208c02607d6907f79122d2aa01354b41ddfac0185d38421c17e4eeecff4", + "size": 212357, + "url": "gs://foo/L8_R1.fastq.gz", + "uuid": "c34f9bda-1621-4596-b93f-797552368282", + "version": "2018-01-16T163040.206564Z" + }, + { + "content-type": "application/gzip; dcp-type=data", + "crc32c": "cc0e2da1", + "indexed": false, + "name": "pbmc8k_S1_L008_R2_001.fastq.gz", + "s3_etag": "49f82219d250741d2c13587ed0c3f1ca", + "sha1": "1a8c74f0d9ca021e9f85369e871b409b4ee03e9d", + "sha256": "086b40b559454dab11b059a7ea4208371b277fb1638d92ebf61ed98162ee9a41", + "size": 733086, + "url": "gs://foo/L8_R2.fastq.gz", + "uuid": "ed7d5ab4-8589-4e50-bb6c-5d4b459b183c", + "version": "2018-01-16T163041.246123Z" + }, + { + "content-type": "application/gzip; dcp-type=data", + "crc32c": "6e23a64d", + "indexed": false, + "name": "pbmc8k_S1_L008_I1_001.fastq.gz", + "s3_etag": "1ae154756b80f759ee9db368877053d6", + "sha1": "83585aab4be7181820e276e70aa66e6bddb4d39c", + "sha256": "a36fe9ce4956eaeddd00040d0d6f4e92c6f2113c06d0761ccf8b8cc1e828fc36", + "size": 62579, + "url": "gs://foo/L8_I1.fastq.gz", + "uuid": "b7e2cfc0-8d3f-40b4-adf2-3c44112259dc", + "version": "2018-01-16T163042.240894Z" + }, + { + "content-type": "application/json; dcp-type=\"metadata/assay\"", + "crc32c": "a98a29ad", + "indexed": true, + "name": "assay.json", + "s3_etag": "e54d19b92605efeb5e12c7aac32d9293", + "sha1": "2eb73693c07e1b28f3c13aed522a2401c155b841", + "sha256": "b62209866e45e46d2d4e347abe5973aed1d10af92e727c57e6822ae7fcf8ea5a", + "size": 2575, + "url": "gs://org-humancellatlas-dss-staging/blobs/b62209866e45e46d2d4e347abe5973aed1d10af92e727c57e6822ae7fcf8ea5a.2eb73693c07e1b28f3c13aed522a2401c155b841.e54d19b92605efeb5e12c7aac32d9293.a98a29ad", + "uuid": "89634b69-ea40-43ed-9361-546bbdadef7c", + "version": "2018-01-16T163043.443898Z" + } + ], + "uuid": "9e9956b0-f731-4bc1-a000-2b73b5e735b3", + "version": "2018-01-16T163044.805931Z" + } +} diff --git a/pipeline_tools/tests/data/metadata/v4/ss2_assay.json 
b/pipeline_tools/tests/data/metadata/v4/ss2_assay.json new file mode 100644 index 00000000..103936f1 --- /dev/null +++ b/pipeline_tools/tests/data/metadata/v4/ss2_assay.json @@ -0,0 +1,49 @@ +{ + "content": { + "single_cell": { + "cell_handling": "FACS" + }, + "core": { + "type": "assay", + "schema_url": "https://raw.githubusercontent.com/HumanCellAtlas/metadata-schema/4.6.1/json_schema/assay.json", + "schema_version": "4.6.1" + }, + "rna": { + "end_bias": "five_prime_end", + "strand": "both", + "library_construction": "smart-seq2" + }, + "assay_id": "assay_1", + "seq": { + "instrument_platform": "Illumina", + "molecule": "polyA RNA", + "paired_ends": true, + "lanes": [ + { + "number": 1, + "r2": "R2.fastq.gz", + "r1": "R1.fastq.gz" + } + ], + "instrument_model": "HiSeq 2500" + } + }, + "core": { + "type": "assay_bundle", + "schema_url": "https://raw.githubusercontent.com/HumanCellAtlas/metadata-schema/4.6.1/json_schema/assay_bundle.json", + "schema_version": "4.6.1" + }, + "has_output": [ + "851d312b-9d16-49d4-a0de-4d7def69d126", + "b641c6b5-507b-4585-a6d5-856b276f284c" + ], + "hca_ingest": { + "accession": "", + "submissionDate": "2018-01-08T22:16:08.832Z", + "lastModifiedUser": "anonymousUser", + "updateDate": "2018-01-08T22:16:20.921Z", + "document_id": "dc535737-6576-409e-9906-eafadbbfc56a", + "user": "anonymousUser" + }, + "has_input": "b0c57b9c-860b-4bbf-84aa-5f845508101d" +} diff --git a/pipeline_tools/tests/data/metadata/v4/ss2_manifest.json b/pipeline_tools/tests/data/metadata/v4/ss2_manifest.json new file mode 100644 index 00000000..5619b8e5 --- /dev/null +++ b/pipeline_tools/tests/data/metadata/v4/ss2_manifest.json @@ -0,0 +1,74 @@ +{ + "bundle": { + "creator_uid": 8008, + "files": [ + { + "content-type": "application/json; dcp-type=\"metadata/project\"", + "crc32c": "9fb3c416", + "indexed": true, + "name": "project.json", + "s3_etag": "50285c9f791bf9f076b7d85f44bbe1c6", + "sha1": "43ae0c1cd697acf3ff8a86eced2c9adc3ca9c7b4", + "sha256": 
"be4cc7bc352eb3cef3e939c0e77bdc20a4a2a26bd4509cc30f85d1efd5c12ae5", + "size": 4010, + "url": "gs://org-humancellatlas-dss-staging/blobs/be4cc7bc352eb3cef3e939c0e77bdc20a4a2a26bd4509cc30f85d1efd5c12ae5.43ae0c1cd697acf3ff8a86eced2c9adc3ca9c7b4.50285c9f791bf9f076b7d85f44bbe1c6.9fb3c416", + "uuid": "ba4fd3fe-dc58-4195-a5b8-fe1433214152", + "version": "2018-01-08T221730.119294Z" + }, + { + "content-type": "application/json; dcp-type=\"metadata/sample\"", + "crc32c": "d58f2fd2", + "indexed": true, + "name": "sample.json", + "s3_etag": "56910e47ccbba082f3db1d713869aded", + "sha1": "5fc2c929c3393c9d64ec57a14fdf57109b7def14", + "sha256": "8f3b76ee4c5e20ec81ad3db2b9f4cba3e17c9eb462921c8b3e1bfa3b556b250b", + "size": 3187, + "url": "gs://org-humancellatlas-dss-staging/blobs/8f3b76ee4c5e20ec81ad3db2b9f4cba3e17c9eb462921c8b3e1bfa3b556b250b.5fc2c929c3393c9d64ec57a14fdf57109b7def14.56910e47ccbba082f3db1d713869aded.d58f2fd2", + "uuid": "359ad90d-5d80-44d9-86b0-120db115bc69", + "version": "2018-01-08T221731.879032Z" + }, + { + "content-type": "application/gzip; dcp-type=data", + "crc32c": "4ef74578", + "indexed": false, + "name": "R1.fastq.gz", + "s3_etag": "c7bbee4c46bbf29432862e05830c8f39", + "sha1": "17f8b4be0cc6e8281a402bb365b1283b458906a3", + "sha256": "fe6d4fdfea2ff1df97500dcfe7085ac3abfb760026bff75a34c20fb97a4b2b29", + "size": 125191, + "url": "gs://org-humancellatlas-dss-staging/blobs/bar.baz", + "uuid": "851d312b-9d16-49d4-a0de-4d7def69d126", + "version": "2018-01-08T221733.036535Z" + }, + { + "content-type": "application/gzip; dcp-type=data", + "crc32c": "69987b3e", + "indexed": false, + "name": "R2.fastq.gz", + "s3_etag": "a3a9f23d07cfc5e40a4c3a8adf3903ae", + "sha1": "f166b6952e30a41e1409e7fb0cb0fb1ad93f3f21", + "sha256": "c305bee37b3c3735585e11306272b6ab085f04cd22ea8703957b4503488cfeba", + "size": 130024, + "url": "gs://org-humancellatlas-dss-staging/blobs/foo.bar", + "uuid": "b641c6b5-507b-4585-a6d5-856b276f284c", + "version": "2018-01-08T221733.922146Z" + }, + { + 
"content-type": "application/json; dcp-type=\"metadata/assay\"", + "crc32c": "844dfa3e", + "indexed": true, + "name": "assay.json", + "s3_etag": "9db78e31d6b25589720ca378a20571cc", + "sha1": "24729fcb63a5522999f395c82150fc9d4c22b93d", + "sha256": "93f8a3d9345fba71bf87c863949d6eff207f2289a671445f3eb3243cacc7487b", + "size": 1588, + "url": "gs://org-humancellatlas-dss-staging/blobs/93f8a3d9345fba71bf87c863949d6eff207f2289a671445f3eb3243cacc7487b.24729fcb63a5522999f395c82150fc9d4c22b93d.9db78e31d6b25589720ca378a20571cc.844dfa3e", + "uuid": "e56638c7-f026-42d0-9be8-24b71a7d6e86", + "version": "2018-01-08T221734.839225Z" + } + ], + "uuid": "d0686b4d-a4b7-45e2-89b7-ac0ebdb4a6b7", + "version": "2018-01-08T221736.018244Z" + } +} diff --git a/pipeline_tools/tests/data/metadata/v5/ss2_files.json b/pipeline_tools/tests/data/metadata/v5/ss2_files.json new file mode 100644 index 00000000..36e5040a --- /dev/null +++ b/pipeline_tools/tests/data/metadata/v5/ss2_files.json @@ -0,0 +1,55 @@ +{ + "describedBy": "https://schema.humancellatlas.org/bundle/1.0.0/file", + "schema_version": "1.0.0", + "schema_type": "file_bundle", + "files": [ + { + "content": { + "describedBy": "https://schema.humancellatlas.org/type/file/5.0.0/sequence_file", + "schema_version": "5.0.0", + "schema_type": "file", + "file_core": { + "file_name": "R1.fastq.gz", + "file_format": "fastq.gz", + "describedBy": "https://schema.humancellatlas.org/core/file/5.0.0/file_core", + "schema_version": "5.0.0" + }, + "read_index": "read1", + "lane_index": 1, + "read_length": 187 + }, + "hca_ingest": { + "describedBy": "https://schema.humancellatlas.org/bundle/5.0.0/ingest_audit", + "document_id": "db08d9a7-1fcb-4650-8892-6eee95877c42", + "submissionDate": "2018-02-07T10:21:43.228Z", + "updateDate": "2018-02-07T10:21:54.398Z", + "submitter_id": "anonymousUser", + "updater_id": "anonymousUser" + } + }, + { + "content": { + "describedBy": "https://schema.humancellatlas.org/type/file/5.0.0/sequence_file", + "schema_version": 
"5.0.0", + "schema_type": "file", + "file_core": { + "file_name": "R2.fastq.gz", + "file_format": "fastq.gz", + "describedBy": "https://schema.humancellatlas.org/core/file/5.0.0/file_core", + "schema_version": "5.0.0" + }, + "read_index": "read2", + "lane_index": 1, + "read_length": 225 + }, + "hca_ingest": { + "describedBy": "https://schema.humancellatlas.org/bundle/5.0.0/ingest_audit", + "document_id": "5266a514-5614-420e-a882-a9dff3df9d84", + "submissionDate": "2018-02-07T10:21:43.228Z", + "updateDate": "2018-02-07T10:21:54.398Z", + "submitter_id": "anonymousUser", + "updater_id": "anonymousUser" + } + } + ] +} \ No newline at end of file diff --git a/pipeline_tools/tests/data/metadata/v5/ss2_manifest_stub.json b/pipeline_tools/tests/data/metadata/v5/ss2_manifest_stub.json new file mode 100644 index 00000000..6194e75e --- /dev/null +++ b/pipeline_tools/tests/data/metadata/v5/ss2_manifest_stub.json @@ -0,0 +1,22 @@ +{ + "bundle": { + "creator_uid": 8008, + "files": [ + { + "content-type": "application/gzip; dcp-type=data", + "crc32c": "4ef74578", + "indexed": false, + "name": "files.json", + "s3_etag": "c7bbee4c46bbf29432862e05830c8f39", + "sha1": "17f8b4be0cc6e8281a402bb365b1283b458906a3", + "sha256": "fe6d4fdfea2ff1df97500dcfe7085ac3abfb760026bff75a34c20fb97a4b2b29", + "size": 125191, + "url": "gs://org-humancellatlas-dss-staging/blobs/bar.baz", + "uuid": "851d312b-9d16-49d4-a0de-4d7def69d126", + "version": "2018-01-08T221733.036535Z" + } + ], + "uuid": "d0686b4d-a4b7-45e2-89b7-ac0ebdb4a6b7", + "version": "2018-01-08T221736.018244Z" + } +} diff --git a/pipeline_tools/tests/get_latest_schema_example_version.py b/pipeline_tools/tests/get_latest_schema_example_version.py new file mode 100644 index 00000000..b16747d2 --- /dev/null +++ b/pipeline_tools/tests/get_latest_schema_example_version.py @@ -0,0 +1,48 @@ +import argparse +import os + +class Version: + def __init__(self, dirname): + self.dirname = dirname + # Remove 'v' prefix + d = dirname[1:] + parts = 
d.strip().split('.') + numeric_parts = list(map(lambda x: int(x), parts)) + self.v = numeric_parts + while len(self.v) < 3: + self.v.append(0) + self.string = '{0}.{1}.{2}'.format(self.v[0], self.v[1], self.v[2]) + + def get_dirname(self): + return self.dirname + + def __str__(self): + return self.string + + def __eq__(self, other): + return self.string == other.string + + def __gt__(self, other): + for s, o in zip(self.v, other.v): + if s < o: + return False + if s > o: + return True + return False + + +def run(directory): + dirs = os.listdir(directory) + versions = [] + for d in dirs: + v = Version(d) + versions.append(v) + versions.sort(reverse=True) + print(versions[0].get_dirname()) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-d', help='Directory containing metadata-schema example version directories', required=True) + args = parser.parse_args() + run(args.d) diff --git a/pipeline_tools/tests/test_dcp_utils.py b/pipeline_tools/tests/test_dcp_utils.py index e9be2a37..e77d500d 100644 --- a/pipeline_tools/tests/test_dcp_utils.py +++ b/pipeline_tools/tests/test_dcp_utils.py @@ -1,11 +1,16 @@ import unittest import requests_mock +import os +import json from pipeline_tools import dcp_utils class TestDCPUtils(unittest.TestCase): @classmethod def setUpClass(cls): + with open(cls.data_file('metadata/v4/ss2_manifest.json')) as f: + cls.ss2_manifest_json_v4 = json.load(f) + cls.ss2_manifest_files_v4 = dcp_utils.get_manifest_file_dicts(cls.ss2_manifest_json_v4) cls.DSS_URL = "https://dss.mock.org/v0" cls.FILE_ID = "test_id" cls.BUNDLE_UUID = "test_uuid" @@ -34,39 +39,26 @@ def _request_callback(request, context): self.assertEquals(json_response['file'], expect_file['file']) - @requests_mock.mock() - def test_get_manifest_files(self, mock_request): - expect_manifest = { - 'name_to_meta': { - 'test_name1': {'name': 'test_name1', 'url': 'test_url1'}, - 'test_name2': {'name': 'test_name2', 'url': 'test_url2'} - }, - 
'url_to_name': { - 'test_url1': 'test_name1', - 'test_url2': 'test_name2' - } - } - url = '{dss_url}/bundles/{bundle_uuid}?version={bundle_version}&replica=gcp&directurls=true'.format( - dss_url=self.DSS_URL, bundle_uuid=self.BUNDLE_UUID, bundle_version=self.BUNDLE_VERSION) + def test_get_manifest_file_dicts(self): + result = dcp_utils.get_manifest_file_dicts(self.ss2_manifest_json_v4) - def _request_callback(request, context): - context.status_code = 200 - return { - 'bundle': { - 'files': [ - {'name': 'test_name1', 'url': 'test_url1'}, - {'name': 'test_name2', 'url': 'test_url2'} - ] + name_to_meta = result['name_to_meta'] + url_to_name = result['url_to_name'] + self.assertEquals(len(name_to_meta), 5) + self.assertEquals(len(url_to_name), 5) + self.assertEquals(name_to_meta['R2.fastq.gz']['url'], 'gs://org-humancellatlas-dss-staging/blobs/foo.bar') + self.assertEquals(url_to_name['gs://org-humancellatlas-dss-staging/blobs/foo.bar'], 'R2.fastq.gz') - } - } - mock_request.get(url, json=_request_callback) + def test_get_file_uuid(self): + uuid = dcp_utils.get_file_uuid(self.ss2_manifest_files_v4, 'assay.json') + self.assertEqual(uuid, 'e56638c7-f026-42d0-9be8-24b71a7d6e86') + - result = dcp_utils.get_manifest_files(self.BUNDLE_UUID, self.BUNDLE_VERSION, self.DSS_URL, - self.TIMEOUT_SECONDS, self.RETRY_SECONDS) + def test_get_file_url(self): + url = dcp_utils.get_file_url(self.ss2_manifest_files_v4, 'R2.fastq.gz') + self.assertEqual(url, 'gs://org-humancellatlas-dss-staging/blobs/foo.bar') - self.assertEquals(result, expect_manifest) @requests_mock.mock() def test_auth_token(self, mock_request): @@ -91,3 +83,8 @@ def test_make_auth_header(self): headers = dcp_utils.make_auth_header(self.AUTH_TOKEN) self.assertEquals(headers, expect_header) + + + @staticmethod + def data_file(file_name): + return os.path.split(__file__)[0] + '/data/' + file_name diff --git a/pipeline_tools/tests/test_input_utils.py b/pipeline_tools/tests/test_input_utils.py new file mode 100644 index 
00000000..ca84e275
--- /dev/null
+++ b/pipeline_tools/tests/test_input_utils.py
@@ -0,0 +1,125 @@
+import unittest
+import json
+import os
+import pytest
+from pipeline_tools import input_utils
+from pipeline_tools import dcp_utils
+
+
+class TestInputUtils(unittest.TestCase):
+    """Tests for input_utils, run against the v4 and v5 metadata fixtures under data/metadata."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Load each JSON fixture once and keep it on the class for every test."""
+        with open(cls.data_file('metadata/v4/ss2_assay.json')) as f:
+            cls.ss2_assay_json_v4 = json.load(f)
+        with open(cls.data_file('metadata/v4/ss2_manifest.json')) as f:
+            cls.ss2_manifest_json_v4 = json.load(f)
+            cls.ss2_manifest_files_v4 = dcp_utils.get_manifest_file_dicts(cls.ss2_manifest_json_v4)
+        with open(cls.data_file('metadata/v5/ss2_manifest_stub.json')) as f:
+            cls.ss2_manifest_json_v5 = json.load(f)
+            cls.ss2_manifest_files_v5 = dcp_utils.get_manifest_file_dicts(cls.ss2_manifest_json_v5)
+        with open(cls.data_file('metadata/v5/ss2_files.json')) as f:
+            cls.ss2_files_json_v5 = json.load(f)
+        with open(cls.data_file('metadata/v4/optimus_assay.json')) as f:
+            cls.optimus_assay_json_v4 = json.load(f)
+        with open(cls.data_file('metadata/v4/optimus_manifest.json')) as f:
+            cls.optimus_manifest_json_v4 = json.load(f)
+
+    def test_get_sample_id_default_version(self):
+        sample_id = input_utils.get_sample_id(self.ss2_assay_json_v4)
+        self.assertEqual(sample_id, 'b0c57b9c-860b-4bbf-84aa-5f845508101d')
+
+    def test_get_sample_id_v4(self):
+        sample_id = input_utils.get_sample_id(self.ss2_assay_json_v4, '4')
+        self.assertEqual(sample_id, 'b0c57b9c-860b-4bbf-84aa-5f845508101d')
+
+    def test_get_sample_id_v5_raises_error(self):
+        with self.assertRaises(NotImplementedError):
+            input_utils.get_sample_id(self.ss2_assay_json_v4, '5')
+
+    def test_get_sample_id_non_existent_version_raises_error(self):
+        with self.assertRaises(NotImplementedError):
+            input_utils.get_sample_id(self.ss2_assay_json_v4, '-1')
+
+    def test_get_input_metadata_file_uuid_default_version(self):
+        uuid = input_utils.get_input_metadata_file_uuid(self.ss2_manifest_files_v4)
+        self.assertEqual(uuid, 'e56638c7-f026-42d0-9be8-24b71a7d6e86')
+
+    def test_get_input_metadata_file_uuid_v4(self):
+        uuid = input_utils.get_input_metadata_file_uuid(self.ss2_manifest_files_v4, '4')
+        self.assertEqual(uuid, 'e56638c7-f026-42d0-9be8-24b71a7d6e86')
+
+    def test_get_input_metadata_file_uuid_v5(self):
+        uuid = input_utils.get_input_metadata_file_uuid(self.ss2_manifest_files_v5, '5')
+        self.assertEqual(uuid, '851d312b-9d16-49d4-a0de-4d7def69d126')
+
+    def test_get_smart_seq_2_fastq_names_default_version(self):
+        fastq_1_name, fastq_2_name = input_utils.get_smart_seq_2_fastq_names(self.ss2_assay_json_v4)
+        self.assertEqual(fastq_1_name, 'R1.fastq.gz')
+        self.assertEqual(fastq_2_name, 'R2.fastq.gz')
+
+    def test_get_smart_seq_2_fastq_names_v4(self):
+        fastq_1_name, fastq_2_name = input_utils.get_smart_seq_2_fastq_names(self.ss2_assay_json_v4, '4')
+        self.assertEqual(fastq_1_name, 'R1.fastq.gz')
+        self.assertEqual(fastq_2_name, 'R2.fastq.gz')
+
+    def test_get_smart_seq_2_fastq_names_v5(self):
+        fastq_1_name, fastq_2_name = input_utils.get_smart_seq_2_fastq_names(self.ss2_files_json_v5, '5')
+        self.assertEqual(fastq_1_name, 'R1.fastq.gz')
+        self.assertEqual(fastq_2_name, 'R2.fastq.gz')
+
+    @pytest.mark.latest_schema
+    def test_get_smart_seq_2_fastq_names_latest(self):
+        with open(self.data_file('metadata/latest/ss2_files.json')) as f:
+            ss2_files_json_latest = json.load(f)
+        fastq_1_name, fastq_2_name = input_utils.get_smart_seq_2_fastq_names(ss2_files_json_latest, '5')
+        self.assertEqual(fastq_1_name, 'R1.fastq.gz')
+        self.assertEqual(fastq_2_name, 'R2.fastq.gz')
+
+    def test_get_optimus_lanes_default_version(self):
+        lanes = input_utils.get_optimus_lanes(self.optimus_assay_json_v4)
+        self.assertEqual(len(lanes), 2)
+        self.assertEqual(lanes[0]['r1'], 'pbmc8k_S1_L007_R1_001.fastq.gz')
+        self.assertEqual(lanes[0]['r2'], 'pbmc8k_S1_L007_R2_001.fastq.gz')
+        self.assertEqual(lanes[0]['i1'], 'pbmc8k_S1_L007_I1_001.fastq.gz')
+        self.assertEqual(lanes[1]['r1'], 'pbmc8k_S1_L008_R1_001.fastq.gz')
+        self.assertEqual(lanes[1]['r2'], 'pbmc8k_S1_L008_R2_001.fastq.gz')
+        self.assertEqual(lanes[1]['i1'], 'pbmc8k_S1_L008_I1_001.fastq.gz')
+
+    def test_get_optimus_lanes_v4(self):
+        lanes = input_utils.get_optimus_lanes(self.optimus_assay_json_v4, '4')
+        self.assertEqual(len(lanes), 2)
+        self.assertEqual(lanes[0]['r1'], 'pbmc8k_S1_L007_R1_001.fastq.gz')
+        self.assertEqual(lanes[0]['r2'], 'pbmc8k_S1_L007_R2_001.fastq.gz')
+        self.assertEqual(lanes[0]['i1'], 'pbmc8k_S1_L007_I1_001.fastq.gz')
+        self.assertEqual(lanes[1]['r1'], 'pbmc8k_S1_L008_R1_001.fastq.gz')
+        self.assertEqual(lanes[1]['r2'], 'pbmc8k_S1_L008_R2_001.fastq.gz')
+        self.assertEqual(lanes[1]['i1'], 'pbmc8k_S1_L008_I1_001.fastq.gz')
+
+    def test_get_optimus_lanes_v5(self):
+        with self.assertRaises(NotImplementedError):
+            input_utils.get_optimus_lanes(self.optimus_assay_json_v4, '5')
+
+    def test_get_optimus_inputs(self):
+        lanes = input_utils.get_optimus_lanes(self.optimus_assay_json_v4)
+        manifest_files = dcp_utils.get_manifest_file_dicts(self.optimus_manifest_json_v4)
+        r1, r2, i1 = input_utils.get_optimus_inputs(lanes, manifest_files)
+
+        expected_r1 = ['gs://foo/L7_R1.fastq.gz', 'gs://foo/L8_R1.fastq.gz']
+        expected_r2 = ['gs://foo/L7_R2.fastq.gz', 'gs://foo/L8_R2.fastq.gz']
+        expected_i1 = ['gs://foo/L7_I1.fastq.gz', 'gs://foo/L8_I1.fastq.gz']
+
+        self.assertEqual(r1, expected_r1)
+        self.assertEqual(r2, expected_r2)
+        self.assertEqual(i1, expected_i1)
+
+    @staticmethod
+    def data_file(file_name):
+        """Return the path of file_name inside the data directory next to this module."""
+        # Resolve __file__ first: a bare 'test_input_utils.py' has an empty dirname and would give '/data/...'.
+        return os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data', file_name)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test-requirements.txt b/test-requirements.txt
index 50effb2f..6aae93e1 100644
--- a/test-requirements.txt
+++ b/test-requirements.txt
@@ -1,2 +1,3 @@
 mock
 requests-mock
+pytest
diff --git a/test.sh b/test.sh
new file mode 100644
index 00000000..a5807b2d
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+
+# Runs unit test or integration test suite, depending on value of TEST_SUITE.
+#
+# Usage: bash test.sh
+#
+# If TEST_SUITE == unit, this will run unit tests.
+# If TEST_SUITE == latest_schema, this will run "latest_schema" tests, which will attempt
+# to run functions against the latest example files from the metadata-schema repo.
+#
+# Using an environment variable here instead of an argument to the script
+# for ease of use with Travis CI.
+
+# Set TEST_SUITE to "unit" by default, for convenience when testing locally.
+if [ -z "$TEST_SUITE" ]; then
+  printf "\nTEST_SUITE not specified. Defaulting to 'unit'.\n"
+  TEST_SUITE="unit"
+fi
+
+# For integration tests, copy down latest metadata file(s) to test against.
+if [ "$TEST_SUITE" = "latest_schema" ]; then
+  PYTEST_SUITE="latest_schema"
+  if [ ! -d pipeline_tools/tests/data/metadata/latest ]; then
+    mkdir pipeline_tools/tests/data/metadata/latest
+  fi
+  git clone https://github.com/HumanCellAtlas/metadata-schema.git pipeline_tools/tests/data/metadata-schema
+  cd pipeline_tools/tests/data/metadata-schema
+  git checkout develop
+  cd -
+  LATEST=$(python pipeline_tools/tests/get_latest_schema_example_version.py -d pipeline_tools/tests/data/metadata-schema/examples/bundles)
+  cp pipeline_tools/tests/data/metadata-schema/examples/bundles/"$LATEST"/Q4DemoSS2/files.json pipeline_tools/tests/data/metadata/latest/ss2_files.json
+elif [ "$TEST_SUITE" = "unit" ]; then
+  # Define unit tests to be anything not marked as "latest_schema"
+  PYTEST_SUITE="not latest_schema"
+else
+  printf "\nTEST_SUITE value %s not allowed. Must be 'unit' or 'latest_schema'.\n\n" "$TEST_SUITE"
+  exit 1
+fi
+
+pytest -v -m "$PYTEST_SUITE"