-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #24 from HumanCellAtlas/ds_schema_test
Make it possible to handle different metadata versions using the same version of pipeline-tools code. Switch to pytest and add a metadata schema integration test. Add some support for v5 metadata.
- Loading branch information
Showing
18 changed files
with
788 additions
and
55 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
from pipeline_tools import dcp_utils | ||
|
||
|
||
def get_sample_id(metadata, version='4'):
    """Return the sample id found in the given metadata.

    Args:
        metadata: Parsed metadata document, handed unchanged to the
            version-specific reader.
        version: Metadata schema major version, compared against the
            string '4'.

    Returns:
        Whatever the v4 reader extracts from ``metadata``.

    Raises:
        NotImplementedError: If ``version`` is anything other than '4'.
    """
    if version == '4':
        return _get_sample_id_v4(metadata)
    raise NotImplementedError('Only implemented for v4 metadata')
|
||
|
||
def _get_sample_id_v4(assay_json):
    """Read the sample id out of a v4 assay json document.

    The id is the value stored under the top-level ``has_input`` key.

    Raises:
        KeyError: If ``assay_json`` has no ``has_input`` key.
    """
    sample_id = assay_json["has_input"]
    return sample_id
|
||
|
||
def get_input_metadata_file_uuid(manifest_files, version='4'):
    """Return the uuid of the file that describes the pipeline input files.

    Which file that is depends on the metadata version: files.json for
    v5 and assay.json for v4.

    Args:
        manifest_files: Manifest data, handed unchanged to the
            version-specific lookup.
        version: Metadata schema major version, compared against the
            strings '5' and '4'.

    Returns:
        The uuid produced by the matching version-specific lookup.

    Raises:
        NotImplementedError: If ``version`` is neither '4' nor '5'.
    """
    if version == '5':
        return _get_input_metadata_file_uuid_v5(manifest_files)
    if version == '4':
        return _get_input_metadata_file_uuid_v4(manifest_files)
    raise NotImplementedError('Only implemented for v4 and v5 metadata')
|
||
|
||
def _get_input_metadata_file_uuid_v5(manifest_files):
    """Look up the uuid of the files.json file for v5 metadata.

    Delegates to ``dcp_utils.get_file_uuid`` with the file name
    'files.json' and returns its result unchanged.
    """
    metadata_file_name = 'files.json'
    return dcp_utils.get_file_uuid(manifest_files, metadata_file_name)
|
||
|
||
def _get_input_metadata_file_uuid_v4(manifest_files):
    """Look up the uuid of the assay.json file for v4 metadata.

    Delegates to ``dcp_utils.get_file_uuid`` with the file name
    'assay.json' and returns its result unchanged.
    """
    metadata_file_name = 'assay.json'
    return dcp_utils.get_file_uuid(manifest_files, metadata_file_name)
|
||
|
||
def get_smart_seq_2_fastq_names(metadata, version='4'):
    """Return the Smart-seq2 fastq file names found in the given metadata.

    Args:
        metadata: Parsed metadata document, handed unchanged to the
            version-specific reader.
        version: Metadata schema major version, compared against the
            strings '5' and '4'.

    Returns:
        The pair of fastq file names produced by the matching
        version-specific reader.

    Raises:
        NotImplementedError: If ``version`` is neither '4' nor '5'.
    """
    if version == '5':
        return _get_smart_seq_2_fastq_names_v5(metadata)
    if version == '4':
        return _get_smart_seq_2_fastq_names_v4(metadata)
    raise NotImplementedError('Only implemented for v4 and v5 metadata')
|
||
|
||
def _get_smart_seq_2_fastq_names_v5(files_json):
    """Extract the read1 and read2 fastq file names from a v5 files json.

    Every entry of ``files_json['files']`` is indexed by its
    ``content.read_index`` value, mapping to ``content.file_core.file_name``.
    When several entries share a read index, the last one wins.

    Returns:
        A ``(read1_name, read2_name)`` tuple.

    Raises:
        KeyError: If an entry lacks one of the keys read here, or if no
            entry has read index 'read1' or 'read2'.
    """
    name_by_read_index = {
        entry['content']['read_index']: entry['content']['file_core']['file_name']
        for entry in files_json['files']
    }
    return name_by_read_index['read1'], name_by_read_index['read2']
|
||
|
||
def _get_smart_seq_2_fastq_names_v4(assay_json):
    """Extract the r1 and r2 fastq file names from a v4 assay json.

    Only the first entry of ``content.seq.lanes`` is consulted.

    Returns:
        A ``(r1_name, r2_name)`` tuple.
    """
    first_lane = assay_json["content"]["seq"]["lanes"][0]
    return first_lane["r1"], first_lane["r2"]
|
||
|
||
def get_optimus_lanes(metadata_json, version='4'):
    """Return the lane metadata found in the given metadata document.

    Args:
        metadata_json: Parsed metadata document, handed unchanged to the
            version-specific reader.
        version: Metadata schema major version, compared against the
            string '4'.

    Returns:
        Whatever the v4 reader extracts from ``metadata_json``.

    Raises:
        NotImplementedError: If ``version`` is anything other than '4'.
    """
    if version == '4':
        return _get_optimus_lanes_v4(metadata_json)
    raise NotImplementedError('Only implemented for v4 metadata')
|
||
|
||
def _get_optimus_lanes_v4(assay_json):
    """Pull the lane metadata out of a v4 assay json document.

    Returns the object stored at ``content.seq.lanes`` as-is (no copy).
    """
    sequencing = assay_json['content']['seq']
    return sequencing['lanes']
|
||
|
||
def get_optimus_inputs(lanes, manifest_files):
    """Collect the fastq urls for every lane, grouped by read type.

    Each lane's 'r1', 'r2' and 'i1' values are file names; every name is
    resolved to a url through ``manifest_files['name_to_meta'][name]['url']``.

    Args:
        lanes: Iterable of lane mappings with 'r1', 'r2' and 'i1' keys.
        manifest_files: Mapping with a 'name_to_meta' entry keyed by
            file name.

    Returns:
        A ``(r1, r2, i1)`` tuple of url lists. Within each list the
        urls follow the order of ``lanes``.
    """
    def urls_for(read_key):
        # Resolve one read type across all lanes, keeping lane order.
        return [
            manifest_files['name_to_meta'][lane[read_key]]['url']
            for lane in lanes
        ]

    return urls_for('r1'), urls_for('r2'), urls_for('i1')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
{ | ||
"content": { | ||
"single_cell": { | ||
"cell_handling": "10x_v2", | ||
"cell_barcode": { | ||
"read": "Read 1", | ||
"size": 16, | ||
"white_list_file": "pbmc8k_S1_L007_R1_001.fastq.gz,pbmc8k_S1_L008_R1_001.fastq.gz", | ||
"offset": 0 | ||
} | ||
}, | ||
"core": { | ||
"type": "assay", | ||
"schema_url": "https://raw.githubusercontent.com/HumanCellAtlas/metadata-schema/4.6.1/json_schema/assay.json", | ||
"schema_version": "4.6.1" | ||
}, | ||
"rna": { | ||
"end_bias": "three_prime_end", | ||
"primer": "poly-dT", | ||
"strand": "both", | ||
"library_construction": "10x_v2" | ||
}, | ||
"assay_id": "c349cce6-6d63-4976-832e-3c27ca1399ac", | ||
"seq": { | ||
"paired_ends": true, | ||
"lanes": [ | ||
{ | ||
"i1": "pbmc8k_S1_L007_I1_001.fastq.gz", | ||
"number": 7, | ||
"r2": "pbmc8k_S1_L007_R2_001.fastq.gz", | ||
"r1": "pbmc8k_S1_L007_R1_001.fastq.gz" | ||
}, | ||
{ | ||
"i1": "pbmc8k_S1_L008_I1_001.fastq.gz", | ||
"number": 8, | ||
"r2": "pbmc8k_S1_L008_R2_001.fastq.gz", | ||
"r1": "pbmc8k_S1_L008_R1_001.fastq.gz" | ||
} | ||
], | ||
"instrument_platform": "Illumina", | ||
"molecule": "polyA RNA", | ||
"instrument_model": "HiSeq 4000", | ||
"umi_barcode": { | ||
"read": "Read 1", | ||
"offset": 16, | ||
"size": 10 | ||
} | ||
} | ||
}, | ||
"core": { | ||
"type": "assay_bundle", | ||
"schema_url": "https://raw.githubusercontent.com/HumanCellAtlas/metadata-schema/4.6.1/json_schema/assay_bundle.json", | ||
"schema_version": "4.6.1" | ||
}, | ||
"has_output": [ | ||
"c34f9bda-1621-4596-b93f-797552368282", | ||
"ed7d5ab4-8589-4e50-bb6c-5d4b459b183c", | ||
"9a4a1656-faab-448e-9717-3fb16843a314", | ||
"b7e2cfc0-8d3f-40b4-adf2-3c44112259dc", | ||
"072461ba-e1da-40e2-aa5d-626eedad7fef", | ||
"58ea2f4b-c4af-4b1b-8b6a-484d46d37de5" | ||
], | ||
"hca_ingest": { | ||
"accession": "", | ||
"submissionDate": "2018-01-16T16:23:53.023Z", | ||
"lastModifiedUser": "anonymousUser", | ||
"updateDate": "2018-01-16T16:24:04.590Z", | ||
"document_id": "01425de2-dcd2-479c-899a-b84763767e74", | ||
"user": "anonymousUser" | ||
}, | ||
"has_input": "42a6269e-8bc7-47ac-806b-3a53f8ba2a6f" | ||
} |
Oops, something went wrong.