From 5a0950159a0e6c6e6264cc1043e8e652f37b37a0 Mon Sep 17 00:00:00 2001 From: Saman Ehsan Date: Tue, 12 Feb 2019 17:13:55 -0500 Subject: [PATCH] Standardize Cromwell timestamps to follow analysis JSON schema (#117) * Standardize Cromwell datetime format Because the format of Cromwell timestamps is inconsistent, re-format them to follow the JSON date-time format as defined in the analysis process schema. * Add tests for formatting timestamps * Update docker images for testing * Update requirements * Update pipeline-tools version --- adapter_pipelines/Optimus/adapter.wdl | 2 +- adapter_pipelines/cellranger/adapter.wdl | 2 +- .../ss2_single_sample/adapter.wdl | 2 +- pipeline_tools/create_analysis_metadata.py | 24 +++++++++++++++---- .../tests/test_create_analysis_metadata.py | 18 ++++++++++++++ requirements.txt | 5 ++-- setup.py | 4 ++-- test-requirements.txt | 5 ++-- 8 files changed, 49 insertions(+), 13 deletions(-) diff --git a/adapter_pipelines/Optimus/adapter.wdl b/adapter_pipelines/Optimus/adapter.wdl index dceb6305..31cdf101 100644 --- a/adapter_pipelines/Optimus/adapter.wdl +++ b/adapter_pipelines/Optimus/adapter.wdl @@ -126,7 +126,7 @@ workflow AdapterOptimus { Int max_cromwell_retries = 0 Boolean add_md5s = false - String pipeline_tools_version = "v0.46.0" + String pipeline_tools_version = "v0.46.1" call GetInputs as prep { input: diff --git a/adapter_pipelines/cellranger/adapter.wdl b/adapter_pipelines/cellranger/adapter.wdl index fbd244a0..1709c4f7 100644 --- a/adapter_pipelines/cellranger/adapter.wdl +++ b/adapter_pipelines/cellranger/adapter.wdl @@ -150,7 +150,7 @@ workflow Adapter10xCount { Int max_cromwell_retries = 0 Boolean add_md5s = false - String pipeline_tools_version = "v0.46.0" + String pipeline_tools_version = "v0.46.1" call GetInputs { input: diff --git a/adapter_pipelines/ss2_single_sample/adapter.wdl b/adapter_pipelines/ss2_single_sample/adapter.wdl index 27fb3e24..05d3a386 100644 --- a/adapter_pipelines/ss2_single_sample/adapter.wdl +++ 
def format_timestamp(timestamp):
    """Standardize a Cromwell timestamp to the analysis process schema's date-time format.

    Cromwell timestamps are inconsistent (seconds or milliseconds may be
    missing), so every value is re-parsed and re-emitted in one fixed layout.

    Args:
        timestamp (str): A datetime string in any format `arrow.get` can parse.
            May be None or empty; callers pass values obtained with
            `dict.get`, which yields None for a missing key.

    Returns:
        str or None: The timestamp expressed in UTC as
            'YYYY-MM-DDTHH:mm:ss.SSSZ', or None when `timestamp` is None
            or empty.
    """
    if not timestamp:
        # Nothing to format; keep the field empty rather than raising.
        return None

    # The trailing 'Z' asserts that the value is UTC, so the parsed time must
    # be converted to UTC first. Without this, an input that carries another
    # offset (e.g. '-05:00') would keep its local clock time and be mislabeled.
    utc_time = arrow.get(timestamp).to('utc')
    return '{}Z'.format(utc_time.format('YYYY-MM-DDTHH:mm:ss.SSS'))
git+git://github.com/HumanCellAtlas/metadata-api@release/1.0b4#egg=hca-metadata-api[dss] diff --git a/setup.py b/setup.py index e1c6235a..decd9ca5 100644 --- a/setup.py +++ b/setup.py @@ -15,8 +15,9 @@ license='BSD 3-clause "New" or "Revised" License', packages=['pipeline_tools'], install_requires=[ + 'arrow>=0.12.1', 'google-auth>=1.6.1,<2', - 'google-cloud-storage>=1.8.0,<2', + 'google-cloud-storage>=1.10.0,<2', 'hca>=4.5.0,<5', 'hca-metadata-api', 'mock>=2.0.0,<3', @@ -24,7 +25,6 @@ 'requests-mock>=1.5.2,<2', 'setuptools_scm>=2.0.0,<3', 'tenacity>=4.10.0,<5', - 'google-cloud-storage>=1.10.0,<2', 'PyJWT==1.6.4' ], entry_points={ diff --git a/test-requirements.txt b/test-requirements.txt index 81d32562..e31f76be 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -1,4 +1,5 @@ -requests-mock==1.5.2 +backports.tempfile==1.0 +mock>=2.0.0,<3 pytest==3.6.3 +requests-mock>=1.5.2,<2 tenacity==4.10.0 -backports.tempfile==1.0