Standardize Cromwell timestamps to follow analysis JSON schema (#117)
* Standardize Cromwell datetime format

Because the format of Cromwell timestamps
is inconsistent, re-format them to follow
the JSON date-time format as defined in the
analysis process schema.

* Add tests for formatting timestamps

* Update docker images for testing

* Update requirements

* Update pipeline-tools version
samanehsan authored Feb 12, 2019
1 parent a2ab84a commit 5a09501
Showing 8 changed files with 49 additions and 13 deletions.
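
For context: the analysis process schema requires date-time strings of the form 'YYYY-MM-DDTHH:mm:ss.SSSZ', while Cromwell may report timestamps with or without seconds or milliseconds. Below is a minimal sketch of the normalization introduced here; it uses the same arrow calls as the new helper, and the sample timestamps are the ones exercised by the new tests.

import arrow

# Illustrative Cromwell-style timestamps: without seconds, without milliseconds,
# and already in the target format. All three normalize to the same value.
for raw in ('2019-02-11T01:15Z', '2019-02-11T01:15:00Z', '2019-02-11T01:15:00.000Z'):
    normalized = '{}Z'.format(arrow.get(raw).format('YYYY-MM-DDTHH:mm:ss.SSS'))
    print(raw, '->', normalized)  # each prints '... -> 2019-02-11T01:15:00.000Z'
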
2 changes: 1 addition & 1 deletion adapter_pipelines/Optimus/adapter.wdl
@@ -126,7 +126,7 @@ workflow AdapterOptimus {
Int max_cromwell_retries = 0
Boolean add_md5s = false
- String pipeline_tools_version = "v0.46.0"
+ String pipeline_tools_version = "v0.46.1"
call GetInputs as prep {
input:
2 changes: 1 addition & 1 deletion adapter_pipelines/cellranger/adapter.wdl
@@ -150,7 +150,7 @@ workflow Adapter10xCount {
Int max_cromwell_retries = 0
Boolean add_md5s = false
- String pipeline_tools_version = "v0.46.0"
+ String pipeline_tools_version = "v0.46.1"
call GetInputs {
input:
2 changes: 1 addition & 1 deletion adapter_pipelines/ss2_single_sample/adapter.wdl
@@ -82,7 +82,7 @@ workflow AdapterSmartSeq2SingleCell{
Int max_cromwell_retries = 0
Boolean add_md5s = false
- String pipeline_tools_version = "v0.46.0"
+ String pipeline_tools_version = "v0.46.1"
call GetInputs as prep {
input:
24 changes: 20 additions & 4 deletions pipeline_tools/create_analysis_metadata.py
@@ -8,6 +8,7 @@
from google.cloud import storage
from typing import List
import re
+ import arrow


def create_analysis_process(raw_schema_url,
@@ -60,8 +61,8 @@ def create_analysis_process(raw_schema_url,
'schema_type': SCHEMA_TYPE,
'process_core': get_analysis_process_core(analysis_workflow_id=analysis_id),
'process_type': get_analysis_process_type(),
- 'timestamp_start_utc': workflow_metadata.get('start'),
- 'timestamp_stop_utc': workflow_metadata.get('end'),
+ 'timestamp_start_utc': format_timestamp(workflow_metadata.get('start')),
+ 'timestamp_stop_utc': format_timestamp(workflow_metadata.get('end')),
'input_bundles': input_bundles_string.split(','),
'reference_bundle': reference_bundle,
'tasks': workflow_tasks,
@@ -352,8 +353,8 @@ def get_workflow_tasks(workflow_metadata):
'disk_size': runtime['disks'],
'docker_image': runtime['docker'],
'zone': runtime['zones'],
- 'start_time': task['start'],
- 'stop_time': task['end'],
+ 'start_time': format_timestamp(task['start']),
+ 'stop_time': format_timestamp(task['end']),
'log_out': task['stdout'],
'log_err': task['stderr']
}
@@ -362,6 +363,21 @@
return sorted_output_tasks


def format_timestamp(timestamp):
    """ Standardize Cromwell timestamps to follow the date-time JSON format required by the analysis process schema.
    Args:
        timestamp (str): A datetime string in any format
    Returns:
        formatted_timestamp (str): A datetime string in the format 'YYYY-MM-DDTHH:mm:ss.SSSZ'
    """
    if timestamp:
        d = arrow.get(timestamp)
        formatted_date = d.format('YYYY-MM-DDTHH:mm:ss.SSS')
        return '{}Z'.format(formatted_date)


def get_file_format(path, extension_to_format):
"""Returns the file type of the file at the given path.
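
A hedged usage sketch of the new helper. The import path and the cam alias mirror the test module below, and the trimmed-down workflow_metadata dict is hypothetical rather than a real Cromwell response.

from pipeline_tools import create_analysis_metadata as cam

workflow_metadata = {'start': '2019-02-11T01:15Z', 'end': '2019-02-11T01:20:30Z'}  # hypothetical sample

print(cam.format_timestamp(workflow_metadata.get('start')))   # 2019-02-11T01:15:00.000Z
print(cam.format_timestamp(workflow_metadata.get('end')))     # 2019-02-11T01:20:30.000Z
print(cam.format_timestamp(workflow_metadata.get('status')))  # None: a missing value passes through as None
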
18 changes: 18 additions & 0 deletions pipeline_tools/tests/test_create_analysis_metadata.py
@@ -253,3 +253,21 @@ def verify_tasks(self, tasks):
        assert first_task['cpus'] == 1
        assert first_task['disk_size'] == 'local-disk 10 HDD'
        assert first_task['docker_image'] == 'humancellatlas/picard:2.10.10'

    def test_format_timestamp_without_seconds(self):
        timestamp = '2019-02-11T01:15Z'
        formatted_datetime = cam.format_timestamp(timestamp)
        expected_datetime = '2019-02-11T01:15:00.000Z'
        assert formatted_datetime == expected_datetime

    def test_format_timestamp_without_milliseconds(self):
        timestamp = '2019-02-11T01:15:00Z'
        formatted_timestamp = cam.format_timestamp(timestamp)
        expected_timestamp = '2019-02-11T01:15:00.000Z'
        assert formatted_timestamp == expected_timestamp

    def test_formatting_correct_timestamp(self):
        timestamp = '2019-02-11T01:15:00.000Z'
        formatted_timestamp = cam.format_timestamp(timestamp)
        expected_timestamp = '2019-02-11T01:15:00.000Z'
        assert formatted_timestamp == expected_timestamp
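
One case the new tests leave implicit: a falsy timestamp (None or an empty string) skips the if-timestamp branch of format_timestamp and yields None. A hypothetical extra check in the same style as the tests above, not part of this commit:

    def test_format_timestamp_with_missing_value(self):
        # Hypothetical: a falsy timestamp falls through the `if timestamp:` guard and returns None.
        assert cam.format_timestamp(None) is None
        assert cam.format_timestamp('') is None
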
5 changes: 3 additions & 2 deletions requirements.txt
@@ -1,6 +1,7 @@
- requests==2.20.0
+ arrow>=0.12.1
+ requests>=2.20.0,<3
google-auth>=1.6.1,<2
- google-cloud-storage==1.8.0
+ google-cloud-storage>=1.10.0,<2
tenacity==4.10.0
PyJWT==1.6.4
git+git://github.com/HumanCellAtlas/metadata-api@release/1.0b4#egg=hca-metadata-api[dss]
4 changes: 2 additions & 2 deletions setup.py
@@ -15,16 +15,16 @@
license='BSD 3-clause "New" or "Revised" License',
packages=['pipeline_tools'],
install_requires=[
+ 'arrow>=0.12.1',
'google-auth>=1.6.1,<2',
- 'google-cloud-storage>=1.8.0,<2',
+ 'google-cloud-storage>=1.10.0,<2',
'hca>=4.5.0,<5',
'hca-metadata-api',
'mock>=2.0.0,<3',
'requests>=2.20.0,<3',
'requests-mock>=1.5.2,<2',
'setuptools_scm>=2.0.0,<3',
'tenacity>=4.10.0,<5',
- 'google-cloud-storage>=1.10.0,<2',
'PyJWT==1.6.4'
],
entry_points={
5 changes: 3 additions & 2 deletions test-requirements.txt
@@ -1,4 +1,5 @@
- requests-mock==1.5.2
+ backports.tempfile==1.0
+ mock>=2.0.0,<3
pytest==3.6.3
+ requests-mock>=1.5.2,<2
tenacity==4.10.0
- backports.tempfile==1.0
