Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions eva_sub_cli/semantic_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from datetime import datetime

import yaml
from requests import HTTPError

from retry import retry
from ebi_eva_common_pyutils.biosamples_communicators import NoAuthHALCommunicator
Expand Down Expand Up @@ -121,8 +122,14 @@ def check_all_analysis_run_accessions(self):
def check_accession_in_ena(self, ena_accession, accession_type, json_path):
    """Record an error against json_path when the accession cannot be fetched from ENA.

    :param ena_accession: accession looked up through the ENA browser XML endpoint
    :param accession_type: label used as the prefix of the error message (e.g. 'Project')
    :param json_path: metadata location passed to add_error for any failure
    """
    try:
        download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{ena_accession}')
    except HTTPError:
        # We cannot currently differentiate between the service returning an error and the
        # accession not existing, so every HTTP failure is reported with the same message.
        # The status code is deliberately not inspected: HTTPError.response may be None,
        # and reading .status_code on it would raise from inside this handler.
        self.add_error(json_path, f'{accession_type} {ena_accession} does not exist in ENA or is private')
    except Exception:
        # Anything that is not an HTTP error is reported as unexpected rather than as a missing accession.
        self.add_error(json_path, f'Unexpected error occurred when checking {accession_type} {ena_accession}')

def check_project_accession(self, project_acc, json_path):
self.check_accession_in_ena(project_acc, 'Project', json_path)
Expand Down Expand Up @@ -216,7 +223,10 @@ def _validate_biosample_against_checklist(self, sample_data, json_path, accessio

def _should_bypass_error(self, sample_data, error):
try:
created_dated = datetime.strptime(sample_data['create'],'%Y-%m-%dT%H:%M:%S.%fZ')
try:
created_dated = datetime.strptime(sample_data['create'],'%Y-%m-%dT%H:%M:%S.%fZ')
except ValueError:
created_dated = datetime.strptime(sample_data['create'],'%Y-%m-%dT%H:%M:%SZ')
if created_dated < threshold_2023 and (
'collection date' in error or
'geographic location (country and/or sea)' in error
Expand Down
31 changes: 27 additions & 4 deletions tests/test_semantic_metadata.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from unittest import TestCase
from unittest.mock import patch
from requests import HTTPError, Response

import pytest
from ebi_eva_common_pyutils.biosamples_communicators import NoAuthHALCommunicator

from eva_sub_cli.semantic_metadata import SemanticMetadataChecker
Expand All @@ -11,7 +13,8 @@
{"bioSampleAccession": "SAME00002"},
{"bioSampleAccession": "SAME00003"},
{"bioSampleAccession": "SAME00004"},
{"bioSampleAccession": "SAME00005"}
{"bioSampleAccession": "SAME00005"},
{"bioSampleAccession": "SAME00006"}
]
}
valid_sample = {
Expand Down Expand Up @@ -50,6 +53,15 @@
}
}

# Sample created in 2011 whose 'create' timestamp carries no fractional seconds.
# NOTE(review): presumably here to exercise the '%Y-%m-%dT%H:%M:%SZ' fallback format - confirm.
old_invalid_sample2 = {
    'accession': 'SAME00005',
    'name': 'sample4',
    'create': '2011-10-10T08:45:15Z',
    'characteristics': {
        'organism': [{'text': 'Viridiplantae'}],
    },
}


class TestSemanticMetadata(TestCase):

Expand All @@ -62,7 +74,7 @@ def test_check_all_project_accessions(self):
}
checker = SemanticMetadataChecker(metadata)
with patch('eva_sub_cli.semantic_metadata.download_xml_from_ena') as m_ena_download:
m_ena_download.side_effect = [True, True, Exception('problem downloading')]
m_ena_download.side_effect = [True, True, HTTPError('problem downloading', response=Response())]
checker.check_all_project_accessions()
self.assertEqual(checker.errors, [
{'property': '/project/childProjects/1', 'description': 'Project PRJEBNA does not exist in ENA or is private'}
Expand Down Expand Up @@ -170,7 +182,7 @@ def test_check_all_scientific_names(self):
def test_check_existing_biosamples_with_checklist(self):
checker = SemanticMetadataChecker(metadata)
with patch.object(SemanticMetadataChecker, '_get_biosample',
side_effect=[valid_sample, ValueError, invalid_sample1, invalid_sample2, old_invalid_sample]) as m_get_sample:
side_effect=[valid_sample, ValueError, invalid_sample1, invalid_sample2, old_invalid_sample, old_invalid_sample2]) as m_get_sample:
checker.check_existing_biosamples()
self.assertEqual(
checker.errors[0],
Expand All @@ -196,14 +208,25 @@ def test_check_existing_biosamples_with_checklist(self):
def test_check_existing_biosamples(self):
    """Check existing biosamples with sample_checklist=None and compare the recorded errors."""
    # One mocked follows_link result per sample entry, in order; ValueError is raised for the second.
    mocked_results = [
        valid_sample,
        ValueError,
        invalid_sample1,
        invalid_sample2,
        old_invalid_sample,
        old_invalid_sample2,
    ]
    # Only samples 1-3 are expected to produce errors; the two old samples produce none.
    expected_errors = [
        {'description': 'SAME00002 does not exist or is private',
         'property': '/sample/1/bioSampleAccession'},
        {'description': 'Existing sample SAME00003 does not have a valid collection date',
         'property': '/sample/2/bioSampleAccession'},
        {'description': 'Existing sample SAME00004 does not have a valid collection date',
         'property': '/sample/3/bioSampleAccession'},
        {'description': 'Existing sample SAME00004 does not have a valid geographic location',
         'property': '/sample/3/bioSampleAccession'},
    ]
    checker = SemanticMetadataChecker(metadata, sample_checklist=None)
    with patch.object(NoAuthHALCommunicator, 'follows_link', side_effect=mocked_results):
        checker.check_existing_biosamples()
    self.assertEqual(checker.errors, expected_errors)

@pytest.mark.skip(reason='Contact BioSample API')
def test_check_existing_real_biosamples(self):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Useful for testing against a specific BioSample.

metadata = {
"sample": [
{"bioSampleAccession": "SAMN01894452"}
]
}
checker = SemanticMetadataChecker(metadata, sample_checklist=None)
checker.check_existing_biosamples()
print(checker.errors)

def test_check_analysis_alias_coherence(self):
metadata = {
"analysis": [
Expand Down