Skip to content

Commit

Permalink
fix(ena-submission): Parse get analysis process response better for s…
Browse files Browse the repository at this point in the history
…ingle segment case, return partial results and submit projects with holdUntilDate (#2896)

* Rename check_ena to get_ena_analysis_process, refactor so that get_chromsome_accessions is a separate sub-function of get_ena_analysis_process, and results are returned when either the gca accession OR the chromosome accessions are returned (in contrast to now only returning results when both are returned)

* Add tests that get_chromsome_accessions works for single and multi-segmented viruses now that we know the expected response format for both. I added a test where I mock the response using a known response text for a 1-segment case and confirm this works,

* Update assembly_table.results column response results as soon as gca or chromosome accessions are known, but only change from WAITING to SUBMITTED state when both are known. Do not update table when there are no results or results are the same as already in table.

* This PR additionally modifies the project submission request to by default submit projects with a hold_until_date parameter which is set to the day of submission (as we anyways only submit data to ENA when it is OPEN we should make the project public).

* Rename results to result to be more coherent with rest of code

---------

Co-authored-by: Cornelius Roemer <cornelius.roemer@gmail.com>
  • Loading branch information
anna-parker and corneliusroemer authored Sep 30, 2024
1 parent 19eeb82 commit 9920c16
Show file tree
Hide file tree
Showing 7 changed files with 257 additions and 101 deletions.
74 changes: 55 additions & 19 deletions ena-submission/scripts/create_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@
import pytz
import yaml
from ena_submission_helper import (
CreationResults,
check_ena,
CreationResult,
create_chromosome_list,
create_ena_assembly,
create_fasta,
create_manifest,
get_ena_analysis_process,
get_ena_config,
)
from ena_types import (
Expand Down Expand Up @@ -100,14 +100,12 @@ def create_chromosome_list_object(
return AssemblyChromosomeListFile(chromosomes=entries)


def get_segment_order(unaligned_sequences) -> list[str]:
def get_segment_order(unaligned_sequences: dict[str, str]) -> list[str]:
"""Order in which we put the segments in the chromosome list file"""
segment_order = []
if len(unaligned_sequences.keys()) > 1:
for segment_name, item in unaligned_sequences.items():
if item: # Only list sequenced segments
segment_order.append(segment_name)
else:
segment_order.append("main")
for segment_name, item in unaligned_sequences.items():
if item: # Only list sequenced segments
segment_order.append(segment_name)
return sorted(segment_order)


Expand Down Expand Up @@ -371,14 +369,14 @@ def assembly_table_create(
segment_order = get_segment_order(
sample_data_in_submission_table[0]["unaligned_nucleotide_sequences"]
)
assembly_creation_results: CreationResults = create_ena_assembly(
assembly_creation_results: CreationResult = create_ena_assembly(
ena_config, manifest_file, center_name=center_name, test=test
)
if assembly_creation_results.results:
assembly_creation_results.results["segment_order"] = segment_order
if assembly_creation_results.result:
assembly_creation_results.result["segment_order"] = segment_order
update_values = {
"status": Status.WAITING,
"result": json.dumps(assembly_creation_results.results),
"result": json.dumps(assembly_creation_results.result),
}
number_rows_updated = 0
tries = 0
Expand Down Expand Up @@ -448,17 +446,52 @@ def assembly_table_update(
logger.debug("Checking state in ENA")
for row in waiting:
seq_key = {"accession": row["accession"], "version": row["version"]}
segment_order = row["result"]["segment_order"]
check_results: CreationResults = check_ena(
ena_config, row["result"]["erz_accession"], segment_order
# Previous means from the last time the entry was checked, from db
previous_result = row["result"]
segment_order = previous_result["segment_order"]
new_result: CreationResult = get_ena_analysis_process(
ena_config, previous_result["erz_accession"], segment_order
)
_last_ena_check = time
if not check_results.results:

if not new_result.result:
continue

result_contains_gca_accession = "gca_accession" in new_result.result
result_contains_insdc_accession = any(
key.startswith("insdc_accession_full") for key in new_result.result
)

if not (result_contains_gca_accession and result_contains_insdc_accession):
if previous_result == new_result.result:
continue
update_values = {
"status": Status.WAITING,
"result": json.dumps(new_result.result),
"finished_at": datetime.now(tz=pytz.utc),
}
number_rows_updated = 0
tries = 0
while number_rows_updated != 1 and tries < retry_number:
if tries > 0:
logger.warning(
f"Assembly partially in ENA but DB update failed - reentry DB update #{tries}."
)
number_rows_updated = update_db_where_conditions(
db_config,
table_name="assembly_table",
conditions=seq_key,
update_values=update_values,
)
tries += 1
if number_rows_updated == 1:
logger.info(
f"Partial results of assembly submission for accession {row["accession"]} returned!"
)
continue
update_values = {
"status": Status.SUBMITTED,
"result": json.dumps(check_results.results),
"result": json.dumps(new_result.result),
"finished_at": datetime.now(tz=pytz.utc),
}
number_rows_updated = 0
Expand Down Expand Up @@ -522,7 +555,10 @@ def assembly_table_handle_errors(
f" status WAITING for over {time_threshold_waiting}h"
)
send_slack_notification(
config, error_msg, time=datetime.now(tz=pytz.utc), time_threshold=slack_time_threshold
error_msg,
slack_config,
time=datetime.now(tz=pytz.utc),
time_threshold=slack_time_threshold,
)


Expand Down
8 changes: 4 additions & 4 deletions ena-submission/scripts/create_project.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pytz
import yaml
from call_loculus import get_group_info
from ena_submission_helper import CreationResults, create_ena_project, get_ena_config
from ena_submission_helper import CreationResult, create_ena_project, get_ena_config
from ena_types import (
OrganismType,
ProjectLink,
Expand Down Expand Up @@ -275,11 +275,11 @@ def project_table_create(
logger.info(
f"Starting Project creation for group_id {row["group_id"]} organism {row["organism"]}"
)
project_creation_results: CreationResults = create_ena_project(ena_config, project_set)
if project_creation_results.results:
project_creation_results: CreationResult = create_ena_project(ena_config, project_set)
if project_creation_results.result:
update_values = {
"status": Status.SUBMITTED,
"result": json.dumps(project_creation_results.results),
"result": json.dumps(project_creation_results.result),
"finished_at": datetime.now(tz=pytz.utc),
}
number_rows_updated = 0
Expand Down
8 changes: 4 additions & 4 deletions ena-submission/scripts/create_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import click
import pytz
import yaml
from ena_submission_helper import CreationResults, create_ena_sample, get_ena_config
from ena_submission_helper import CreationResult, create_ena_sample, get_ena_config
from ena_types import (
ProjectLink,
SampleAttribute,
Expand Down Expand Up @@ -320,11 +320,11 @@ def sample_table_create(
)
continue
logger.info(f"Starting sample creation for accession {row["accession"]}")
sample_creation_results: CreationResults = create_ena_sample(ena_config, sample_set)
if sample_creation_results.results:
sample_creation_results: CreationResult = create_ena_sample(ena_config, sample_set)
if sample_creation_results.result:
update_values = {
"status": Status.SUBMITTED,
"result": json.dumps(sample_creation_results.results),
"result": json.dumps(sample_creation_results.result),
"finished_at": datetime.now(tz=pytz.utc),
}
number_rows_updated = 0
Expand Down
Loading

0 comments on commit 9920c16

Please sign in to comment.