Skip to content

Commit

Permalink
Fix generation of submission order (#101)
Browse files Browse the repository at this point in the history
  • Loading branch information
paulineribeyre authored Nov 6, 2023
1 parent 288b5f2 commit 3f5257f
Show file tree
Hide file tree
Showing 6 changed files with 172 additions and 43 deletions.
63 changes: 23 additions & 40 deletions datasimulator/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from .node import Node, logger
from .errors import UserError, DictionaryError
from .generator import generate_list_numbers
from .utils import generate_list_numbers_from_file
from .utils import generate_list_numbers_from_file, get_graph_traversal_path

EXCLUDED_NODE = ["program", "root", "data_release"]

Expand Down Expand Up @@ -102,7 +102,7 @@ def _add_required_link_to_node(
node_parent = self.get_node_with_name(link_node_name)

if not node_parent:
msg = "Node {} have a link to node {} which does not exist".format(
msg = "Node '{}' has a link to node '{}' which does not exist".format(
node.name, link_node_name
)
if skip:
Expand Down Expand Up @@ -203,53 +203,36 @@ def construct_graph_edges(self):

def generate_submission_order_path_to_node(self, node, cmc_node=None):
"""
Generate submission order so that the current node can be submitted
Args:
node(Node): current node object
Outputs:
list: list of submission order
From the specified `node`, step through the graph from bottom to top to generate the minimal
submission order to that node.
"""
submission_order = [node]
if cmc_node:
submission_order.append(cmc_node)
index = 0

project_node = None
while index < len(submission_order):
cur_node = submission_order[index]
index += 1
if not cur_node:
continue
for linked_node_dict in cur_node.required_links:
if linked_node_dict["node"].name == "project":
project_node = linked_node_dict["node"]
continue
if linked_node_dict["node"] in submission_order:
# reorder to place the node at the beginning. eg. if this node is parent to 2 other
# nodes, we need it to be submitted before the 2 others, and not in-between.
submission_order.remove(linked_node_dict["node"])
submission_order.append(linked_node_dict["node"])

if project_node: # project should always be submitted first
submission_order.append(project_node)

# get the bottom-to-top path from the node, and then reverse since the actual submission order
# must be top-to-bottom (parent nodes first)
submission_order = get_graph_traversal_path(direction="up", start_node=node)
submission_order.reverse()

# if specified, make sure that `core_metadata_collection` is in the submission order
if cmc_node and cmc_node not in submission_order:
# insert cmc node right after "project"
submission_order.insert(1, cmc_node)

return submission_order

def generate_submission_order(self):
"""
Generate submission order for the graph
Step through the graph from top to bottom to generate the submission order from the `project` node
to all leaf nodes.
"""
submission_order = []
# populate the nodes' `child_nodes` lists
for node in self.nodes:
if node not in submission_order:
for item in self.generate_submission_order_path_to_node(node):
if item not in submission_order:
submission_order.append(item)
if node.name == "project":
project_node = node
for link in node.required_links:
link["node"].child_nodes.append(node)

submission_order = get_graph_traversal_path(
direction="down", start_node=project_node
)

return submission_order

Expand Down
2 changes: 2 additions & 0 deletions datasimulator/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,8 @@ def __init__(self, node_name, node_schema, project, consent_codes):
"Error: NODE {} does not have key `{}`".format(node_name, e.message)
)
self.required_links = []
# useful property for traversing the graph from top to bottom:
self.child_nodes = []
self.simulated_dataset = []

def __str__(self):
Expand Down
36 changes: 36 additions & 0 deletions datasimulator/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,39 @@ def generate_list_numbers_from_file(data_file, submission_order, n_samples):
)

return result


def get_graph_traversal_path(direction, start_node):
    """
    Starting from the specified node, step through the graph through node links and return the
    visited nodes in dependency order.

    With direction="down" the walk follows each node's `child_nodes`, so every "parent" node is
    placed before its direct and indirect "child" nodes. With direction="up" the walk follows each
    node's `required_links`, so every "child" node is placed before its direct and indirect
    "parent" nodes (reverse the result to get a parents-first order).

    The graph is expected to be acyclic: the nodes of a cycle reachable from `start_node` would
    keep being re-queued and the loop would never end.

    Args:
        direction(str): "down" for top-to-bottom or "up" for bottom-to-top
        start_node(Node): Node at which to start the path
    Outputs:
        list[Node]: path
    Raises:
        ValueError: if `direction` is neither "down" nor "up"
    """
    if direction not in ("down", "up"):
        raise ValueError(
            f"Graph traversal is either top-to-bottom (direction=down) or bottom-to-top (direction=up). Provided value direction={direction} unknown."
        )

    to_visit = [start_node]
    path = [start_node]
    while to_visit:
        node = to_visit.pop()
        if direction == "down":
            links = node.child_nodes
        else:
            links = [link["node"] for link in node.required_links]
        for linked_node in links:
            # a node that is reached again through another link is moved to the end of the path,
            # so that it ends up after every node that links to it
            if linked_node in path:
                path.remove(linked_node)
            path.append(linked_node)
            # (re)visit the node so that the nodes it links to are moved after it as well
            if linked_node not in to_visit:
                to_visit.append(linked_node)

    return path
1 change: 1 addition & 0 deletions tests/schemas/gtex.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion tests/schemas/multiple_children_edge_case.json

Large diffs are not rendered by default.

111 changes: 109 additions & 2 deletions tests/test_data_simulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,111 @@ def test_get_schema(default_dictionary):
# TODO: delete generated files at the end of tests


def test_generate_submission_order():
    """
    Build the full submission order (from the project node to all leaf nodes) for the
    `schemas/gtex.json` dictionary, and make sure every parent node is submitted before the
    child nodes linked to it.
    """
    schema_path = os.path.join(MOD_DIR, "schemas/gtex.json")
    datadictionary = DataDictionary(local_file=schema_path)
    dictionary.init(datadictionary)

    graph = Graph(dictionary, "DEV", "test")
    graph.generate_nodes_from_dictionary()
    graph.construct_graph_edges()

    submission_order = graph.generate_submission_order()
    names = [n.name for n in submission_order]
    unique_names = set(names)
    assert len(names) == len(
        unique_names
    ), "There should be not duplicates in the submission order"

    # each parent must sit at a lower position than any of its children
    for parent_i, parent in enumerate(submission_order):
        for child in parent.child_nodes:
            child_i = submission_order.index(child)
            assert (
                parent_i < child_i
            ), f"Node '{parent.name}' should be submitted before its child '{child.name}'"

    expected_names = [
        "project",
        "publication",
        "study",
        "core_metadata_collection",
        "acknowledgement",
        "reference_file",
        "reference_file_index",
        "subject",
        "demographic",
        "exposure",
        "electrocardiogram_test",
        "sample",
        "blood_pressure_test",
        "sleep_test_file",
        "medical_history",
        "cardiac_mri",
        "imaging_file",
        "lab_result",
        "medication",
        "imaging_file_reference",
        "aliquot",
        "read_group",
        "submitted_aligned_reads",
        "submitted_unaligned_reads",
        "germline_mutation_calling_workflow",
        "alignment_cocleaning_workflow",
        "alignment_workflow",
        "aligned_reads",
        "aligned_reads_index",
        "simple_germline_variation",
        "germline_variation_index",
    ]
    assert names == expected_names


def test_generate_submission_order_path_to_node():
    """
    Generate the submission order from the project node to a specific node.
    Check that parent nodes are always submitted before the nodes that link to them.
    """
    datadictionary = DataDictionary(
        local_file=os.path.join(MOD_DIR, "schemas/gtex.json")
    )
    dictionary.init(datadictionary)

    graph = Graph(dictionary, "DEV", "test")
    graph.generate_nodes_from_dictionary()
    graph.construct_graph_edges()

    submission_order = graph.generate_submission_order_path_to_node(
        graph.get_node_with_name("submitted_aligned_reads")
    )
    names = [node.name for node in submission_order]
    assert len(names) == len(
        set(names)
    ), "There should be not duplicates in the submission order"

    # The nodes' `child_nodes` lists are filled in by `generate_submission_order`, which this
    # test does not call, so looping over `child_nodes` here would not check anything. Verify
    # the ordering through each node's `required_links` (its parents) instead.
    for node_i, node in enumerate(submission_order):
        for link in node.required_links:
            parent_node = link["node"]
            assert (
                parent_node.name in names
            ), f"Node '{parent_node.name}' is required by '{node.name}' and should be in the submission order"
            parent_i = names.index(parent_node.name)
            assert (
                parent_i < node_i
            ), f"Node '{parent_node.name}' should be submitted before its child '{node.name}'"

    assert names == [
        "project",
        "study",
        "subject",
        "sample",
        "aliquot",
        "read_group",
        "core_metadata_collection",
        "submitted_aligned_reads",
    ]


def test_generate_submission_order_path_to_node_multiple_children():
# this is a simplified version of the bpadictionary, where "aliquot" is child of both "sample" and "study",
# and "case" is child of "study". So "study" should be submitted before both "aliquot" and "case", and not
Expand All @@ -33,9 +138,11 @@ def test_generate_submission_order_path_to_node_multiple_children():
graph.generate_nodes_from_dictionary()
graph.construct_graph_edges()

node = [n for n in graph.nodes if n.name == "analyte"][0]
submission_order = [
node.name for node in graph.generate_submission_order_path_to_node(node)
node.name
for node in graph.generate_submission_order_path_to_node(
graph.get_node_with_name("analyte")
)
]

# before the fix, "study" was not submitted before "case":
Expand Down

0 comments on commit 3f5257f

Please sign in to comment.