Skip to content

Commit

Permalink
feat(parser): handle duplicate ID errors during parsing (#408)
Browse files Browse the repository at this point in the history
* add duplicates at first, remove duplicated is_before links, remove duplicated child links

* remove duplicate entry nodes

* fix test

* resolve comments

* fix format issue from main
  • Loading branch information
perierc authored Feb 14, 2024
1 parent 92ff90a commit e5720bf
Show file tree
Hide file tree
Showing 4 changed files with 114 additions and 21 deletions.
92 changes: 76 additions & 16 deletions parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,15 +108,15 @@ def _file_iter(self, filename: str, start: int = 0) -> Iterator[tuple[int, str]]
line_count += 1
yield line_count, "" # to end the last entry if not ended

def _add_line(self, line: str) -> str:
def _normalize_entry_id(self, raw_id: str) -> str:
"""
Get a normalized string but keeping the language code "lc:",
used for id and parent tag
"""
lc, line = line.split(":", 1)
new_line = lc + ":"
new_line += normalize_text(line, lc, stopwords=self.stopwords)
return new_line
lc, main_tag = raw_id.split(":", 1)
normalized_main_tag = normalize_text(main_tag, lc, stopwords=self.stopwords)
normalized_id = f"{lc}:{normalized_main_tag}"
return normalized_id

def _get_lc_value(self, line: str) -> tuple[str, list[str]]:
"""Get the language code "lc" and a list of normalized values"""
Expand Down Expand Up @@ -251,20 +251,24 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
for line_number, line in self._file_iter(filename, entries_start_line):
# yield data if block ended
if self._entry_end(line, data):
data = self._remove_separating_line(data)
if data.get_node_type() == NodeType.ENTRY:
data = self._get_node_data_with_parent_and_end_comments(data)
if data.id in saved_nodes:
# this duplicate node will be merged with the first one
data.is_before = None
msg = (
f"Entry with same id {data.id} already created, "
f"WARNING: Entry with same id {data.id} already exists, "
f"duplicate id in file at line {data.src_position}. "
"Node creation cancelled."
"The two nodes will be merged, keeping the last "
"values in case of conflicts."
)
self.parser_logger.error(msg)
else:
data = self._remove_separating_line(data)
if data.get_node_type() == NodeType.ENTRY:
data = self._get_node_data_with_parent_and_end_comments(data)
yield data # another function will use this dictionary to create a node
saved_nodes.append(data.id)
data = NodeData(is_before=data.id)
is_before = data.id
yield data # another function will use this dictionary to create a node
data = NodeData(is_before=is_before)

# harvest the line
if not (line) or line[0] == "#":
Expand Down Expand Up @@ -319,11 +323,11 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
data.tags["tags_ids_" + lc] = value
elif line[0] == "<":
# parent definition
data.parent_tags.append((self._add_line(line[1:]), line_number + 1))
data.parent_tags.append((self._normalize_entry_id(line[1:]), line_number + 1))
elif language_code_prefix.match(line):
# synonyms definition
if not data.id:
data.id = self._add_line(line.split(",", 1)[0])
data.id = self._normalize_entry_id(line.split(",", 1)[0])
# first 2-3 characters before ":" are the language code
data.main_language = data.id.split(":", 1)[0]
# add tags and tagsid
Expand Down Expand Up @@ -404,7 +408,9 @@ def _normalise_and_validate_child_links(

return normalised_child_links, missing_child_links

def _get_valid_child_links(self, entry_nodes: list[NodeData], raw_child_links: list[ChildLink]):
def _get_valid_child_links(
self, entry_nodes: list[NodeData], raw_child_links: list[ChildLink]
) -> list[ChildLink]:
"""Get only the valid child links, i.e. the child links whose the parent_id exists"""
node_ids = set([node.id for node in entry_nodes])

Expand Down Expand Up @@ -438,6 +444,58 @@ def _get_valid_child_links(self, entry_nodes: list[NodeData], raw_child_links: l

return valid_child_links

def _remove_duplicate_child_links(self, child_links: list[ChildLink]) -> list[ChildLink]:
"""Remove duplicate child links (i.e child links with the same parent_id and id)"""
unique_child_links = []
children_to_parents = collections.defaultdict(set)
for child_link in child_links:
parent_id, child_id = child_link["parent_id"], child_link["id"]
if parent_id not in children_to_parents[child_id]:
children_to_parents[child_id].add(parent_id)
unique_child_links.append(child_link)
return unique_child_links

def _merge_duplicate_entry_nodes(self, entry_nodes: list[NodeData]) -> list[NodeData]:
"""Merge entry nodes with the same id:
- merge their tags (union)
- merge their properties (union, and in case of conflict, keep the last value)"""
unique_entry_nodes = []
ids_to_nodes = dict()
for node in entry_nodes:
if node.id in ids_to_nodes:
first_node = ids_to_nodes[node.id]
for key, value in node.tags.items():
if not key.startswith("tags_ids_"):
# union of the tags
first_node.tags[key] = list(
# we use a dict to remove duplicates
# while keeping the order of the tags
dict.fromkeys(first_node.tags.get(key, []) + value)
)
# we have to re-normalize the tags_ids and can't just do a union,
# because two tags can have the same normalized value
language_code = key.split("_")[1]
first_node.tags[f"tags_ids_{language_code}"] = [
normalize_text(tag, language_code, stopwords=self.stopwords)
for tag in first_node.tags[key]
]
for key, value in node.properties.items():
# overwrite the value if the property already exists in the first node
first_node.properties[key] = value
for key, value in node.comments.items():
# union of the comments
first_node.comments[key] = list(
# we use a dict to remove duplicates
# while keeping the order of the tags
dict.fromkeys(first_node.comments.get(key, []) + value)
)
# union of the preceding_lines comments
first_node.preceding_lines.extend(node.preceding_lines)
else:
unique_entry_nodes.append(node)
ids_to_nodes[node.id] = node
return unique_entry_nodes

def _create_taxonomy(self, filename: str) -> Taxonomy:
"""Create the taxonomy from the file"""
self.parser_logger.info(f"Parsing {filename}")
Expand Down Expand Up @@ -467,11 +525,13 @@ def _create_taxonomy(self, filename: str) -> Taxonomy:
)
)
valid_child_links = self._get_valid_child_links(entry_nodes, raw_child_links)
child_links = self._remove_duplicate_child_links(valid_child_links)
entry_nodes = self._merge_duplicate_entry_nodes(entry_nodes)
return Taxonomy(
entry_nodes=entry_nodes,
other_nodes=other_nodes,
previous_links=previous_links,
child_links=valid_child_links,
child_links=child_links,
)

def parse_file(self, filename: str, logger: ParserConsoleLogger | None = None) -> Taxonomy:
Expand Down
10 changes: 6 additions & 4 deletions parser/tests/integration/test_parser_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,15 +195,17 @@ def test_error_log(neo4j, tmp_path, caplog):
number_of_nodes = result.value()[0]
assert number_of_nodes == 2
# error logged
assert "Entry with same id en:fake-meat already created" in caplog.text
assert "WARNING: Entry with same id en:fake-meat already exists," in caplog.text
assert "duplicate id in file at line 12" in caplog.text
assert "Node creation cancelled." in caplog.text
assert "The two nodes will be merged, keeping the last" in caplog.text
assert "values in case of conflicts." in caplog.text
# and present on project
query = "MATCH (n:ERRORS) WHERE n.id = 'p_test_branch' RETURN n"
results = session.run(query).value()
node = results[0]
assert len(node["errors"]) == 1
error = node["errors"][0]
assert "Entry with same id en:fake-meat already created" in error
assert "WARNING: Entry with same id en:fake-meat already exists," in error
assert "duplicate id in file at line 12" in error
assert "Node creation cancelled." in error
assert "The two nodes will be merged, keeping the last" in error
assert "values in case of conflicts." in error
1 change: 0 additions & 1 deletion taxonomy-editor-frontend/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ This is the main user interface developed in React, which works in conjunction w

This project was initially setup using [Create React App](https://github.com/facebook/create-react-app), [the documentation](https://create-react-app.dev/) might be useful.


## Setup Dev Environment

See [this guide](../doc/introduction/setup-dev.md) for more information.
Expand Down
32 changes: 32 additions & 0 deletions taxonomy-editor-frontend/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit e5720bf

Please sign in to comment.