feat(parser): handle duplicate ID errors during parsing (#408)

* add duplicates at first, remove duplicated is_before links, remove duplicated child links * remove duplicate entry nodes * fix test * resolve comments * fix format issue from main
openfoodfacts · Feb 14, 2024 · e5720bf · e5720bf
1 parent 92ff90a
commit e5720bf
Show file tree

Hide file tree

Showing 4 changed files with 114 additions and 21 deletions.
diff --git a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -108,15 +108,15 @@ def _file_iter(self, filename: str, start: int = 0) -> Iterator[tuple[int, str]]
                 line_count += 1
             yield line_count, ""  # to end the last entry if not ended
 
-    def _add_line(self, line: str) -> str:
+    def _normalize_entry_id(self, raw_id: str) -> str:
         """
         Get a normalized string but keeping the language code "lc:",
         used for id and parent tag
         """
-        lc, line = line.split(":", 1)
-        new_line = lc + ":"
-        new_line += normalize_text(line, lc, stopwords=self.stopwords)
-        return new_line
+        lc, main_tag = raw_id.split(":", 1)
+        normalized_main_tag = normalize_text(main_tag, lc, stopwords=self.stopwords)
+        normalized_id = f"{lc}:{normalized_main_tag}"
+        return normalized_id
 
     def _get_lc_value(self, line: str) -> tuple[str, list[str]]:
         """Get the language code "lc" and a list of normalized values"""
@@ -251,20 +251,24 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
         for line_number, line in self._file_iter(filename, entries_start_line):
             # yield data if block ended
             if self._entry_end(line, data):
+                data = self._remove_separating_line(data)
+                if data.get_node_type() == NodeType.ENTRY:
+                    data = self._get_node_data_with_parent_and_end_comments(data)
                 if data.id in saved_nodes:
+                    # this duplicate node will be merged with the first one
+                    data.is_before = None
                     msg = (
-                        f"Entry with same id {data.id} already created, "
+                        f"WARNING: Entry with same id {data.id} already exists, "
                         f"duplicate id in file at line {data.src_position}. "
-                        "Node creation cancelled."
+                        "The two nodes will be merged, keeping the last "
+                        "values in case of conflicts."
                     )
                     self.parser_logger.error(msg)
                 else:
-                    data = self._remove_separating_line(data)
-                    if data.get_node_type() == NodeType.ENTRY:
-                        data = self._get_node_data_with_parent_and_end_comments(data)
-                    yield data  # another function will use this dictionary to create a node
                     saved_nodes.append(data.id)
-                data = NodeData(is_before=data.id)
+                    is_before = data.id
+                yield data  # another function will use this dictionary to create a node
+                data = NodeData(is_before=is_before)
 
             # harvest the line
             if not (line) or line[0] == "#":
@@ -319,11 +323,11 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
                         data.tags["tags_ids_" + lc] = value
                 elif line[0] == "<":
                     # parent definition
-                    data.parent_tags.append((self._add_line(line[1:]), line_number + 1))
+                    data.parent_tags.append((self._normalize_entry_id(line[1:]), line_number + 1))
                 elif language_code_prefix.match(line):
                     # synonyms definition
                     if not data.id:
-                        data.id = self._add_line(line.split(",", 1)[0])
+                        data.id = self._normalize_entry_id(line.split(",", 1)[0])
                         # first 2-3 characters before ":" are the language code
                         data.main_language = data.id.split(":", 1)[0]
                     # add tags and tagsid
@@ -404,7 +408,9 @@ def _normalise_and_validate_child_links(
 
         return normalised_child_links, missing_child_links
 
-    def _get_valid_child_links(self, entry_nodes: list[NodeData], raw_child_links: list[ChildLink]):
+    def _get_valid_child_links(
+        self, entry_nodes: list[NodeData], raw_child_links: list[ChildLink]
+    ) -> list[ChildLink]:
         """Get only the valid child links, i.e. the child links whose the parent_id exists"""
         node_ids = set([node.id for node in entry_nodes])
 
@@ -438,6 +444,58 @@ def _get_valid_child_links(self, entry_nodes: list[NodeData], raw_child_links: l
 
         return valid_child_links
 
+    def _remove_duplicate_child_links(self, child_links: list[ChildLink]) -> list[ChildLink]:
+        """Remove duplicate child links (i.e child links with the same parent_id and id)"""
+        unique_child_links = []
+        children_to_parents = collections.defaultdict(set)
+        for child_link in child_links:
+            parent_id, child_id = child_link["parent_id"], child_link["id"]
+            if parent_id not in children_to_parents[child_id]:
+                children_to_parents[child_id].add(parent_id)
+                unique_child_links.append(child_link)
+        return unique_child_links
+
+    def _merge_duplicate_entry_nodes(self, entry_nodes: list[NodeData]) -> list[NodeData]:
+        """Merge entry nodes with the same id:
+        - merge their tags (union)
+        - merge their properties (union, and in case of conflict, keep the last value)"""
+        unique_entry_nodes = []
+        ids_to_nodes = dict()
+        for node in entry_nodes:
+            if node.id in ids_to_nodes:
+                first_node = ids_to_nodes[node.id]
+                for key, value in node.tags.items():
+                    if not key.startswith("tags_ids_"):
+                        # union of the tags
+                        first_node.tags[key] = list(
+                            # we use a dict to remove duplicates
+                            # while keeping the order of the tags
+                            dict.fromkeys(first_node.tags.get(key, []) + value)
+                        )
+                        # we have to re-normalize the tags_ids and can't just do a union,
+                        # because two tags can have the same normalized value
+                        language_code = key.split("_")[1]
+                        first_node.tags[f"tags_ids_{language_code}"] = [
+                            normalize_text(tag, language_code, stopwords=self.stopwords)
+                            for tag in first_node.tags[key]
+                        ]
+                for key, value in node.properties.items():
+                    # overwrite the value if the property already exists in the first node
+                    first_node.properties[key] = value
+                for key, value in node.comments.items():
+                    # union of the comments
+                    first_node.comments[key] = list(
+                        # we use a dict to remove duplicates
+                        # while keeping the order of the tags
+                        dict.fromkeys(first_node.comments.get(key, []) + value)
+                    )
+                # union of the preceding_lines comments
+                first_node.preceding_lines.extend(node.preceding_lines)
+            else:
+                unique_entry_nodes.append(node)
+                ids_to_nodes[node.id] = node
+        return unique_entry_nodes
+
     def _create_taxonomy(self, filename: str) -> Taxonomy:
         """Create the taxonomy from the file"""
         self.parser_logger.info(f"Parsing {filename}")
@@ -467,11 +525,13 @@ def _create_taxonomy(self, filename: str) -> Taxonomy:
                         )
                     )
         valid_child_links = self._get_valid_child_links(entry_nodes, raw_child_links)
+        child_links = self._remove_duplicate_child_links(valid_child_links)
+        entry_nodes = self._merge_duplicate_entry_nodes(entry_nodes)
         return Taxonomy(
             entry_nodes=entry_nodes,
             other_nodes=other_nodes,
             previous_links=previous_links,
-            child_links=valid_child_links,
+            child_links=child_links,
         )
 
     def parse_file(self, filename: str, logger: ParserConsoleLogger | None = None) -> Taxonomy:

diff --git a/parser/tests/integration/test_parser_integration.py b/parser/tests/integration/test_parser_integration.py
@@ -195,15 +195,17 @@ def test_error_log(neo4j, tmp_path, caplog):
         number_of_nodes = result.value()[0]
         assert number_of_nodes == 2
         # error logged
-        assert "Entry with same id en:fake-meat already created" in caplog.text
+        assert "WARNING: Entry with same id en:fake-meat already exists," in caplog.text
         assert "duplicate id in file at line 12" in caplog.text
-        assert "Node creation cancelled." in caplog.text
+        assert "The two nodes will be merged, keeping the last" in caplog.text
+        assert "values in case of conflicts." in caplog.text
         # and present on project
         query = "MATCH (n:ERRORS) WHERE n.id = 'p_test_branch' RETURN n"
         results = session.run(query).value()
         node = results[0]
         assert len(node["errors"]) == 1
         error = node["errors"][0]
-        assert "Entry with same id en:fake-meat already created" in error
+        assert "WARNING: Entry with same id en:fake-meat already exists," in error
         assert "duplicate id in file at line 12" in error
-        assert "Node creation cancelled." in error
+        assert "The two nodes will be merged, keeping the last" in error
+        assert "values in case of conflicts." in error
diff --git a/taxonomy-editor-frontend/README.md b/taxonomy-editor-frontend/README.md
@@ -15,7 +15,6 @@ This is the main user interface developed in React, which works in conjunction w
 
 This project was initially setup using [Create React App](https://github.com/facebook/create-react-app), [the documentation](https://create-react-app.dev/) might be useful.
 
-
 ## Setup Dev Environment
 
 See [this guide](../doc/introduction/setup-dev.md) for more information.

diff --git a/taxonomy-editor-frontend/package-lock.json b/taxonomy-editor-frontend/package-lock.json