From 780ad2af6eeb95c588f18b84a95e6e3522c3726f Mon Sep 17 00:00:00 2001 From: Charles Perier <82757576+perierc@users.noreply.github.com> Date: Wed, 7 Feb 2024 11:41:01 +0100 Subject: [PATCH] feat(parser,frontend,backend): conserve comments order and sort properties at export (#364) * feat: conserve comments order * remove unused import * sort properties at export and fix tests * avoid storing comments with empty lists * fix update_node controller to remove comments when needed * handle prop comments and potential issues * refactor and make pure functions * fix and format * hide comments in properties table, and fix set changing size during iteration --- backend/editor/entries.py | 41 +++++---- .../parser/parser.py | 9 +- .../parser/taxonomy_parser.py | 89 ++++++++++++++++--- .../openfoodfacts_taxonomy_parser/unparser.py | 15 ++-- parser/tests/data/test.txt | 2 +- parser/tests/unit/test_parser_unit.py | 2 +- .../editentry/ListAllEntryProperties.jsx | 30 ++----- .../pages/editentry/ListAllNonEntryInfo.jsx | 24 ----- 8 files changed, 124 insertions(+), 88 deletions(-) diff --git a/backend/editor/entries.py b/backend/editor/entries.py index 9b057b31..2bec7905 100644 --- a/backend/editor/entries.py +++ b/backend/editor/entries.py @@ -286,6 +286,12 @@ def is_valid_branch_name(self): """ return parser_utils.normalize_text(self.branch_name, char="_") == self.branch_name + def is_tag_key(self, key): + """ + Helper function to check if a key is a tag key (tags_XX) + """ + return key.startswith("tags_") and not ("_ids_" in key or key.endswith("_comments")) + async def list_projects(self, status=None): """ Helper function for listing all existing projects created in Taxonomy Editor @@ -494,12 +500,21 @@ async def update_node(self, label, entry, new_node): curr_node = result[0]["n"] deleted_keys = set(curr_node.keys()) - set(new_node.keys()) + # Check for deleted keys having associated comments and delete them too + comments_keys_to_delete = set() + for key in deleted_keys: + comments_key = key + "_comments" + if new_node.get(comments_key) is not None: + comments_keys_to_delete.add(comments_key) + deleted_keys = deleted_keys.union(comments_keys_to_delete) + # Check for keys having null/empty values for key in new_node.keys(): if ((new_node[key] == []) or (new_node[key] is None)) and key != "preceding_lines": deleted_keys.add(key) + deleted_keys.add(key + "_comments") # Delete tags_ids if we delete tags of a language - if key.startswith("tags_") and "_ids_" not in key: + if self.is_tag_key(key): deleted_keys.add("tags_ids_" + key.split("_", 1)[1]) # Build query @@ -515,20 +530,16 @@ async def update_node(self, label, entry, new_node): normalised_new_node = {} stopwords = await self.get_stopwords_dict() for key in set(new_node.keys()) - deleted_keys: - if key.startswith("tags_"): - if "_ids_" not in key: - keys_language_code = key.split("_", 1)[1] - normalised_value = [] - for value in new_node[key]: - normalised_value.append( - parser_utils.normalize_text( - value, keys_language_code, stopwords=stopwords - ) - ) - normalised_new_node[key] = new_node[key] - normalised_new_node["tags_ids_" + keys_language_code] = normalised_value - else: - pass # We generate tags_ids, and ignore the one sent + if self.is_tag_key(key): + # Normalise tags and store them in tags_ids + keys_language_code = key.split("_", 1)[1] + normalised_value = [] + for value in new_node[key]: + normalised_value.append( + parser_utils.normalize_text(value, keys_language_code, stopwords=stopwords) + ) + normalised_new_node[key] = new_node[key] + normalised_new_node["tags_ids_" + keys_language_code] = normalised_value else: # No need to normalise normalised_new_node[key] = new_node[key] diff --git a/parser/openfoodfacts_taxonomy_parser/parser/parser.py b/parser/openfoodfacts_taxonomy_parser/parser/parser.py index 824554f1..7460578f 100644 --- a/parser/openfoodfacts_taxonomy_parser/parser/parser.py +++ b/parser/openfoodfacts_taxonomy_parser/parser/parser.py @@ -66,16 +66,17 @@ def _create_entry_nodes(self, entry_nodes: list[NodeData], project_label: str): # we don't know in advance which properties and tags # we will encounter in the batch # so we accumulate them in this set - seen_properties_and_tags = set() + seen_properties_and_tags_and_comments = set() for entry_node in entry_nodes: if entry_node.get_node_type() != NodeType.ENTRY: raise ValueError(f"Only ENTRY nodes should be passed to this function") - seen_properties_and_tags.update(entry_node.tags) - seen_properties_and_tags.update(entry_node.properties) + seen_properties_and_tags_and_comments.update(entry_node.tags) + seen_properties_and_tags_and_comments.update(entry_node.properties) + seen_properties_and_tags_and_comments.update(entry_node.comments) additional_properties_queries = [ - f"{key} : entry_node.{key}" for key in seen_properties_and_tags + f"{key} : entry_node.{key}" for key in seen_properties_and_tags_and_comments ] base_properties_query = f""" diff --git a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py index 32ac4ce4..1397bba1 100644 --- a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py +++ b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py @@ -1,4 +1,5 @@ import collections +import copy import logging import re import sys @@ -25,10 +26,16 @@ class NodeData: is_before: str | None = None main_language: str | None = None preceding_lines: list[str] = field(default_factory=list) - parent_tag: list[tuple[str, int]] = field(default_factory=list) + parent_tags: list[tuple[str, int]] = field(default_factory=list) src_position: int | None = None properties: dict[str, str] = field(default_factory=dict) tags: dict[str, list[str]] = field(default_factory=dict) + comments: dict[str, list[str]] = field(default_factory=dict) + # comments_stack is a list of tuples (line_number, comment), + # to keep track of comments just above the current line + # during parsing of an entry, to be able to add them + # to the right property or tag when possible + comments_stack: list[(int, str)] = field(default_factory=list) def to_dict(self): return { @@ -38,6 +45,7 @@ def to_dict(self): "src_position": self.src_position, **self.properties, **self.tags, + **self.comments, } def get_node_type(self): @@ -151,12 +159,12 @@ def _header_harvest(self, filename: str) -> tuple[list[str], int]: def _entry_end(self, line: str, data: NodeData) -> bool: """Return True if the block ended""" - # stopwords and synonyms are one-liner, entries are separated by a blank line - if line.startswith("stopwords") or line.startswith("synonyms") or not line: - # can be the end of an block or just additional line separator, - # file_iter() always end with '' - if data.id: # to be sure that it's an end - return True + if data.id.startswith("stopwords") or data.id.startswith("synonyms"): + # stopwords and synonyms are one-liners; if the id is set, it's the end + return True + if not line and data.id: + # entries are separated by a blank line + return True return False def _remove_separating_line(self, data: NodeData) -> NodeData: @@ -186,6 +194,43 @@ def _remove_separating_line(self, data: NodeData) -> NodeData: data.preceding_lines.pop(0) return data + def _get_node_data_with_comments_above_key( + self, data: NodeData, line_number: int, key: str + ) -> NodeData: + """Returns the updated node data with comments above the given + key stored in the {key}_comments property.""" + new_data = copy.deepcopy(data) + + # Get comments just above the given line + comments_above = [] + current_line = line_number - 1 + while new_data.comments_stack and new_data.comments_stack[-1][0] == current_line: + comments_above.append(new_data.comments_stack.pop()[1]) + current_line -= 1 + if comments_above: + new_data.comments[key + "_comments"] = comments_above[::-1] + + return new_data + + def _get_node_data_with_parent_and_end_comments(self, data: NodeData) -> NodeData: + """Returns the updated node data with parent and end comments""" + new_data = copy.deepcopy(data) + + # Get parent comments (part of an entry block and just above/between the parents lines) + parent_comments = [] + while new_data.preceding_lines and new_data.preceding_lines[-1] != "": + parent_comments.append(new_data.preceding_lines.pop()) + if parent_comments: + new_data.comments["parent_comments"] = parent_comments[::-1] + + # Get end comments (part of an entry block after the last tag/prop + # and before the separating blank line) + end_comments = [comment[1] for comment in new_data.comments_stack] + if end_comments: + new_data.comments["end_comments"] = end_comments + + return new_data + def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]: """Transform data from file to dictionary""" saved_nodes = [] @@ -215,14 +260,21 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N self.parser_logger.error(msg) else: data = self._remove_separating_line(data) + if data.get_node_type() == NodeType.ENTRY: + data = self._get_node_data_with_parent_and_end_comments(data) yield data # another function will use this dictionary to create a node saved_nodes.append(data.id) data = NodeData(is_before=data.id) # harvest the line if not (line) or line[0] == "#": - # comment or blank - data.preceding_lines.append(line) + # comment or blank line + if data.id: + # we are within the node definition + data.comments_stack.append((line_number, line)) + else: + # we are before the actual node + data.preceding_lines.append(line) else: line = line.rstrip(",") if not data.src_position: @@ -234,7 +286,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N index_stopwords += 1 # remove "stopwords:" part line = line[10:] - # compute raw values outside _get_lc_value as it removes stop words ! + # compute raw values outside _get_lc_value as it normalizes them! tags = [words.strip() for words in line[3:].split(",")] try: lc, value = self._get_lc_value(line) @@ -252,7 +304,9 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N id = "synonyms:" + str(index_synonyms) data = self._set_data_id(data, id, line_number) index_synonyms += 1 + # remove "synonyms:" part line = line[9:] + # compute raw values outside _get_lc_value as it normalizes them! tags = [words.strip() for words in line[3:].split(",")] try: lc, value = self._get_lc_value(line) @@ -265,7 +319,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N data.tags["tags_ids_" + lc] = value elif line[0] == "<": # parent definition - data.parent_tag.append((self._add_line(line[1:]), line_number + 1)) + data.parent_tags.append((self._add_line(line[1:]), line_number + 1)) elif language_code_prefix.match(line): # synonyms definition if not data.id: @@ -286,6 +340,9 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N tagsids_list.append(word_normalized) data.tags["tags_" + lang] = tags_list data.tags["tags_ids_" + lang] = tagsids_list + data = self._get_node_data_with_comments_above_key( + data, line_number, "tags_" + lang + ) else: # property definition property_name = None @@ -306,7 +363,11 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N f"Reading error at line {line_number + 1}, unexpected format: '{self.parser_logger.ellipsis(line)}'" ) if property_name: - data.properties["prop_" + property_name + "_" + lc] = property_value + prop_key = "prop_" + property_name + "_" + lc + data.properties[prop_key] = property_value + data = self._get_node_data_with_comments_above_key( + data, line_number, prop_key + ) data.id = "__footer__" data.preceding_lines.pop(0) @@ -395,8 +456,8 @@ def _create_taxonomy(self, filename: str) -> Taxonomy: other_nodes.append(entry) if entry.is_before: previous_links.append(PreviousLink(before_id=entry.is_before, id=entry.id)) - if entry.parent_tag: - for position, (parent, line_position) in enumerate(entry.parent_tag): + if entry.parent_tags: + for position, (parent, line_position) in enumerate(entry.parent_tags): raw_child_links.append( ChildLink( parent_id=parent, diff --git a/parser/openfoodfacts_taxonomy_parser/unparser.py b/parser/openfoodfacts_taxonomy_parser/unparser.py index 25e1fbad..71d714f8 100644 --- a/parser/openfoodfacts_taxonomy_parser/unparser.py +++ b/parser/openfoodfacts_taxonomy_parser/unparser.py @@ -55,10 +55,10 @@ def get_tags_line(self, node, lc): return lc + ":" + line def list_property_and_lc(self, node): - """return a ordered list of properties with their language code (lc)""" + """return an ordered list of properties with their language code (lc)""" # there is no rule for the order of properties # properties will be arranged in alphabetical order - return [property[5:] for property in node if property.startswith("prop_")] + return sorted([property[5:] for property in node if property.startswith("prop_") and not property.endswith("_comments")]) def get_property_line(self, node, property): """return a string that should look like the original property line""" @@ -90,8 +90,7 @@ def iter_lines(self, project_label): if add_blank: yield "" # comments - if node["preceding_lines"]: - yield from node["preceding_lines"] + yield from node.get("preceding_lines", []) if has_content: tags_lc = self.list_tags_lc(node) if node["id"].startswith("stopwords"): @@ -99,19 +98,25 @@ def iter_lines(self, project_label): elif node["id"].startswith("synonyms"): yield "synonyms:" + self.get_tags_line(node, tags_lc[0]) else: - # synonyms line + # parents + yield from node.get("parent_comments", []) yield from self.get_parents_lines(parents) # main language synonyms first main_language = node.pop("main_language") tags_lc.remove(main_language) + yield from node.get("tags_" + main_language + "_comments", []) yield self.get_tags_line(node, main_language) # more synonyms after for lc in tags_lc: + yield from node.get("tags_" + lc + "_comments", []) yield self.get_tags_line(node, lc) # properties properties_list = self.list_property_and_lc(node) for property in properties_list: + yield from node.get("prop_" + property + "_comments", []) yield self.get_property_line(node, property) + # final comments + yield from node.get("end_comments", []) previous_block_id = node["id"] def rewrite_file(self, filename, lines): diff --git a/parser/tests/data/test.txt b/parser/tests/data/test.txt index 5837bf08..04f4ba85 100644 --- a/parser/tests/data/test.txt +++ b/parser/tests/data/test.txt @@ -23,8 +23,8 @@ fr:yaourts au fruit de la passion allégés # meat en:meat -vegan:en:no carbon_footprint_fr_foodges_value:fr:10 +vegan:en:no { // UUID of properties will have a "_uuid" suffix // Ex: prop_vegan_en_uuid - if (key.startsWith("prop") && key.endsWith("uuid")) { + if ( + key.startsWith("prop") && + key.endsWith("uuid") && + !key.endsWith("_comments_uuid") + ) { const uuid = nodeObject[key][0]; // UUID // Removing "prop_" prefix from key to render only the name const property_name = key.split("_").slice(1, -1).join("_"); @@ -33,13 +37,6 @@ const ListAllEntryProperties = ({ nodeObject, setNodeObject }) => { setData(renderedProperties); }, [nodeObject]); - // Helper function used for changing comments from node - const changeCommentData = (value) => { - const newNodeObject = { ...nodeObject }; - newNodeObject["preceding_lines"] = value; - setNodeObject(newNodeObject); - }; - // Helper function used for changing properties of node const changePropertyData = (key, value) => { setNodeObject((prevState) => { @@ -60,21 +57,6 @@ const ListAllEntryProperties = ({ nodeObject, setNodeObject }) => { return ( - {/* Comments */} - - Comments - - { - changeCommentData(event.target.value.split("\n")); - }} - value={nodeObject?.preceding_lines.join("\n")} - variant="outlined" - /> - {/* Properties */} { setRenderedNonEntryInfo(tagsExtracted); }, [nodeObject]); - // Helper function used for changing state of "preceding_lines" - const changeDataComment = (value) => { - const newNodeObject = { ...nodeObject }; - newNodeObject["preceding_lines"] = value.split("\n"); - setNodeObject(newNodeObject); - }; - // Helper function used for changing state of properties const changeData = (index, value) => { const updatedTagObject = { index: index, tag: value }; @@ -115,23 +108,6 @@ const ListAllNonEntryInfo = ({ nodeObject, id, setNodeObject }) => { return ( - {/* Comments */} - - Comments - - {nodeObject && ( - { - changeDataComment(event.target.value); - }} - value={nodeObject.preceding_lines} - variant="outlined" - /> - )} - {/* Main Language */} {(IDType === "Synonyms" || IDType === "Stopwords") && (