Fix tags extraction from body (#103)

* Update tests * [tags extraction from body][l]: fix three bugs — a commented line within a code block was incorrectly parsed as a tag (#99), a link to a GitHub issue was incorrectly parsed as a tag (#100), and mddb sometimes choked on accented characters within tags (#101)
datopian · Jan 4, 2024 · ae6575e · ae6575e
1 parent 137eff3
commit ae6575e
Show file tree

Hide file tree

Showing 7 changed files with 134 additions and 13 deletions.
diff --git a/.changeset/short-llamas-kneel.md b/.changeset/short-llamas-kneel.md
@@ -0,0 +1,5 @@
+---
+"mddb": patch
+---
+
+Fix tags from body extraction
diff --git a/__mocks__/content/index.mdx b/__mocks__/content/index.mdx
@@ -5,10 +5,30 @@ tags: tag1, tag2, tag3
 
 # Welcome
 
+## Test links
 [link](blog0.mdx)
 
+## Test tasks
 - [] uncompleted task 1
 - [ ] uncompleted task 2
 
 - [x] completed task 1
-- [X] completed task 2
+- [X] completed task 2
+
+## Test tags
+
+### Should be extracted 
+#日本語タグ
+Another tag: #标签名
+#метка
+Another tag: #태그이름
+#tag_فارسی
+#Tag_avec_éèç-_öäüßñ
+
+### Shouldn't be extracted 
+```bash
+#-------f--------------------------------------------------------------------------------
+#
+#---------------------------------------------------------------------------------------
+```
+#4874
diff --git a/src/lib/parseFile.ts b/src/lib/parseFile.ts
@@ -66,17 +66,40 @@ export const extractTagsFromBody = (ast: Root) => {
   const nodes = selectAll("*", ast);
   for (let index = 0; index < nodes.length; index++) {
     const node: any = nodes[index];
-    if (node.value) {
-      const textTags = node.value.match(/(?:^|\s)(#(\w+|\/|-|_)+)/g);
-      if (textTags) {
-        tags = tags.concat(textTags.map((tag: string) => tag.trim().slice(1))); // Extract tags and remove the '#'
-      }
+    const textContent = node.value; // extractTextWithoutCodeBlocks(node);
+    if (textContent && node.type !== "code") {
+      const textTags = extractTags(textContent);
+      tags = tags.concat(textTags);
     }
   }
 
   return tags;
 };
 
// Finds "#tag" candidates that either start the text or follow whitespace.
// Requiring whitespace before '#' keeps URL fragments such as
// ".../issues/100#comment" from being read as tags.
// Besides letters (\p{L}) the class accepts combining marks (\p{M}) and
// decimal digits of any script (\p{Nd}); without \p{M}, decomposed accents
// ("e" + U+0301) and scripts that rely on vowel signs would cut the tag short.
const TAG_CANDIDATE_PATTERN =
  /(?:^|\s)#[a-zA-Z0-9_\-\/\p{L}\p{M}\p{Nd}]+/gu;

// A whole tag made only of allowed characters (anchored on both ends).
const TAG_BODY_PATTERN = /^[a-zA-Z0-9_\-\/\p{L}\p{M}\p{Nd}]+$/u;

// At least one character that is not a digit, so "#4874" is not a tag.
const TAG_NON_NUMERIC_PATTERN = /[a-zA-Z_\-\/\p{L}]/u;

/**
 * Extracts hashtag-style tags from a piece of plain text.
 *
 * @param text - Text to scan for "#tag" occurrences.
 * @returns The tags in order of appearance, without the leading '#'.
 *          Duplicates are kept; an empty array is returned when nothing matches.
 */
function extractTags(text: string): string[] {
  const candidates = text.match(TAG_CANDIDATE_PATTERN);
  if (candidates === null) {
    return [];
  }

  return candidates
    .map((candidate) => candidate.trim().slice(1)) // drop leading whitespace and the '#'
    .filter(isValidTag);
}

/**
 * Checks whether a string (without the leading '#') is an acceptable tag.
 *
 * @param tag - Candidate tag text.
 * @returns `true` when the tag is longer than one character, consists only of
 *          letters, combining marks, digits, '_', '-' or '/', and contains at
 *          least one non-digit character.
 */
function isValidTag(tag: string): boolean {
  return (
    tag.length > 1 &&
    TAG_BODY_PATTERN.test(tag) && // anchored, so whitespace or punctuation anywhere rejects the tag
    TAG_NON_NUMERIC_PATTERN.test(tag)
  );
}
+
 export interface LinkExtractors {
   [test: string]: (node: any) => WikiLink;
 }

diff --git a/src/tests/extractTagsFromBody.spec.ts b/src/tests/extractTagsFromBody.spec.ts
@@ -39,7 +39,7 @@ describe("extractTagsFromBody", () => {
 
   test("should extract tags from both heading and body text", () => {
     const tags = getTagsFromSource(`# head #tag 
-    in heading and also in the #tag-body body text.`);
+in heading and also in the #tag-body body text.`);
     const expectedTags = ["tag", "tag-body"];
     expect(tags).toEqual(expectedTags);
   });
@@ -73,8 +73,8 @@ describe("extractTagsFromBody", () => {
   // for now we will pass the body content only not the whole source
   test("shouldn't extract frontmatter tags", () => {
     const tags = getTagsFromSource(`
-    No tags in this content.
-    #gr3
+No tags in this content.
+#gr3
     `);
     const expectedTags: string[] = ["gr3"];
     expect(tags).toEqual(expectedTags);

diff --git a/src/tests/markdowndb.spec.ts b/src/tests/markdowndb.spec.ts
@@ -203,6 +203,12 @@ describe("MarkdownDB - default config", () => {
         { name: "politics" },
         { name: "sports" },
         { name: "culture" },
+        { name: "日本語タグ" },
+        { name: "标签名" },
+        { name: "метка" },
+        { name: "태그이름" },
+        { name: "tag_فارسی" },
+        { name: "Tag_avec_éèç-_öäüßñ" },
       ];
 
       expect(dbTags).toHaveLength(extectedTags.length);

diff --git a/src/tests/parseFile.spec.ts b/src/tests/parseFile.spec.ts
@@ -12,14 +12,44 @@ tags: a, b, c
 ![[Some Image.png]]
 - [ ] uncompleted task
 - [x] completed task
+
+## Test tags
+
+### Should be extracted 
+#日本語タグ
+Another tag: #标签名
+#метка
+Another tag: #태그이름
+#tag_فارسی
+#Tag_avec_éèç-_öäüßñ
+
+### Shouldn't be extracted 
+\`\`\`bash
+#---------------------------------------------------------------------------------------
+# This's a title
+#---------------------------------------------------------------------------------------
+...
+\`\`\`
+
+#4874
 `;
 
 describe("parseFile", () => {
   it("should parse a file returning metadata and wiki links", () => {
     const expectedMetadata = {
       title: "Hello World",
       authors: ["John Doe", "Jane Doe"],
-      tags: ["a", "b", "c"],
+      tags: [
+        "a",
+        "b",
+        "c",
+        "日本語タグ",
+        "标签名",
+        "метка",
+        "태그이름",
+        "tag_فارسی",
+        "Tag_avec_éèç-_öäüßñ",
+      ],
       tasks: [
         { description: "uncompleted task", checked: false },
         { description: "completed task", checked: true },
@@ -68,7 +98,24 @@ describe("parseFile", () => {
     const expectedMetadata = {
       title: "Hello World",
       authors: ["John Doe", "Jane Doe"],
-      tags: ["a", "b", "c"],
+      // For some reason remark-parse duplicates tags when permalinks are passed
+      tags: [
+        "a",
+        "b",
+        "c",
+        "日本語タグ",
+        "标签名",
+        "метка",
+        "태그이름",
+        "tag_فارسی",
+        "Tag_avec_éèç-_öäüßñ",
+        "日本語タグ",
+        "标签名",
+        "метка",
+        "태그이름",
+        "tag_فارسی",
+        "Tag_avec_éèç-_öäüßñ",
+      ],
       tasks: [
         { description: "uncompleted task", checked: false },
         { description: "completed task", checked: true },

diff --git a/src/tests/process.spec.ts b/src/tests/process.spec.ts
@@ -18,10 +18,30 @@ describe("Can parse a file and get file info", () => {
     expect(fileInfo.file_path).toBe(fullPath);
     expect(fileInfo.url_path).toBe("index.mdx");
     expect(fileInfo.extension).toBe("mdx");
-    expect(fileInfo.tags).toEqual(["tag1", "tag2", "tag3"]);
+    expect(fileInfo.tags).toEqual([
+      "tag1",
+      "tag2",
+      "tag3",
+      "日本語タグ",
+      "标签名",
+      "метка",
+      "태그이름",
+      "tag_فارسی",
+      "Tag_avec_éèç-_öäüßñ",
+    ]);
     expect(fileInfo.metadata).toEqual({
       title: "Homepage",
-      tags: ["tag1", "tag2", "tag3"],
+      tags: [
+        "tag1",
+        "tag2",
+        "tag3",
+        "日本語タグ",
+        "标签名",
+        "метка",
+        "태그이름",
+        "tag_فارسی",
+        "Tag_avec_éèç-_öäüßñ",
+      ],
       tasks: [
         {
           checked: false,