Skip to content

Commit

Permalink
Fix tags extraction from body (#103)
Browse files Browse the repository at this point in the history
* Update tests

* [tags extraction from body][l]: Fixes three issues: commented line within code block incorrectly parsed as tag (#99); link to GitHub issue incorrectly parsed as a tag (#100); mddb sometimes chokes on accented characters within tags (#101).
  • Loading branch information
mohamedsalem401 authored Jan 4, 2024
1 parent 137eff3 commit ae6575e
Show file tree
Hide file tree
Showing 7 changed files with 134 additions and 13 deletions.
5 changes: 5 additions & 0 deletions .changeset/short-llamas-kneel.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"mddb": patch
---

Fix tags from body extraction
22 changes: 21 additions & 1 deletion __mocks__/content/index.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,30 @@ tags: tag1, tag2, tag3

# Welcome

## Test links
[link](blog0.mdx)

## Test tasks
- [] uncompleted task 1
- [ ] uncompleted task 2

- [x] completed task 1
- [X] completed task 2
- [X] completed task 2

## Test tags

### Should be extracted
#日本語タグ
Another tag: #标签名
#метка
Another tag: #태그이름
#tag_فارسی
#Tag_avec_éèç-_öäüßñ

### Shouldn't be extracted
```bash
#-------f--------------------------------------------------------------------------------
#
#---------------------------------------------------------------------------------------
```
#4874
33 changes: 28 additions & 5 deletions src/lib/parseFile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -66,17 +66,40 @@ export const extractTagsFromBody = (ast: Root) => {
const nodes = selectAll("*", ast);
for (let index = 0; index < nodes.length; index++) {
const node: any = nodes[index];
if (node.value) {
const textTags = node.value.match(/(?:^|\s)(#(\w+|\/|-|_)+)/g);
if (textTags) {
tags = tags.concat(textTags.map((tag: string) => tag.trim().slice(1))); // Extract tags and remove the '#'
}
const textContent = node.value; // extractTextWithoutCodeBlocks(node);
if (textContent && node.type !== "code") {
const textTags = extractTags(textContent);
tags = tags.concat(textTags);
}
}

return tags;
};

/**
 * Matches a tag candidate: a `#` that sits at the very start of the text or
 * right after whitespace, followed by one or more tag characters.
 *
 * Tag characters are ASCII letters and digits, `_`, `-`, `/`, any Unicode
 * letter (`\p{L}`) and any Unicode combining mark (`\p{M}`). Marks are included
 * so that decomposed (NFD) accents and scripts that spell vowels with combining
 * signs are kept as part of the tag instead of cutting it short.
 */
const TAG_CANDIDATE_PATTERN = /(?:^|\s)#[a-zA-Z0-9_\-\/\p{L}\p{M}]+/gu;

/** The whole string consists of tag characters only (anchored on both ends). */
const TAG_ALLOWED_CHARS_PATTERN = /^[a-zA-Z0-9_\-\/\p{L}\p{M}]+$/u;

/** At least one character that is not a digit or a mark: a letter, `_`, `-` or `/`. */
const TAG_NON_NUMERIC_PATTERN = /[a-zA-Z_\-\/\p{L}]/u;

/**
 * Extracts hashtag-style tags (e.g. `#tag`, `#tag/sub`, `#метка`) from a piece of text.
 *
 * @param text - Plain text to scan.
 * @returns The tags in order of appearance, without the leading `#`.
 *          Duplicates are kept. Returns an empty array when nothing matches.
 */
function extractTags(text: string): string[] {
  const candidates = text.match(TAG_CANDIDATE_PATTERN);
  if (!candidates) {
    return [];
  }

  return candidates
    // Each match may start with the whitespace that preceded the '#':
    // drop it, then drop the '#' itself.
    .map((candidate) => candidate.trim().slice(1))
    .filter((tag) => isValidTag(tag));
}

/**
 * Checks whether a string (already stripped of its leading `#`) is a valid tag.
 *
 * A tag is valid when it:
 * - is longer than one character,
 * - contains at least one non-numerical character (so `#4874` is not a tag),
 * - consists only of letters in any language, combining marks, digits,
 *   `_`, `-` and `/` (which also rules out any whitespace).
 *
 * @param tag - Candidate tag without the leading `#`.
 * @returns `true` when the candidate satisfies every rule above.
 */
function isValidTag(tag: string): boolean {
  return (
    tag.length > 1 &&
    TAG_NON_NUMERIC_PATTERN.test(tag) &&
    // Anchored, so a single invalid character anywhere rejects the tag.
    TAG_ALLOWED_CHARS_PATTERN.test(tag)
  );
}

/**
 * Dictionary of link extractors: every string key maps to a function that
 * receives a node and returns a `WikiLink`.
 *
 * NOTE(review): the index key is named `test`, so it presumably holds the
 * test/selector string identifying which nodes the extractor applies to —
 * confirm against the code that consumes this interface.
 */
export interface LinkExtractors {
[test: string]: (node: any) => WikiLink;
}
Expand Down
6 changes: 3 additions & 3 deletions src/tests/extractTagsFromBody.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ describe("extractTagsFromBody", () => {

test("should extract tags from both heading and body text", () => {
const tags = getTagsFromSource(`# head #tag
in heading and also in the #tag-body body text.`);
in heading and also in the #tag-body body text.`);
const expectedTags = ["tag", "tag-body"];
expect(tags).toEqual(expectedTags);
});
Expand Down Expand Up @@ -73,8 +73,8 @@ describe("extractTagsFromBody", () => {
// for now we will pass the body content only not the whole source
test("shouldn't extract frontmatter tags", () => {
const tags = getTagsFromSource(`
No tags in this content.
#gr3
No tags in this content.
#gr3
`);
const expectedTags: string[] = ["gr3"];
expect(tags).toEqual(expectedTags);
Expand Down
6 changes: 6 additions & 0 deletions src/tests/markdowndb.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,12 @@ describe("MarkdownDB - default config", () => {
{ name: "politics" },
{ name: "sports" },
{ name: "culture" },
{ name: "日本語タグ" },
{ name: "标签名" },
{ name: "метка" },
{ name: "태그이름" },
{ name: "tag_فارسی" },
{ name: "Tag_avec_éèç-_öäüßñ" },
];

expect(dbTags).toHaveLength(extectedTags.length);
Expand Down
51 changes: 49 additions & 2 deletions src/tests/parseFile.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,44 @@ tags: a, b, c
![[Some Image.png]]
- [ ] uncompleted task
- [x] completed task
## Test tags
### Should be extracted
#日本語タグ
Another tag: #标签名
#метка
Another tag: #태그이름
#tag_فارسی
#Tag_avec_éèç-_öäüßñ
### Shouldn't be extracted
\`\`\`bash
#---------------------------------------------------------------------------------------
# This's a title
#---------------------------------------------------------------------------------------
...
\`\`\`
#4874
`;

describe("parseFile", () => {
it("should parse a file returning metadata and wiki links", () => {
const expectedMetadata = {
title: "Hello World",
authors: ["John Doe", "Jane Doe"],
tags: ["a", "b", "c"],
tags: [
"a",
"b",
"c",
"日本語タグ",
"标签名",
"метка",
"태그이름",
"tag_فارسی",
"Tag_avec_éèç-_öäüßñ",
],
tasks: [
{ description: "uncompleted task", checked: false },
{ description: "completed task", checked: true },
Expand Down Expand Up @@ -68,7 +98,24 @@ describe("parseFile", () => {
const expectedMetadata = {
title: "Hello World",
authors: ["John Doe", "Jane Doe"],
tags: ["a", "b", "c"],
// For some reason remark-parse duplicates tags when permalinks are passed
tags: [
"a",
"b",
"c",
"日本語タグ",
"标签名",
"метка",
"태그이름",
"tag_فارسی",
"Tag_avec_éèç-_öäüßñ",
"日本語タグ",
"标签名",
"метка",
"태그이름",
"tag_فارسی",
"Tag_avec_éèç-_öäüßñ",
],
tasks: [
{ description: "uncompleted task", checked: false },
{ description: "completed task", checked: true },
Expand Down
24 changes: 22 additions & 2 deletions src/tests/process.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,30 @@ describe("Can parse a file and get file info", () => {
expect(fileInfo.file_path).toBe(fullPath);
expect(fileInfo.url_path).toBe("index.mdx");
expect(fileInfo.extension).toBe("mdx");
expect(fileInfo.tags).toEqual(["tag1", "tag2", "tag3"]);
expect(fileInfo.tags).toEqual([
"tag1",
"tag2",
"tag3",
"日本語タグ",
"标签名",
"метка",
"태그이름",
"tag_فارسی",
"Tag_avec_éèç-_öäüßñ",
]);
expect(fileInfo.metadata).toEqual({
title: "Homepage",
tags: ["tag1", "tag2", "tag3"],
tags: [
"tag1",
"tag2",
"tag3",
"日本語タグ",
"标签名",
"метка",
"태그이름",
"tag_فارسی",
"Tag_avec_éèç-_öäüßñ",
],
tasks: [
{
checked: false,
Expand Down

0 comments on commit ae6575e

Please sign in to comment.