Skip to content

Commit

Permalink
(EAI-627): Skip snooty pages with noindex meta.robots tag (#577)
Browse files Browse the repository at this point in the history
* ingest snooty docs facets and meta

* page prefix on keys

* remove trailing/leading whitespace

* Support concurrent embedding

* page transform and exclude

* test chunk transformer

* skip noindex pages

* fix length b/c noindex page
  • Loading branch information
mongodben authored Dec 12, 2024
1 parent 370a2d5 commit 7f4e4a8
Show file tree
Hide file tree
Showing 5 changed files with 184 additions and 14 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import nock from "nock";
import { Readable } from "stream";
import fs from "fs";
import Path from "path";
import JSONL from "jsonl-parse-stringify";
Expand Down Expand Up @@ -56,7 +57,7 @@ describe("SnootyDataSource", () => {
});

const pages = await source.fetchPages();
expect(pages.length).toBe(12);
expect(pages).toHaveLength(11);
const astPages = JSONL.parse<{ type: string; data: { ast: SnootyNode } }>(
fs.readFileSync(sampleDataPath, "utf8")
);
Expand All @@ -82,7 +83,7 @@ describe("SnootyDataSource", () => {
snootyDataApiBaseUrl,
});
const pages = await source.fetchPages();
expect(pages.length).toBe(12);
expect(pages.length).toBe(11);
expect(pages[0]).toMatchObject({
format: "md",
sourceName: "snooty-docs",
Expand Down Expand Up @@ -174,8 +175,45 @@ describe("SnootyDataSource", () => {
)
).toBeUndefined();
});

// Verifies that fetchPages() drops a page whose meta directive carries a
// robots value containing "noindex", while still returning a regular page
// served in the same response.
it("skips noindex page", async () => {
const mockUrl = "https://example.com";
const noIndexMock = nock(mockUrl);
// Build a data source that talks to the mocked Snooty Data API base URL.
const source = await makeSnootyDataSource({
name: `snooty-test`,
project,
snootyDataApiBaseUrl: mockUrl,
});
// Serve a two-line JSONL body from the documents endpoint: the noindex
// fixture first, followed by an ordinary page fixture.
noIndexMock
.get(`/projects/${project.name}/${project.currentBranch}/documents`)
.reply(200, () => {
// NOTE(review): the noindex.json fixture also has `"deleted": true`. If
// fetchPages() discards deleted pages before the robots check runs, this
// test would pass without exercising the noindex path — confirm, and
// consider clearing that flag in the fixture.
const noIndexAst = jsonLify(
Path.resolve(SRC_ROOT, "../testData/noindex.json")
);

const astWithIndex = jsonLify(
Path.resolve(SRC_ROOT, "../testData/samplePage.json")
);

// One JSON document per line, newline-terminated.
const stream = new Readable();
stream.push(noIndexAst + "\n");
stream.push(astWithIndex + "\n");
stream.push(null); // End the stream
return stream;
});

const pages = await source.fetchPages();
// only captures the astWithIndex page, not the noIndexAst page
expect(pages).toHaveLength(1);
// Fails the test if the mocked documents endpoint was never requested.
noIndexMock.done();
});
});
});

/**
  Reads the JSON file at `path` and re-serializes it as compact, single-line
  JSON, suitable for use as one line of a JSONL stream.

  @param path - Location of a UTF-8 encoded JSON file.
  @returns The file's content as minified JSON with no insignificant whitespace.
  @throws If the file cannot be read or does not contain valid JSON.
 */
function jsonLify(path: string) {
  const rawText = fs.readFileSync(path, "utf-8");
  const parsed: unknown = JSON.parse(rawText);
  return JSON.stringify(parsed);
}
describe("handlePage()", () => {
it("should correctly parse openapi spec page", async () => {
const apiSpecPage = JSON.parse(
Expand Down Expand Up @@ -220,6 +258,6 @@ describe("handlePage()", () => {
version: "1.0",
},
});
expect(result.body).toContain("# $merge (aggregation)");
expect(result?.body).toContain("# $merge (aggregation)");
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,12 @@ export type SnootyMetaNode = SnootyNode & {
*/
description: string;
[key: string]: string | undefined;

/**
Robots meta tag value for the page.
@example "noindex, nofollow"
*/
robots?: string;
};
};

Expand Down Expand Up @@ -221,7 +227,9 @@ export const makeSnootyDataSource = ({
productName,
version,
});
pages.push(page);
if (page !== undefined) {
pages.push(page);
}
} catch (error) {
// Log the error and discard this document, but don't break the
// overall fetchPages() call.
Expand Down Expand Up @@ -333,7 +341,7 @@ export const handlePage = async (
productName?: string;
version?: string;
}
): Promise<Page> => {
): Promise<Page | undefined> => {
// Strip first three path segments - according to Snooty team, they'll always
// be ${property}/docsworker-xlarge/${branch}
const pagePath = page.page_id
Expand Down Expand Up @@ -361,7 +369,12 @@ export const handlePage = async (
body = snootyAstToMd(page.ast);
title = getTitleFromSnootyAst(page.ast);
}
const pageMetadata = getMetadataFromSnootyAst(page.ast);
const { metadata: pageMetadata, noIndex } = getMetadataFromSnootyAst(
page.ast
);
if (noIndex) {
return;
}

return {
url: new URL(pagePath, baseUrl.replace(/\/?$/, "/")).href.replace(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -315,22 +315,37 @@ describe("getMetadataFromSnootyAst", () => {
)
);
it("extracts meta directives", () => {
const metadata = getMetadataFromSnootyAst(sampleMetadataPage.data.ast);
const { metadata } = getMetadataFromSnootyAst(sampleMetadataPage.data.ast);
expect(metadata).toMatchObject({
description: expect.any(String),
});
});
it("extracts meta.keyword directives as string[]", () => {
const metadata = getMetadataFromSnootyAst(sampleMetadataPage.data.ast);
const { metadata } = getMetadataFromSnootyAst(sampleMetadataPage.data.ast);
expect(metadata).toMatchObject({
keywords: expect.arrayContaining([expect.any(String)]),
});
});
it("extracts facet directives", () => {
const metadata = getMetadataFromSnootyAst(sampleMetadataPage.data.ast);
const { metadata } = getMetadataFromSnootyAst(sampleMetadataPage.data.ast);
expect(metadata).toMatchObject({
genre: "tutorial",
foo: "bar",
});
});

// The shared sampleMetadataPage fixture is expected to produce noIndex === false.
it("doesn't extract noindex if not present", () => {
  const extracted = getMetadataFromSnootyAst(sampleMetadataPage.data.ast);
  expect(extracted.noIndex).toBe(false);
});

// The noindex.json fixture has a meta directive whose robots option contains
// "noindex", so extraction must flag the page.
it("extracts noindex if present", () => {
  const fixturePath = Path.resolve(SRC_ROOT, "../testData/noindex.json");
  const noIndexPage = JSON.parse(
    fs.readFileSync(fixturePath, { encoding: "utf-8" })
  );
  const { noIndex } = getMetadataFromSnootyAst(noIndexPage.data.ast);
  expect(noIndex).toBe(true);
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -211,9 +211,7 @@ export const getTitleFromSnootyAst = (node: SnootyNode): string | undefined => {
return textNodes.map(({ value }) => value).join("");
};

export const getMetadataFromSnootyAst = (
node: SnootyNode
): Record<string, unknown> => {
export const getMetadataFromSnootyAst = (node: SnootyNode) => {
const facetAndMetaNodes = findAll(
node,
({ name }) => name === "facet" || name === "meta"
Expand All @@ -238,6 +236,7 @@ export const getMetadataFromSnootyAst = (
return acc;
}, {} as Record<string, string>);

let noIndex = false;
const meta = metaNodes.reduce((acc, metaNode) => {
if (!metaNode.options) {
return acc;
Expand All @@ -248,13 +247,18 @@ export const getMetadataFromSnootyAst = (
acc[key] = value.split(",").map((s) => s.trim());
} else if (key === "description" && value) {
acc[key] = value;
} else if (key === "robots" && value) {
noIndex = value.includes("noindex");
}
}

return acc;
}, {} as Record<string, string | string[]>);
return {
...facets,
...meta,
metadata: {
...facets,
...meta,
},
noIndex,
};
};
100 changes: 100 additions & 0 deletions packages/ingest-mongodb-public/testData/noindex.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
{
"type": "page",
"data": {
"_id": "64e8d24124fcc731b479906d",
"github_username": "docs-builder-bot",
"page_id": "docs/docsworker-xlarge/master/release-notes/1.2",
"ast": {
"type": "root",
"position": { "start": { "line": 0 } },
"children": [
{
"type": "comment",
"position": { "start": { "line": 1 } },
"children": [
{
"type": "text",
"position": { "start": { "line": 1 } },
"value": "This page is hidden from the TOC and search indexing."
}
]
},
{
"type": "directive",
"position": { "start": { "line": 4 } },
"children": [],
"domain": "",
"name": "meta",
"argument": [],
"options": { "robots": "noindex, nosnippet" }
},
{
"type": "target",
"position": { "start": { "line": 7 } },
"children": [
{
"type": "target_identifier",
"position": { "start": { "line": 7 } },
"children": [
{
"type": "text",
"position": { "start": { "line": 11 } },
"value": "Release Notes for MongoDB 1.2.x"
}
],
"ids": ["release-notes-1.2"]
}
],
"domain": "std",
"name": "label",
"html_id": "std-label-release-notes-1.2"
},
{
"type": "section",
"position": { "start": { "line": 11 } },
"children": [
{
"type": "heading",
"position": { "start": { "line": 11 } },
"children": [
{
"type": "text",
"position": { "start": { "line": 11 } },
"value": "Release Notes for MongoDB 1.2.x"
}
],
"id": "release-notes-for-mongodb-1.2.x"
},
{
"type": "directive",
"position": { "start": { "line": 15 } },
"children": [],
"domain": "",
"name": "contents",
"argument": [
{
"type": "text",
"position": { "start": { "line": 15 } },
"value": "On this page"
}
],
"options": {
"local": true,
"backlinks": "none",
"depth": 1,
"class": "singlecol"
}
}
]
}
],
"fileid": "release-notes/1.2.txt"
},
"created_at": "2023-08-25T16:09:35.577Z",
"deleted": true,
"filename": "release-notes/1.2.txt",
"static_assets": [],
"updated_at": "2024-02-01T21:50:41.225Z",
"build_id": "65bc1166bdcf995e0c6983bb"
}
}

0 comments on commit 7f4e4a8

Please sign in to comment.