Skip to content

Commit

Permalink
(EAI-375 lite): Include page metadata in ingest, but not chunking (#576)
Browse files Browse the repository at this point in the history
* ingest snooty docs facets and meta

* page prefix on keys

* remove trailing/leading whitespace

* Support concurrent embedding

* page transform and exclude

* test chunk transformer
  • Loading branch information
mongodben authored Dec 11, 2024
1 parent 9f8345e commit 370a2d5
Show file tree
Hide file tree
Showing 8 changed files with 485 additions and 5 deletions.
6 changes: 6 additions & 0 deletions examples/quick-start/packages/ingest/src/ingest.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,4 +56,10 @@ export default {

return [mongodbChatbotFrameworkSource];
},
// Concurrency settings for the `embed` step, returned lazily from a function.
// NOTE(review): presumably `createChunks` and `processPages` cap how many
// chunk-creation and page-processing tasks run at once — confirm against the
// `Config` type.
concurrencyOptions: () => ({
embed: {
createChunks: 5,
processPages: 2,
},
}),
} satisfies Config;
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ import { createInterface } from "readline";
import { Page, PageFormat, logger } from "mongodb-rag-core";
import fetch from "node-fetch";
import { DataSource, ProjectBase } from "mongodb-rag-core/dataSources";
import { snootyAstToMd, getTitleFromSnootyAst } from "./snootyAstToMd";
import {
snootyAstToMd,
getTitleFromSnootyAst,
getMetadataFromSnootyAst,
} from "./snootyAstToMd";
import {
getTitleFromSnootyOpenApiSpecAst,
snootyAstToOpenApiSpec,
Expand Down Expand Up @@ -49,6 +53,35 @@ export type SnootyTextNode = SnootyNode & {
value: string;
};

/**
A Snooty `facet` directive node: a `SnootyNode` whose `type` is `"directive"`
and whose `name` is `"facet"`.
*/
export type SnootyFacetNode = SnootyNode & {
type: "directive";
name: "facet";
// NOTE(review): typed as `never` — presumably facet directives carry no child
// nodes; confirm against the Snooty AST format.
children: never;
options?: {
/**
Name of the facet. `getMetadataFromSnootyAst` uses it as the metadata key.
*/
name: string;
/**
Value of the facet. `getMetadataFromSnootyAst` stores it unmodified under `name`.
*/
values: string;
};
};

/**
A Snooty `meta` directive node: a `SnootyNode` whose `type` is `"directive"`
and whose `name` is `"meta"`. Its `options` hold string-valued page metadata.
*/
export type SnootyMetaNode = SnootyNode & {
type: "directive";
name: "meta";
// NOTE(review): typed as `never` — presumably meta directives carry no child
// nodes; confirm against the Snooty AST format.
children: never;
options?: {
/**
List of relevant keywords for the page, comma separated.
@example "code example, node.js, analyze, array"
*/
keywords?: string;

/**
High-level description of the page.
*/
description: string;
// Any other option set on the directive; values are strings when present.
[key: string]: string | undefined;
};
};

/**
A page in the Snooty manifest.
*/
Expand Down Expand Up @@ -328,6 +361,7 @@ export const handlePage = async (
body = snootyAstToMd(page.ast);
title = getTitleFromSnootyAst(page.ast);
}
const pageMetadata = getMetadataFromSnootyAst(page.ast);

return {
url: new URL(pagePath, baseUrl.replace(/\/?$/, "/")).href.replace(
Expand All @@ -339,6 +373,7 @@ export const handlePage = async (
body,
format,
metadata: {
page: pageMetadata,
tags,
productName,
version,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
import Path from "path";
import fs from "fs";
import { snootyAstToMd, getTitleFromSnootyAst } from "./snootyAstToMd";
import {
snootyAstToMd,
getTitleFromSnootyAst,
getMetadataFromSnootyAst,
} from "./snootyAstToMd";
import { SnootyNode } from "./SnootyDataSource";
import { rstToSnootyAst } from "./rstToSnootyAst";

Expand Down Expand Up @@ -300,3 +304,33 @@ describe("getTitleFromSnootyAst", () => {
);
});
});

describe("getMetadataFromSnootyAst", () => {
  // Sample page fixture, loaded once when the suite is defined.
  const fixturePath = Path.resolve(
    SRC_ROOT,
    "../testData/samplePageWithMetadata.json"
  );
  const sampleMetadataPage = JSON.parse(
    fs.readFileSync(fixturePath, { encoding: "utf-8" })
  );

  // Every test runs its own extraction so the tests stay independent.
  const extractMetadata = () =>
    getMetadataFromSnootyAst(sampleMetadataPage.data.ast);

  it("extracts meta directives", () => {
    expect(extractMetadata()).toMatchObject({
      description: expect.any(String),
    });
  });

  it("extracts meta.keyword directives as string[]", () => {
    expect(extractMetadata()).toMatchObject({
      keywords: expect.arrayContaining([expect.any(String)]),
    });
  });

  it("extracts facet directives", () => {
    expect(extractMetadata()).toMatchObject({
      genre: "tutorial",
      foo: "bar",
    });
  });
});
58 changes: 56 additions & 2 deletions packages/ingest-mongodb-public/src/sources/snooty/snootyAstToMd.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
import { SnootyNode, SnootyTextNode } from "./SnootyDataSource";
import {
SnootyFacetNode,
SnootyMetaNode,
SnootyNode,
SnootyTextNode,
} from "./SnootyDataSource";
import { strict as assert } from "assert";
import { renderSnootyTable } from "./renderSnootyTable";

Expand Down Expand Up @@ -149,7 +154,8 @@ const renderDirective = (
.map((child) => renderAst(child, { parentHeadingLevel }))
.join("")}\n\n</Tab>\n\n`;
}
case "tabs" || "tabs-drivers":
case "tabs":
case "tabs-drivers":
return `\n\n<Tabs>\n\n${node.children
.map((child) => renderAst(child, { parentHeadingLevel }))
.join("")}\n\n</Tabs>\n\n`;
Expand Down Expand Up @@ -204,3 +210,51 @@ export const getTitleFromSnootyAst = (node: SnootyNode): string | undefined => {
) as SnootyTextNode[];
return textNodes.map(({ value }) => value).join("");
};

/**
  Extracts page-level metadata from the `facet` and `meta` directives found
  in a Snooty AST.

  - Each `facet` directive that has both a `name` and a `values` option adds
    a `{ [name]: values }` entry.
  - Each `meta` directive contributes its `description` option unchanged and
    its `keywords` option as a `string[]`: the comma-separated string is
    split, every keyword is trimmed, and empty keywords (for example from a
    trailing or doubled comma) are dropped. `keywords` is not set when no
    non-empty keyword remains. All other `meta` options are ignored.

  When a key is set more than once, the node that comes later in the order
  returned by `findAll` wins. `meta` entries override `facet` entries that
  use the same key.

  @param node - Snooty AST node to search for directives (callers pass the
  page's AST).
  @returns Flat record of the metadata found; an empty object if none.
 */
export const getMetadataFromSnootyAst = (
  node: SnootyNode
): Record<string, unknown> => {
  const facetAndMetaNodes = findAll(
    node,
    ({ name }) => name === "facet" || name === "meta"
  ) as (SnootyFacetNode | SnootyMetaNode)[];

  // Type predicates narrow the union without unchecked `as` casts.
  const facetNodes = facetAndMetaNodes.filter(
    (n): n is SnootyFacetNode => n.name === "facet"
  );
  const metaNodes = facetAndMetaNodes.filter(
    (n): n is SnootyMetaNode => n.name === "meta"
  );

  const facets: Record<string, string> = {};
  for (const facetNode of facetNodes) {
    if (!facetNode.options) {
      continue;
    }
    const { name, values } = facetNode.options;
    // Skip facets that are missing either half of the key/value pair.
    if (!name || !values) {
      continue;
    }
    facets[name] = values;
  }

  const meta: Record<string, string | string[]> = {};
  for (const metaNode of metaNodes) {
    if (!metaNode.options) {
      continue;
    }
    for (const [key, value] of Object.entries(metaNode.options)) {
      if (!value) {
        continue;
      }
      if (key === "keywords") {
        // `split` yields "" for trailing/adjacent commas and whitespace-only
        // entries; drop those so no blank keyword reaches the metadata.
        const keywords = value
          .split(",")
          .map((keyword) => keyword.trim())
          .filter((keyword) => keyword.length > 0);
        if (keywords.length > 0) {
          meta[key] = keywords;
        }
      } else if (key === "description") {
        meta[key] = value;
      }
    }
  }

  // Spread order matters: `meta` keys take precedence over `facet` keys.
  return {
    ...facets,
    ...meta,
  };
};
Loading

0 comments on commit 370a2d5

Please sign in to comment.