Skip to content

Commit

Permalink
Scrape all pages, not only level 1 navigation (langchain-ai#1191)
Browse files Browse the repository at this point in the history
* Use sitemap to properly load all paths

Gitbook's server-side rendering means that when we fetch the contents of the index page,
the sub-menu items are not rendered and are therefore invisible to $('nav a').
This patch makes the loader read sitemap.xml instead, which lists every page of the site.

Inspired by the Python version.

* Fix lint issues

* Fix the logic

* Add the integration test provided by jacoblee93

* Fix loading the docs

* Fix formatting
  • Loading branch information
emilsedgh authored May 24, 2023
1 parent c661eec commit 151ee49
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 6 deletions.
19 changes: 19 additions & 0 deletions langchain/src/document_loaders/tests/gitbook.int.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import { test } from "@jest/globals";
import { GitbookLoader } from "../web/gitbook.js";

// Integration test: loads a single GitBook page through GitbookLoader and
// reports how many documents came back.
test("Test GitbookLoader", async () => {
  const pageUrl = "https://docs.gitbook.com/product-tour/navigation";
  const documents = await new GitbookLoader(pageUrl).load();

  console.log("Loaded", documents.length, "Gitbook documents");
});

// Integration test: loads a GitBook site with `shouldLoadAllPaths` enabled and
// reports how many documents came back.
//
// Registered with plain `test` rather than `test.only`: a focused test makes
// Jest skip every other test in this file, which would silently disable the
// single-page test above.
test("Test GitbookLoader with shouldLoadAllPaths", async () => {
  const loader = new GitbookLoader("https://docs.maildrop.cc", {
    shouldLoadAllPaths: true,
  });
  const docs = await loader.load();
  console.log("Loaded", docs.length, "Gitbook documents");
});
15 changes: 9 additions & 6 deletions langchain/src/document_loaders/web/gitbook.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,12 @@ export class GitbookLoader extends CheerioWebBaseLoader {
shouldLoadAllPaths = false;

/**
 * Creates a loader for the GitBook site at `webPath`.
 *
 * When `params.shouldLoadAllPaths` is strictly `true`, the URL passed to the
 * base class is `${webPath}/sitemap.xml`; otherwise it is `webPath` unchanged.
 *
 * @param webPath - URL of the GitBook page or site root.
 * @param params - Optional settings. An undefined `shouldLoadAllPaths` keeps
 *   the field's existing value.
 */
constructor(public webPath: string, params: GitbookLoaderParams = {}) {
// NOTE(review): this is a rendered diff with the +/- markers stripped. Going
// by the hunk header (-10,7 +10,12), the call on the next line appears to be
// the removed pre-patch line, superseded by `super(path)` below — confirm.
super(webPath);
// NOTE(review): a `webPath` ending in "/" yields ".../​/sitemap.xml" style
// double slashes here, since the separator is always inserted — confirm
// whether callers may pass a trailing slash.
const path =
params.shouldLoadAllPaths === true ? `${webPath}/sitemap.xml` : webPath;
super(path);

// Replace the value stored by the `public webPath` parameter property with
// the resolved URL (the sitemap URL when all paths are requested).
this.webPath = path;

// `??` only falls back when the option is null/undefined, so an explicit
// `false` from the caller is respected.
this.shouldLoadAllPaths =
params.shouldLoadAllPaths ?? this.shouldLoadAllPaths;
}
Expand Down Expand Up @@ -45,14 +50,12 @@ export class GitbookLoader extends CheerioWebBaseLoader {
}

private async loadAllPaths($: CheerioAPI): Promise<Document[]> {
const relative_paths = $("nav a")
const urls = $("loc")
.toArray()
.map((element) => $(element).attr("href"))
.filter((text) => text && text[0] === "/");
.map((element) => $(element).text());

const documents: Document[] = [];
for (const path of relative_paths) {
const url = this.webPath + path;
for (const url of urls) {
console.log(`Fetching text from ${url}`);
const html = await GitbookLoader._scrape(url, this.caller, this.timeout);
documents.push(...this.loadPath(html, url));
Expand Down

0 comments on commit 151ee49

Please sign in to comment.