Skip to content

Commit

Permalink
langchain[minor],docs[minor]: Add SitemapLoader (#4331)
Browse files Browse the repository at this point in the history
* init

* fixed and docs

* add page metadata to document metadata

* chore: lint files

* cr

* chore: lint files

* cr

* chore: lint files

* chore: lint files

* chore: lint files
  • Loading branch information
bracesproul committed Feb 8, 2024
1 parent 85f41f1 commit 38c6e87
Show file tree
Hide file tree
Showing 17 changed files with 332 additions and 4 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Sitemap Loader

This guide shows how to use the [`SitemapLoader`](https://api.js.langchain.com/classes/langchain_document_loaders_web_sitemap.SitemapLoader.html) class to load the pages listed in a sitemap into `Document`s.

## Setup

First, we need to install the `langchain` package:

```bash npm2yarn
npm install --save langchain
```

The URL you pass in should contain the `.xml` path to the sitemap. If it does not, a default `/sitemap.xml` is appended to the URL.

import CodeBlock from "@theme/CodeBlock";
import Example from "@examples/document_loaders/sitemap.ts";

<CodeBlock language="typescript">{Example}</CodeBlock>

Or, if you only want to load the sitemap itself and not the contents of each page it lists, you can use the `parseSitemap` method:

import ParseSitemapExample from "@examples/document_loaders/parse_sitemap.ts";

<CodeBlock language="typescript">{ParseSitemapExample}</CodeBlock>
1 change: 1 addition & 0 deletions environment_tests/test-exports-bun/src/entrypoints.js
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ export * from "langchain/text_splitter";
export * from "langchain/memory";
export * from "langchain/document";
export * from "langchain/document_loaders/base";
export * from "langchain/document_loaders/web/sitemap";
export * from "langchain/document_loaders/web/searchapi";
export * from "langchain/document_loaders/web/serpapi";
export * from "langchain/document_loaders/web/sort_xyz_blockchain";
Expand Down
1 change: 1 addition & 0 deletions environment_tests/test-exports-cf/src/entrypoints.js
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ export * from "langchain/text_splitter";
export * from "langchain/memory";
export * from "langchain/document";
export * from "langchain/document_loaders/base";
export * from "langchain/document_loaders/web/sitemap";
export * from "langchain/document_loaders/web/searchapi";
export * from "langchain/document_loaders/web/serpapi";
export * from "langchain/document_loaders/web/sort_xyz_blockchain";
Expand Down
1 change: 1 addition & 0 deletions environment_tests/test-exports-cjs/src/entrypoints.js
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ const text_splitter = require("langchain/text_splitter");
const memory = require("langchain/memory");
const document = require("langchain/document");
const document_loaders_base = require("langchain/document_loaders/base");
const document_loaders_web_sitemap = require("langchain/document_loaders/web/sitemap");
const document_loaders_web_searchapi = require("langchain/document_loaders/web/searchapi");
const document_loaders_web_serpapi = require("langchain/document_loaders/web/serpapi");
const document_loaders_web_sort_xyz_blockchain = require("langchain/document_loaders/web/sort_xyz_blockchain");
Expand Down
1 change: 1 addition & 0 deletions environment_tests/test-exports-esbuild/src/entrypoints.js
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ import * as text_splitter from "langchain/text_splitter";
import * as memory from "langchain/memory";
import * as document from "langchain/document";
import * as document_loaders_base from "langchain/document_loaders/base";
import * as document_loaders_web_sitemap from "langchain/document_loaders/web/sitemap";
import * as document_loaders_web_searchapi from "langchain/document_loaders/web/searchapi";
import * as document_loaders_web_serpapi from "langchain/document_loaders/web/serpapi";
import * as document_loaders_web_sort_xyz_blockchain from "langchain/document_loaders/web/sort_xyz_blockchain";
Expand Down
1 change: 1 addition & 0 deletions environment_tests/test-exports-esm/src/entrypoints.js
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ import * as text_splitter from "langchain/text_splitter";
import * as memory from "langchain/memory";
import * as document from "langchain/document";
import * as document_loaders_base from "langchain/document_loaders/base";
import * as document_loaders_web_sitemap from "langchain/document_loaders/web/sitemap";
import * as document_loaders_web_searchapi from "langchain/document_loaders/web/searchapi";
import * as document_loaders_web_serpapi from "langchain/document_loaders/web/serpapi";
import * as document_loaders_web_sort_xyz_blockchain from "langchain/document_loaders/web/sort_xyz_blockchain";
Expand Down
1 change: 1 addition & 0 deletions environment_tests/test-exports-vercel/src/entrypoints.js
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ export * from "langchain/text_splitter";
export * from "langchain/memory";
export * from "langchain/document";
export * from "langchain/document_loaders/base";
export * from "langchain/document_loaders/web/sitemap";
export * from "langchain/document_loaders/web/searchapi";
export * from "langchain/document_loaders/web/serpapi";
export * from "langchain/document_loaders/web/sort_xyz_blockchain";
Expand Down
1 change: 1 addition & 0 deletions environment_tests/test-exports-vite/src/entrypoints.js
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ export * from "langchain/text_splitter";
export * from "langchain/memory";
export * from "langchain/document";
export * from "langchain/document_loaders/base";
export * from "langchain/document_loaders/web/sitemap";
export * from "langchain/document_loaders/web/searchapi";
export * from "langchain/document_loaders/web/serpapi";
export * from "langchain/document_loaders/web/sort_xyz_blockchain";
Expand Down
35 changes: 35 additions & 0 deletions examples/src/document_loaders/parse_sitemap.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import { SitemapLoader } from "langchain/document_loaders/web/sitemap";

// Point the loader at the site; per the docs, `/sitemap.xml` is appended
// when the URL does not already contain an `.xml` path.
const sitemapLoader = new SitemapLoader("https://www.langchain.com/");

// Parse only the sitemap entries, without loading the contents of each page.
const sitemapEntries = await sitemapLoader.parseSitemap();
console.log(sitemapEntries);
/**
[
{
loc: 'https://www.langchain.com/blog-detail/starting-a-career-in-design',
changefreq: '',
lastmod: '',
priority: ''
},
{
loc: 'https://www.langchain.com/blog-detail/building-a-navigation-component',
changefreq: '',
lastmod: '',
priority: ''
},
{
loc: 'https://www.langchain.com/blog-detail/guide-to-creating-a-website',
changefreq: '',
lastmod: '',
priority: ''
},
{
loc: 'https://www.langchain.com/page-1/terms-and-conditions',
changefreq: '',
lastmod: '',
priority: ''
},
...42 more items
]
*/
34 changes: 34 additions & 0 deletions examples/src/document_loaders/sitemap.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import { SitemapLoader } from "langchain/document_loaders/web/sitemap";

// Point the loader at the site; per the docs, `/sitemap.xml` is appended
// when the URL does not already contain an `.xml` path.
const sitemapLoader = new SitemapLoader("https://www.langchain.com/");

// Load the pages listed in the sitemap as Documents.
const documents = await sitemapLoader.load();
console.log(documents.length);
/**
26
*/
// Inspect the first loaded Document (page content plus sitemap metadata).
const firstDocument = documents[0];
console.log(firstDocument);
/**
Document {
pageContent: '\n' +
' \n' +
'\n' +
' \n' +
' \n' +
' Blog ArticleApr 8, 2022As the internet continues to develop and grow exponentially, jobs related to the industry do too, particularly those that relate to web design and development. The prediction is that by 2029, the job outlook for these two fields will grow by 8%—significantly faster than average. Whether you’re seeking salaried employment or aiming to work in a freelance capacity, a career in web design can offer a variety of employment arrangements, competitive salaries, and opportunities to utilize both technical and creative skill sets.What does a career in web design involve?A career in website design can involve the design, creation, and coding of a range of website types. Other tasks will typically include liaising with clients and discussing website specifications, incorporating feedback, working on graphic design and image editing, and enabling multimedia features such as audio and video. Requiring a range of creative and technical skills, web designers may be involved in work across a range of industries, including software companies, IT consultancies, web design companies, corporate organizations, and more. In contrast with web developers, web designers tend to play a more creative role, crafting the overall vision and design of a site, and determining how to best incorporate the necessary functionality. However, there can be significant overlap between the roles.Full-stack, back-end, and front-end web developmentThe U.S. Bureau of Labor Statistics (BLS) Occupational Outlook Handbook tends to group web developers and digital designers into one category. However, they define them separately, stating that web developers create and maintain websites and are responsible for the technical aspects including performance and capacity. Web or digital designers, on the other hand, are responsible for the look and functionality of websites and interfaces. They develop, create, and test the layout, functions, and navigation for usability. 
Web developers can focus on the back-end, front-end, or full-stack development, and typically utilize a range of programming languages, libraries, and frameworks to do so. Web designers may work more closely with front-end engineers to establish the user-end functionality and appearance of a site.Are web designers in demand in 2022?In our ever-increasingly digital environment, there is a constant need for websites—and therefore for web designers and developers. With 17.4 billion websites in existence as of January 2020, the demand for web developers is only expected to rise.Web designers with significant coding experience are typically in higher demand, and can usually expect a higher salary. Like all jobs, there are likely to be a range of opportunities, some of which are better paid than others. But certain skill sets are basic to web design, most of which are key to how to become a web designer in 2022.const removeHiddenBreakpointLayers = function ie(e){function t(){for(let{hash:r,mediaQuery:i}of e){if(!i)continue;if(window.matchMedia(i).matches)return r}return e[0]?.hash}let o=t();if(o)for(let r of document.querySelectorAll(".hidden-"+o))r.parentNode?.removeChild(r);for(let r of document.querySelectorAll(".ssr-variant")){for(;r.firstChild;)r.parentNode?.insertBefore(r.firstChild,r);r.parentNode?.removeChild(r)}for(let r of document.querySelectorAll("[data-framer-original-sizes]")){let i=r.getAttribute("data-framer-original-sizes");i===""?r.removeAttribute("sizes"):r.setAttribute("sizes",i),r.removeAttribute("data-framer-original-sizes")}};removeHiddenBreakpointLayers([{"hash":"1ksv3g6"}])\n' +
'\n' +
' \n' +
' \n' +
' \n' +
' \n' +
' \n' +
'\n' +
'\n',
metadata: {
changefreq: '',
lastmod: '',
priority: '',
source: 'https://www.langchain.com/blog-detail/starting-a-career-in-design'
}
}
*/
4 changes: 4 additions & 0 deletions langchain/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,10 @@ document_loaders/web/s3.cjs
document_loaders/web/s3.js
document_loaders/web/s3.d.ts
document_loaders/web/s3.d.cts
document_loaders/web/sitemap.cjs
document_loaders/web/sitemap.js
document_loaders/web/sitemap.d.ts
document_loaders/web/sitemap.d.cts
document_loaders/web/sonix_audio.cjs
document_loaders/web/sonix_audio.js
document_loaders/web/sonix_audio.d.ts
Expand Down
2 changes: 2 additions & 0 deletions langchain/langchain.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ export const config = {
"document_loaders/web/pdf": "document_loaders/web/pdf",
"document_loaders/web/recursive_url": "document_loaders/web/recursive_url",
"document_loaders/web/s3": "document_loaders/web/s3",
"document_loaders/web/sitemap": "document_loaders/web/sitemap",
"document_loaders/web/sonix_audio": "document_loaders/web/sonix_audio",
"document_loaders/web/confluence": "document_loaders/web/confluence",
"document_loaders/web/searchapi": "document_loaders/web/searchapi",
Expand Down Expand Up @@ -640,6 +641,7 @@ export const config = {
"document_loaders/web/notionapi",
"document_loaders/web/recursive_url",
"document_loaders/web/s3",
"document_loaders/web/sitemap",
"document_loaders/web/sonix_audio",
"document_loaders/web/confluence",
"document_loaders/web/youtube",
Expand Down
13 changes: 13 additions & 0 deletions langchain/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,10 @@
"document_loaders/web/s3.js",
"document_loaders/web/s3.d.ts",
"document_loaders/web/s3.d.cts",
"document_loaders/web/sitemap.cjs",
"document_loaders/web/sitemap.js",
"document_loaders/web/sitemap.d.ts",
"document_loaders/web/sitemap.d.cts",
"document_loaders/web/sonix_audio.cjs",
"document_loaders/web/sonix_audio.js",
"document_loaders/web/sonix_audio.d.ts",
Expand Down Expand Up @@ -2868,6 +2872,15 @@
"import": "./document_loaders/web/s3.js",
"require": "./document_loaders/web/s3.cjs"
},
"./document_loaders/web/sitemap": {
"types": {
"import": "./document_loaders/web/sitemap.d.ts",
"require": "./document_loaders/web/sitemap.d.cts",
"default": "./document_loaders/web/sitemap.d.ts"
},
"import": "./document_loaders/web/sitemap.js",
"require": "./document_loaders/web/sitemap.cjs"
},
"./document_loaders/web/sonix_audio": {
"types": {
"import": "./document_loaders/web/sonix_audio.d.ts",
Expand Down
30 changes: 30 additions & 0 deletions langchain/src/document_loaders/tests/sitemap.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import { test } from "@jest/globals";
import { SitemapLoader } from "../web/sitemap.js";

// Smoke test: loading from the langchain.com sitemap should yield at least
// one document. NOTE(review): presumably issues real HTTP requests — confirm.
test("SitemapLoader", async () => {
  const sitemapLoader = new SitemapLoader("https://www.langchain.com/");

  const loadedDocs = await sitemapLoader.load();
  expect(loadedDocs.length).toBeGreaterThan(0);
});

// Verifies that `_checkUrlPatterns` returns `true` exactly for the URLs that
// match one of the configured `filterUrls` patterns.
test("checkUrlPatterns can properly identify unwanted links", async () => {
  // Each candidate URL paired with the result expected from `_checkUrlPatterns`.
  const cases: Array<[string, boolean]> = [
    ["https://js.langchain.com/docs/use_cases/agent_simulations/", true],
    [
      "https://js.langchain.com/docs/use_cases/agent_simulations/generative_agents",
      true,
    ],
    ["https://js.langchain.com/docs/integrations/platforms/google", false],
    ["https://js.langchain.com/docs/integrations/vectorstores/analyticdb", false],
    ["https://js.langchain.com/docs/expression_language/interface", true],
    ["https://js.langchain.com/docs/modules/data_connection/", false],
  ];

  // Matches anything under /docs/use_cases, or any URL ending in "interface".
  const unwantedPattern =
    /^(https:\/\/js\.langchain\.com\/docs\/use_cases)|.*interface$/;

  // The loader is configured with the pattern's source string, not the RegExp.
  const sitemapLoader = new SitemapLoader("https://www.langchain.com/", {
    filterUrls: [unwantedPattern.source],
  });

  const actual = cases.map(([url]) => sitemapLoader._checkUrlPatterns(url));
  const expected = cases.map(([, flagged]) => flagged);
  expect(actual).toEqual(expected);
});
33 changes: 29 additions & 4 deletions langchain/src/document_loaders/web/cheerio.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
import type { CheerioAPI, load as LoadT, SelectorType } from "cheerio";
import type {
CheerioAPI,
CheerioOptions,
load as LoadT,
SelectorType,
} from "cheerio";
import { Document } from "@langchain/core/documents";
import {
AsyncCaller,
Expand Down Expand Up @@ -62,21 +67,41 @@ export class CheerioWebBaseLoader
this.textDecoder = textDecoder;
}

/**
 * Scrapes every URL in `urls` by delegating each one to
 * `CheerioWebBaseLoader._scrape`, then waits for all of them to finish.
 * Results are returned in the same order as `urls`; the returned Promise
 * rejects if any single scrape rejects.
 * @param urls An array of URLs to fetch and load.
 * @param caller The AsyncCaller forwarded to `_scrape` for each URL.
 * @param timeout Per-request timeout forwarded to `_scrape`, or `undefined` for none.
 * @param textDecoder Optional TextDecoder forwarded to `_scrape`.
 * @param options Optional Cheerio options forwarded to `_scrape`.
 * @returns A Promise that resolves to an array of CheerioAPI instances.
 */
static async scrapeAll(
  urls: string[],
  caller: AsyncCaller,
  timeout: number | undefined,
  textDecoder?: TextDecoder,
  options?: CheerioOptions
): Promise<CheerioAPI[]> {
  // Start one scrape per URL before awaiting any of them.
  const pendingPages = urls.map((pageUrl) =>
    CheerioWebBaseLoader._scrape(pageUrl, caller, timeout, textDecoder, options)
  );
  return Promise.all(pendingPages);
}

/**
 * Fetches a single URL through the given AsyncCaller and parses the response
 * body with Cheerio's `load`.
 *
 * NOTE(review): this listing appears to be a flattened diff, so the previous
 * and updated versions of two lines sit side by side (the `textDecoder`
 * parameter and the final `return`). The second line of each pair, which
 * threads `options` through, looks like the current one — confirm against
 * the actual file.
 *
 * @param url The URL to fetch.
 * @param caller The AsyncCaller used to invoke `fetch`.
 * @param timeout Timeout in milliseconds passed to `AbortSignal.timeout`; when falsy, no abort signal is set.
 * @param textDecoder Optional decoder applied to the raw response bytes; when omitted, `response.text()` is used instead.
 * @param options Optional Cheerio options forwarded to `load`.
 * @returns A Promise that resolves to the CheerioAPI instance for the fetched HTML.
 */
static async _scrape(
url: string,
caller: AsyncCaller,
timeout: number | undefined,
textDecoder?: TextDecoder
textDecoder?: TextDecoder,
options?: CheerioOptions
): Promise<CheerioAPI> {
// `imports()` supplies Cheerio's `load` function.
const { load } = await CheerioWebBaseLoader.imports();
const response = await caller.call(fetch, url, {
signal: timeout ? AbortSignal.timeout(timeout) : undefined,
});

// Decode the bytes with the supplied decoder if there is one; otherwise fall
// back to the response's own text decoding.
const html =
textDecoder?.decode(await response.arrayBuffer()) ??
(await response.text());
return load(html);
return load(html, options);
}

/**
Expand Down
Loading

0 comments on commit 38c6e87

Please sign in to comment.