Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

core[minor]: Add XML output parser #4258

Merged
merged 10 commits into from
Feb 5, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions docs/core_docs/docs/modules/model_io/output_parsers/types/xml.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# XML output parser

The `XMLOutputParser` takes language model output which contains XML and parses it into a JSON object.

The output parser currently does _not_ support streaming results.

## Usage

import CodeBlock from "@theme/CodeBlock";
import XMLExample from "@examples/prompts/xml_output_parser.ts";

import IntegrationInstallTooltip from "@mdx_components/integration_install_tooltip.mdx";

<IntegrationInstallTooltip></IntegrationInstallTooltip>

```bash npm2yarn
npm install @langchain/core
```

<CodeBlock language="typescript">{XMLExample}</CodeBlock>
64 changes: 64 additions & 0 deletions examples/src/prompts/xml_output_parser.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import { XMLOutputParser } from "@langchain/core/output_parsers";

// Sample XML document fed to the parser below: an XML declaration followed by
// a single <userProfile> root that mixes scalar elements, a repeated element
// (<role>) and nested elements (<preferences>/<notifications>).
const XML_EXAMPLE = `<?xml version="1.0" encoding="UTF-8"?>
<userProfile>
<userID>12345</userID>
<name>John Doe</name>
<email>john.doe@example.com</email>
<roles>
<role>Admin</role>
<role>User</role>
</roles>
<preferences>
<theme>Dark</theme>
<notifications>
<email>true</email>
<sms>false</sms>
</notifications>
</preferences>
</userProfile>`;

// Shape of the object we expect back from the parser, built up from the
// innermost element outwards so each nesting level has a name.
type NotificationSettings = {
  email: boolean;
  sms: boolean;
};

type UserPreferences = {
  theme: string;
  notifications: NotificationSettings;
};

type UserProfile = {
  userID: number;
  name: string;
  email: string;
  // Repeated <role> elements are collected into an array.
  roles: { role: Array<string> };
  preferences: UserPreferences;
};

type MySchema = {
  userProfile: UserProfile;
};

// The schema is supplied as a type argument so the parsed value is typed.
const xmlParser = new XMLOutputParser<MySchema>();

const parsedProfile = await xmlParser.invoke(XML_EXAMPLE);

// Pretty-print with two-space indentation.
const prettyPrinted = JSON.stringify(parsedProfile, null, 2);
console.log(prettyPrinted);
/*
  {
    "userProfile": {
      "userID": 12345,
      "name": "John Doe",
      "email": "john.doe@example.com",
      "roles": {
        "role": [
          "Admin",
          "User"
        ]
      },
      "preferences": {
        "theme": "Dark",
        "notifications": {
          "email": true,
          "sms": false
        }
      }
    }
  }
*/
1 change: 1 addition & 0 deletions langchain-core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
"ansi-styles": "^5.0.0",
"camelcase": "6",
"decamelize": "1.2.0",
"fast-xml-parser": "^4.3.4",
"js-tiktoken": "^1.0.8",
"langsmith": "~0.0.48",
"ml-distance": "^4.0.0",
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey there! I noticed that the recent PR added a new dependency "sax" and changed the version of "p-retry". This might impact the project's peer/dev/hard dependencies. I'm flagging this for your review. Keep up the great work!

Expand Down
1 change: 1 addition & 0 deletions langchain-core/src/output_parsers/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ export * from "./list.js";
export * from "./string.js";
export * from "./transform.js";
export * from "./json.js";
export * from "./xml.js";
2 changes: 1 addition & 1 deletion langchain-core/src/output_parsers/json.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ export function parseJsonMarkdown(s: string, parser = parsePartialJson) {
}
}

// Adapted from https://github.com/KillianLucas/open-interpreter/blob/main/interpreter/utils/parse_partial_json.py
// Adapted from https://github.com/KillianLucas/open-interpreter/blob/main/interpreter/core/llm/utils/parse_partial_json.py
// MIT License
export function parsePartialJson(s: string) {
// If the input is undefined, return null to indicate failure.
Expand Down
76 changes: 76 additions & 0 deletions langchain-core/src/output_parsers/tests/xml.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import { XMLOutputParser } from "../xml.js";

// Fixture: a complete XML document (declaration + <userProfile> root) with
// scalar, repeated (<role>) and nested elements.
const XML_EXAMPLE = `<?xml version="1.0" encoding="UTF-8"?>
<userProfile>
<userID>12345</userID>
<name>John Doe</name>
<email>john.doe@example.com</email>
<roles>
<role>Admin</role>
<role>User</role>
</roles>
<preferences>
<theme>Dark</theme>
<notifications>
<email>true</email>
<sms>false</sms>
</notifications>
</preferences>
</userProfile>`;

// Fixture: the same document wrapped in a ```xml ... ``` markdown code fence.
const BACKTICK_WRAPPED_XML = `\`\`\`xml\n${XML_EXAMPLE}\n\`\`\``;

// Type of the parsed fixture, declared one nesting level at a time.
type NotificationSettings = {
  email: boolean;
  sms: boolean;
};

type UserPreferences = {
  theme: string;
  notifications: NotificationSettings;
};

type UserProfile = {
  userID: number;
  name: string;
  email: string;
  // Repeated <role> elements are collected into an array.
  roles: { role: Array<string> };
  preferences: UserPreferences;
};

type MySchema = {
  userProfile: UserProfile;
};

// Object the parser should produce for the XML fixture, assembled from the
// innermost element outwards.
const expectedNotifications = { email: true, sms: false };

const expectedPreferences = {
  theme: "Dark",
  notifications: expectedNotifications,
};

const expectedRoles = { role: ["Admin", "User"] };

const expectedResult = {
  userProfile: {
    userID: 12345,
    name: "John Doe",
    email: "john.doe@example.com",
    roles: expectedRoles,
    preferences: expectedPreferences,
  },
};

// Plain XML input is parsed into the expected object.
test("Can parse XML", async () => {
  const xmlParser = new XMLOutputParser<MySchema>();

  await expect(xmlParser.invoke(XML_EXAMPLE)).resolves.toStrictEqual(
    expectedResult
  );
});

// The markdown code fence around the XML is ignored.
test("Can parse backtick wrapped XML", async () => {
  const xmlParser = new XMLOutputParser<MySchema>();

  await expect(
    xmlParser.invoke(BACKTICK_WRAPPED_XML)
  ).resolves.toStrictEqual(expectedResult);
});

// Configured tags appear comma-separated in the format instructions.
test("Can format instructions with passed tags.", async () => {
  const xmlParser = new XMLOutputParser<MySchema>({
    tags: ["tag1", "tag2", "tag3"],
  });

  const instructions = xmlParser.getFormatInstructions();

  expect(instructions).toContain("tag1, tag2, tag3");
});
78 changes: 78 additions & 0 deletions langchain-core/src/output_parsers/xml.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import { XMLParser } from "fast-xml-parser";
import { BaseOutputParser } from "./base.js";

/**
 * Prompt fragment telling the model to answer with well-formed XML.
 *
 * The text ends with a fenced block containing the literal `{tags}`
 * placeholder. `XMLOutputParser.getFormatInstructions()` substitutes the
 * configured tag names for that placeholder when tags were provided.
 *
 * The opening fence must be three escaped backticks: with only two followed
 * by a trailing backslash, the backslash acts as a line continuation inside
 * the template literal and the fence is never opened.
 */
export const XML_FORMAT_INSTRUCTIONS = `The output should be formatted as a XML file.
1. Output should conform to the tags below.
2. If tags are not given, make them on your own.
3. Remember to always open and close all the tags.

As an example, for the tags ["foo", "bar", "baz"]:
1. String "<foo>\n <bar>\n <baz></baz>\n </bar>\n</foo>" is a well-formatted instance of the schema.
2. String "<foo>\n <bar>\n </foo>" is a badly-formatted instance.
3. String "<foo>\n <tag>\n </tag>\n</foo>" is a badly-formatted instance.

Here are the output tags:
\`\`\`
{tags}
\`\`\``;

/**
 * Constructor options accepted by `XMLOutputParser`.
 */
export interface XMLOutputParserFields {
/**
 * Optional list of tag names that the model output should conform to.
 * The parser only uses them when building the format instructions for the
 * prompt; they play no part in parsing the model output.
 */
tags?: string[];
}

/**
 * Output parser that turns XML text produced by a language model into a
 * plain object of type `T`.
 *
 * Parsing is delegated to `parseXMLMarkdown`, so the XML may optionally be
 * wrapped in a triple-backtick markdown fence.
 *
 * @typeParam T - Expected shape of the parsed object. It is a compile-time
 *   hint only; this class does not validate the parsed value against it.
 */
export class XMLOutputParser<
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  T extends Record<string, any> = Record<string, any>
> extends BaseOutputParser<T> {
  /** Tag names advertised to the model in the format instructions. */
  tags?: string[];

  /**
   * @param fields - Optional settings; only `tags` is read.
   */
  constructor(fields?: XMLOutputParserFields) {
    super();
    this.tags = fields?.tags;
  }

  static lc_name() {
    return "XMLOutputParser";
  }

  lc_namespace = ["langchain_core", "output_parsers"];

  lc_serializable = true;

  /**
   * Parses the given model output as XML.
   *
   * @param text - Raw model output, optionally fenced with triple backticks.
   * @returns The parsed object.
   */
  async parse(text: string): Promise<T> {
    return parseXMLMarkdown<T>(text);
  }

  /**
   * Builds the prompt instructions describing the expected XML output.
   *
   * @returns `XML_FORMAT_INSTRUCTIONS` with `{tags}` replaced by the
   *   comma-separated tag list when at least one tag is configured;
   *   otherwise the template unchanged (the literal `{tags}` placeholder
   *   is left in place).
   */
  getFormatInstructions(): string {
    const { tags } = this;
    if (!tags || tags.length === 0) {
      return XML_FORMAT_INSTRUCTIONS;
    }

    const joinedTags = tags.join(", ");
    // Use a replacer function rather than a replacement string: in a string,
    // sequences such as "$&", "$1" or "$$" are treated as special patterns
    // and would corrupt tag names containing "$".
    return XML_FORMAT_INSTRUCTIONS.replace("{tags}", () => joinedTags);
  }
}

/**
 * Parses XML text into an object, accepting input that is optionally wrapped
 * in a triple-backtick markdown fence (with or without an `xml` language tag).
 *
 * @typeParam T - Expected shape of the parsed object; not validated at runtime.
 * @param s - Raw text containing the XML document.
 * @returns The object produced by `XMLParser.parse`, with the `"?xml"` entry
 *   for the XML declaration removed when present.
 */
export function parseXMLMarkdown<
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  T extends Record<string, any> = Record<string, any>
>(s: string): T {
  const parser = new XMLParser();
  const trimmed = s.trim();

  // Look for a fenced block. The capture is greedy and the `s` flag lets `.`
  // match newlines, so it spans from the first fence to the last one.
  const fenced = /```(xml)?(.*)```/s.exec(trimmed);

  // If a fence was found, parse only the content inside it; otherwise treat
  // the whole trimmed input as XML.
  const xmlSource = fenced ? fenced[2] : trimmed;
  const parsedResult: T = parser.parse(xmlSource);

  // Drop the entry for the `<?xml ...?>` declaration so only document
  // content is returned.
  if (parsedResult && "?xml" in parsedResult) {
    delete parsedResult["?xml"];
  }
  return parsedResult;
}
12 changes: 12 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -9127,6 +9127,7 @@ __metadata:
eslint-plugin-jest: ^27.6.0
eslint-plugin-no-instanceof: ^1.0.1
eslint-plugin-prettier: ^4.2.1
fast-xml-parser: ^4.3.4
jest: ^29.5.0
jest-environment-node: ^29.6.4
js-tiktoken: ^1.0.8
Expand Down Expand Up @@ -21101,6 +21102,17 @@ __metadata:
languageName: node
linkType: hard

"fast-xml-parser@npm:^4.3.4":
version: 4.3.4
resolution: "fast-xml-parser@npm:4.3.4"
dependencies:
strnum: ^1.0.5
bin:
fxparser: src/cli/cli.js
checksum: ab88177343f6d3d971d53462db3011003a83eb8a8db704840127ddaaf27105ea90cdf7903a0f9b2e1279ccc4adfca8dfc0277b33bae6262406f10c16bd60ccf9
languageName: node
linkType: hard

"fastq@npm:^1.6.0":
version: 1.15.0
resolution: "fastq@npm:1.15.0"
Expand Down
Loading