Skip to content

Commit

Permalink
⚡️ perf: support latex chunking (lobehub#3592)
Browse files Browse the repository at this point in the history
* 💄 style: support latex chunking

* ✅ test: fix test
  • Loading branch information
arvinxx committed Aug 24, 2024
1 parent 09af3b3 commit 47f5ac7
Show file tree
Hide file tree
Showing 6 changed files with 352 additions and 1 deletion.
9 changes: 9 additions & 0 deletions src/libs/langchain/loaders/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { LangChainLoaderType } from '@/libs/langchain/types';

import { CodeLoader } from './code';
import { DocxLoader } from './docx';
import { LatexLoader } from './latex';
import { MarkdownLoader } from './markdown';
import { PdfLoader } from './pdf';
import { PPTXLoader } from './pptx';
Expand Down Expand Up @@ -38,6 +39,10 @@ export class ChunkingLoader {
return await PPTXLoader(fileBlob);
}

case 'latex': {
return await LatexLoader(txt);
}

case 'pdf': {
return await PdfLoader(fileBlob);
}
Expand Down Expand Up @@ -78,6 +83,10 @@ export class ChunkingLoader {
return 'pdf';
}

if (filename.endsWith('tex')) {
return 'latex';
}

if (filename.endsWith('md') || filename.endsWith('mdx')) {
return 'markdown';
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html

exports[`LatexLoader > should run 1`] = `
[
Document {
"id": undefined,
"metadata": {
"loc": {
"lines": {
"from": 1,
"to": 41,
},
},
},
"pageContent": "\\documentclass{article}
\\usepackage{graphicx} % Required for inserting images
\\usepackage{amsmath} % Required for mathematical symbols
\\usepackage{hyperref} % For hyperlinks
\\title{Sample LaTeX Document}
\\author{Generated by ChatGPT}
\\date{\\today}
\\begin{document}
\\maketitle
\\tableofcontents
\\section{Introduction}
This is a sample LaTeX document that includes various common elements such as sections, lists, tables, figures, and mathematical equations.
\\section{Lists}
\\subsection{Itemized List}
\\begin{itemize}
\\item First item
\\item Second item
\\item Third item
\\end{itemize}
\\subsection{Enumerated List}
\\begin{enumerate}
\\item First item
\\item Second item
\\item Third item
\\end{enumerate}",
},
Document {
"id": undefined,
"metadata": {
"loc": {
"lines": {
"from": 27,
"to": 61,
},
},
},
"pageContent": "\\section{Lists}
\\subsection{Itemized List}
\\begin{itemize}
\\item First item
\\item Second item
\\item Third item
\\end{itemize}
\\subsection{Enumerated List}
\\begin{enumerate}
\\item First item
\\item Second item
\\item Third item
\\end{enumerate}
\\section{Mathematical Equations}
Here are some sample mathematical equations:
\\subsection{Inline Equation}
This is an inline equation: \\( E = mc^2 \\).
\\subsection{Displayed Equations}
\\begin{equation}
a^2 + b^2 = c^2
\\end{equation}
\\begin{align}
x &= y + z \\\\
y &= mx + b
\\end{align}",
},
Document {
"id": undefined,
"metadata": {
"loc": {
"lines": {
"from": 44,
"to": 93,
},
},
},
"pageContent": "\\section{Mathematical Equations}
Here are some sample mathematical equations:
\\subsection{Inline Equation}
This is an inline equation: \\( E = mc^2 \\).
\\subsection{Displayed Equations}
\\begin{equation}
a^2 + b^2 = c^2
\\end{equation}
\\begin{align}
x &= y + z \\\\
y &= mx + b
\\end{align}
\\section{Tables}
Here is a sample table:
\\begin{table}[h!]
\\centering
\\begin{tabular}{|c|c|c|}
\\hline
Header 1 & Header 2 & Header 3 \\\\
\\hline
Data 1 & Data 2 & Data 3 \\\\
Data 4 & Data 5 & Data 6 \\\\
Data 7 & Data 8 & Data 9 \\\\
\\hline
\\end{tabular}
\\caption{Sample Table}
\\label{table:1}
\\end{table}
\\section{Figures}
Here is a sample figure:
\\begin{figure}[h!]
\\centering
\\includegraphics[width=0.5\\textwidth]{example-image}
\\caption{Sample Figure}
\\label{fig:1}
\\end{figure}",
},
Document {
"id": undefined,
"metadata": {
"loc": {
"lines": {
"from": 84,
"to": 112,
},
},
},
"pageContent": "\\section{Figures}
Here is a sample figure:
\\begin{figure}[h!]
\\centering
\\includegraphics[width=0.5\\textwidth]{example-image}
\\caption{Sample Figure}
\\label{fig:1}
\\end{figure}
\\section{Sections and Subsections}
This is an example of a section with subsections.
\\subsection{Subsection 1}
Content of subsection 1.
\\subsection{Subsection 2}
Content of subsection 2.
\\section{References}
Here is a reference to the table \\ref{table:1} and the figure \\ref{fig:1}.
\\end{document}",
},
]
`;
112 changes: 112 additions & 0 deletions src/libs/langchain/loaders/latex/__tests__/demo.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
\documentclass{article}


\usepackage{graphicx} % Required for inserting images
\usepackage{amsmath} % Required for mathematical symbols
\usepackage{hyperref} % For hyperlinks


\title{Sample LaTeX Document}
\author{Generated by ChatGPT}
\date{\today}


\begin{document}


\maketitle


\tableofcontents


\section{Introduction}
This is a sample LaTeX document that includes various common elements such as sections, lists, tables, figures, and mathematical equations.


\section{Lists}
\subsection{Itemized List}
\begin{itemize}
\item First item
\item Second item
\item Third item
\end{itemize}


\subsection{Enumerated List}
\begin{enumerate}
\item First item
\item Second item
\item Third item
\end{enumerate}


\section{Mathematical Equations}
Here are some sample mathematical equations:


\subsection{Inline Equation}
This is an inline equation: \( E = mc^2 \).


\subsection{Displayed Equations}
\begin{equation}
a^2 + b^2 = c^2
\end{equation}


\begin{align}
x &= y + z \\
y &= mx + b
\end{align}


\section{Tables}
Here is a sample table:


\begin{table}[h!]
\centering
\begin{tabular}{|c|c|c|}
\hline
Header 1 & Header 2 & Header 3 \\
\hline
Data 1 & Data 2 & Data 3 \\
Data 4 & Data 5 & Data 6 \\
Data 7 & Data 8 & Data 9 \\
\hline
\end{tabular}
\caption{Sample Table}
\label{table:1}
\end{table}


\section{Figures}
Here is a sample figure:


\begin{figure}[h!]
\centering
\includegraphics[width=0.5\textwidth]{example-image}
\caption{Sample Figure}
\label{fig:1}
\end{figure}


\section{Sections and Subsections}
This is an example of a section with subsections.


\subsection{Subsection 1}
Content of subsection 1.


\subsection{Subsection 2}
Content of subsection 2.


\section{References}
Here is a reference to the table \ref{table:1} and the figure \ref{fig:1}.


\end{document}
16 changes: 16 additions & 0 deletions src/libs/langchain/loaders/latex/__tests__/index.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// @vitest-environment node
import * as fs from 'node:fs';
import { join } from 'node:path';
import { expect } from 'vitest';

import { LatexLoader } from '../index';

// Snapshot test: chunk the demo.tex fixture that sits next to this file and
// compare the resulting documents against the stored snapshot.
describe('LatexLoader', () => {
  it('should run', async () => {
    // Resolve the fixture relative to this test file, then read it as UTF-8 text.
    const fixturePath = join(__dirname, `./demo.tex`);
    const latexSource = fs.readFileSync(fixturePath, 'utf-8');

    const documents = await LatexLoader(latexSource);

    expect(documents).toMatchSnapshot();
  });
});
9 changes: 9 additions & 0 deletions src/libs/langchain/loaders/latex/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import { LatexTextSplitter } from 'langchain/text_splitter';

import { loaderConfig } from '../config';

/**
 * Split raw LaTeX source into chunked documents.
 *
 * A `LatexTextSplitter` is built from the shared `loaderConfig` and the input
 * is passed to it as a single-element list.
 *
 * @param text - LaTeX source as a string.
 * @returns A promise resolving to whatever `createDocuments` produces for this one input.
 */
export const LatexLoader = async (text: string) =>
  new LatexTextSplitter(loaderConfig).createDocuments([text]);
2 changes: 1 addition & 1 deletion src/libs/langchain/types.ts
Original file line number Diff line number Diff line change
@@ -1 +1 @@
export type LangChainLoaderType = 'code' | 'ppt' | 'pdf' | 'markdown' | 'doc' | 'text';
export type LangChainLoaderType = 'code' | 'ppt' | 'pdf' | 'markdown' | 'doc' | 'text' | 'latex';

0 comments on commit 47f5ac7

Please sign in to comment.