diff --git a/src/libs/langchain/loaders/index.ts b/src/libs/langchain/loaders/index.ts index d120afcab195..89b548be5771 100644 --- a/src/libs/langchain/loaders/index.ts +++ b/src/libs/langchain/loaders/index.ts @@ -8,6 +8,7 @@ import { LangChainLoaderType } from '@/libs/langchain/types'; import { CodeLoader } from './code'; import { DocxLoader } from './docx'; +import { LatexLoader } from './latex'; import { MarkdownLoader } from './markdown'; import { PdfLoader } from './pdf'; import { PPTXLoader } from './pptx'; @@ -38,6 +39,10 @@ export class ChunkingLoader { return await PPTXLoader(fileBlob); } + case 'latex': { + return await LatexLoader(txt); + } + case 'pdf': { return await PdfLoader(fileBlob); } @@ -78,6 +83,10 @@ export class ChunkingLoader { return 'pdf'; } + if (filename.endsWith('tex')) { + return 'latex'; + } + if (filename.endsWith('md') || filename.endsWith('mdx')) { return 'markdown'; } diff --git a/src/libs/langchain/loaders/latex/__tests__/__snapshots__/index.test.ts.snap b/src/libs/langchain/loaders/latex/__tests__/__snapshots__/index.test.ts.snap new file mode 100644 index 000000000000..9312fe1a6cfd --- /dev/null +++ b/src/libs/langchain/loaders/latex/__tests__/__snapshots__/index.test.ts.snap @@ -0,0 +1,205 @@ +// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html + +exports[`LatexLoader > should run 1`] = ` +[ + Document { + "id": undefined, + "metadata": { + "loc": { + "lines": { + "from": 1, + "to": 41, + }, + }, + }, + "pageContent": "\\documentclass{article} + + +\\usepackage{graphicx} % Required for inserting images +\\usepackage{amsmath} % Required for mathematical symbols +\\usepackage{hyperref} % For hyperlinks + + +\\title{Sample LaTeX Document} +\\author{Generated by ChatGPT} +\\date{\\today} + + +\\begin{document} + + +\\maketitle + + +\\tableofcontents + + +\\section{Introduction} +This is a sample LaTeX document that includes various common elements such as sections, lists, tables, figures, and mathematical equations. + + +\\section{Lists} +\\subsection{Itemized List} +\\begin{itemize} +\\item First item +\\item Second item +\\item Third item +\\end{itemize} + + +\\subsection{Enumerated List} +\\begin{enumerate} +\\item First item +\\item Second item +\\item Third item +\\end{enumerate}", + }, + Document { + "id": undefined, + "metadata": { + "loc": { + "lines": { + "from": 27, + "to": 61, + }, + }, + }, + "pageContent": "\\section{Lists} +\\subsection{Itemized List} +\\begin{itemize} +\\item First item +\\item Second item +\\item Third item +\\end{itemize} + + +\\subsection{Enumerated List} +\\begin{enumerate} +\\item First item +\\item Second item +\\item Third item +\\end{enumerate} + + +\\section{Mathematical Equations} +Here are some sample mathematical equations: + + +\\subsection{Inline Equation} +This is an inline equation: \\( E = mc^2 \\). + + +\\subsection{Displayed Equations} +\\begin{equation} +a^2 + b^2 = c^2 +\\end{equation} + + +\\begin{align} +x &= y + z \\\\ +y &= mx + b +\\end{align}", + }, + Document { + "id": undefined, + "metadata": { + "loc": { + "lines": { + "from": 44, + "to": 93, + }, + }, + }, + "pageContent": "\\section{Mathematical Equations} +Here are some sample mathematical equations: + + +\\subsection{Inline Equation} +This is an inline equation: \\( E = mc^2 \\). + + +\\subsection{Displayed Equations} +\\begin{equation} +a^2 + b^2 = c^2 +\\end{equation} + + +\\begin{align} +x &= y + z \\\\ +y &= mx + b +\\end{align} + + +\\section{Tables} +Here is a sample table: + + +\\begin{table}[h!] +\\centering +\\begin{tabular}{|c|c|c|} +\\hline +Header 1 & Header 2 & Header 3 \\\\ +\\hline +Data 1 & Data 2 & Data 3 \\\\ +Data 4 & Data 5 & Data 6 \\\\ +Data 7 & Data 8 & Data 9 \\\\ +\\hline +\\end{tabular} +\\caption{Sample Table} +\\label{table:1} +\\end{table} + + +\\section{Figures} +Here is a sample figure: + + +\\begin{figure}[h!] +\\centering +\\includegraphics[width=0.5\\textwidth]{example-image} +\\caption{Sample Figure} +\\label{fig:1} +\\end{figure}", + }, + Document { + "id": undefined, + "metadata": { + "loc": { + "lines": { + "from": 84, + "to": 112, + }, + }, + }, + "pageContent": "\\section{Figures} +Here is a sample figure: + + +\\begin{figure}[h!] +\\centering +\\includegraphics[width=0.5\\textwidth]{example-image} +\\caption{Sample Figure} +\\label{fig:1} +\\end{figure} + + +\\section{Sections and Subsections} +This is an example of a section with subsections. + + +\\subsection{Subsection 1} +Content of subsection 1. + + +\\subsection{Subsection 2} +Content of subsection 2. + + +\\section{References} +Here is a reference to the table \\ref{table:1} and the figure \\ref{fig:1}. + + +\\end{document}", + }, +] +`; diff --git a/src/libs/langchain/loaders/latex/__tests__/demo.tex b/src/libs/langchain/loaders/latex/__tests__/demo.tex new file mode 100644 index 000000000000..13dcaf0d7a4e --- /dev/null +++ b/src/libs/langchain/loaders/latex/__tests__/demo.tex @@ -0,0 +1,112 @@ +\documentclass{article} + + +\usepackage{graphicx} % Required for inserting images +\usepackage{amsmath} % Required for mathematical symbols +\usepackage{hyperref} % For hyperlinks + + +\title{Sample LaTeX Document} +\author{Generated by ChatGPT} +\date{\today} + + +\begin{document} + + +\maketitle + + +\tableofcontents + + +\section{Introduction} +This is a sample LaTeX document that includes various common elements such as sections, lists, tables, figures, and mathematical equations. + + +\section{Lists} +\subsection{Itemized List} +\begin{itemize} +\item First item +\item Second item +\item Third item +\end{itemize} + + +\subsection{Enumerated List} +\begin{enumerate} +\item First item +\item Second item +\item Third item +\end{enumerate} + + +\section{Mathematical Equations} +Here are some sample mathematical equations: + + +\subsection{Inline Equation} +This is an inline equation: \( E = mc^2 \). + + +\subsection{Displayed Equations} +\begin{equation} +a^2 + b^2 = c^2 +\end{equation} + + +\begin{align} +x &= y + z \\ +y &= mx + b +\end{align} + + +\section{Tables} +Here is a sample table: + + +\begin{table}[h!] +\centering +\begin{tabular}{|c|c|c|} +\hline +Header 1 & Header 2 & Header 3 \\ +\hline +Data 1 & Data 2 & Data 3 \\ +Data 4 & Data 5 & Data 6 \\ +Data 7 & Data 8 & Data 9 \\ +\hline +\end{tabular} +\caption{Sample Table} +\label{table:1} +\end{table} + + +\section{Figures} +Here is a sample figure: + + +\begin{figure}[h!] +\centering +\includegraphics[width=0.5\textwidth]{example-image} +\caption{Sample Figure} +\label{fig:1} +\end{figure} + + +\section{Sections and Subsections} +This is an example of a section with subsections. + + +\subsection{Subsection 1} +Content of subsection 1. + + +\subsection{Subsection 2} +Content of subsection 2. + + +\section{References} +Here is a reference to the table \ref{table:1} and the figure \ref{fig:1}. + + +\end{document} diff --git a/src/libs/langchain/loaders/latex/__tests__/index.test.ts b/src/libs/langchain/loaders/latex/__tests__/index.test.ts new file mode 100644 index 000000000000..facbd59635d2 --- /dev/null +++ b/src/libs/langchain/loaders/latex/__tests__/index.test.ts @@ -0,0 +1,16 @@ +// @vitest-environment node +import * as fs from 'node:fs'; +import { join } from 'node:path'; +import { expect } from 'vitest'; + +import { LatexLoader } from '../index'; + +describe('LatexLoader', () => { + it('should run', async () => { + const content = fs.readFileSync(join(__dirname, `./demo.tex`), 'utf-8'); + + const data = await LatexLoader(content); + + expect(data).toMatchSnapshot(); + }); +}); diff --git a/src/libs/langchain/loaders/latex/index.ts b/src/libs/langchain/loaders/latex/index.ts new file mode 100644 index 000000000000..a71a88d073ed --- /dev/null +++ b/src/libs/langchain/loaders/latex/index.ts @@ -0,0 +1,9 @@ +import { LatexTextSplitter } from 'langchain/text_splitter'; + +import { loaderConfig } from '../config'; + +export const LatexLoader = async (text: string) => { + const splitter = new LatexTextSplitter(loaderConfig); + + return await splitter.createDocuments([text]); +}; diff --git a/src/libs/langchain/types.ts b/src/libs/langchain/types.ts index 696300cf0313..399d569cd23c 100644 --- a/src/libs/langchain/types.ts +++ b/src/libs/langchain/types.ts @@ -1 +1 @@ -export type LangChainLoaderType = 'code' | 'ppt' | 'pdf' | 'markdown' | 'doc' | 'text'; +export type LangChainLoaderType = 'code' | 'ppt' | 'pdf' | 'markdown' | 'doc' | 'text' | 'latex';