Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/service/document/parse/html.js
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,19 @@ function extractSectionsAndContent(html) {
return extractedSections;
}

/**
 * Parse raw HTML markup into the section list produced by
 * extractSectionsAndContent.
 *
 * @param {string} htmlContent - Raw HTML markup to parse.
 * @param {string} [documentPath] - Optional path of the source document. When
 *   truthy, the sections are handed to preserveDocumentContext together with
 *   this path and that helper's result is returned instead.
 * @returns {Promise<*>} The extracted sections (or the result of
 *   preserveDocumentContext), or an empty array if parsing throws.
 */
async function parseHtml(htmlContent, documentPath) {
  try {
    // Per the original author's note, citations and hyperlinks need no separate
    // stripping pass here because extractSectionsAndContent deals with them.
    const sections = extractSectionsAndContent(htmlContent);

    if (!documentPath) {
      return sections;
    }

    // NOTE(review): preserveDocumentContext is not defined or imported in the
    // visible part of this module — confirm where it lives and import it.
    // `typeof` is safe on an undeclared identifier, so when the helper is
    // missing we keep the parsed sections instead of letting a ReferenceError
    // fall into the catch below and discard them.
    if (typeof preserveDocumentContext !== "function") {
      console.warn(
        "preserveDocumentContext is not available; returning sections without document context"
      );
      return sections;
    }

    return preserveDocumentContext(sections, documentPath);
  } catch (err) {
    // Best-effort parsing: report the failure and fall back to "no sections".
    console.error("Error parsing HTML content:", err);
    return [];
  }
}

// Public API of this module: the section extractor and the async parseHtml
// wrapper defined above.
module.exports = {
extractSectionsAndContent,
parseHtml,
};
4 changes: 3 additions & 1 deletion src/service/document/parse/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@ const { parseTxt } = require("./txt");
const { parseMd } = require("./md");
const { parseDocx } = require("./docx");
const { parseOdt } = require("./odt");
const { parseHtml } = require("./html");

module.exports = {
parseMd,
parseOdt,
parsePdf,
parseTxt,
parseDocx
parseDocx,
parseHtml
};
4 changes: 2 additions & 2 deletions src/service/document/parse/odt.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ async function parseOdt(odtFilePath) {
return [];
}

html = removeCitations(html);
html = removeHyperlinks(html);
html = removeCitations(html.value); // Ensure .value is used correctly
html = removeHyperlinks(html.value);

return extractSectionsAndContent(html);
} catch (err) {
Expand Down
10 changes: 9 additions & 1 deletion src/service/document/reader.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
const path = require("path");
const fs = require("fs").promises;
const { parsePdf, parseMd, parseOdt, parseTxt, parseDocx } = require("./parse");
const { parsePdf, parseMd, parseOdt, parseTxt, parseDocx, parseHtml } = require("./parse");

async function loadFile(filePath) {
const fileExtension = path.extname(filePath).toLowerCase();
Expand All @@ -25,6 +25,14 @@ async function loadFile(filePath) {
};
case ".pdf":
return await parsePdf(filePath);
case ".html":
case ".xhtml":
case ".htm":
let htmlContent = await fs.readFile(filePath, "utf-8");
return {
fileName: path.basename(filePath),
data: await parseHtml(htmlContent),
};
default:
// just try to parse it as a text file
let rawText = await fs.readFile(filePath, "utf-8");
Expand Down