Skip to content

Commit

Permalink
Fixed tests that fail when running from IDE (langchain4j#54)
Browse files Browse the repository at this point in the history
Also added HTML document type.
  • Loading branch information
langchain4j authored Jul 29, 2023
1 parent ba45d54 commit fd83555
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 36 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import java.io.InputStream;

import static dev.langchain4j.data.document.DocumentType.TXT;
import static dev.langchain4j.data.document.DocumentType.*;

class DocumentLoaderUtils {

Expand All @@ -21,12 +21,18 @@ static Document load(DocumentSource source, DocumentParser parser) {
}

static DocumentType detectDocumentType(String pathToFile) {
if (pathToFile.endsWith("txt")) {
if (pathToFile.endsWith(".txt")) {
return TXT;
}

if (pathToFile.endsWith("pdf")) {
return DocumentType.PDF;
if (pathToFile.endsWith(".html")
|| pathToFile.endsWith(".htm")
|| pathToFile.endsWith(".xhtml")) {
return HTML;
}

if (pathToFile.endsWith(".pdf")) {
return PDF;
}

throw new UnsupportedDocumentTypeException(pathToFile);
Expand All @@ -35,7 +41,8 @@ static DocumentType detectDocumentType(String pathToFile) {
static DocumentParser parserFor(DocumentType type) {
switch (type) {
case TXT:
return new TextDocumentParser();
case HTML:
return new TextDocumentParser(type);
case PDF:
return new PdfDocumentParser();
default:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,28 @@

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentParser;
import dev.langchain4j.data.document.DocumentType;
import dev.langchain4j.data.document.Metadata;

import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.nio.charset.Charset;

import static dev.langchain4j.internal.ValidationUtils.ensureNotNull;
import static java.nio.charset.StandardCharsets.UTF_8;

public class TextDocumentParser implements DocumentParser {

private final DocumentType documentType;
private final Charset charset;

public TextDocumentParser() {
this(UTF_8);
public TextDocumentParser(DocumentType documentType) {
this(documentType, UTF_8);
}

public TextDocumentParser(Charset charset) {
if (charset == null) {
throw new IllegalArgumentException("charset cannot be null");
}
this.charset = charset;
public TextDocumentParser(DocumentType documentType, Charset charset) {
this.documentType = ensureNotNull(documentType, "documentType");
this.charset = ensureNotNull(charset, "charset");
}

@Override
Expand All @@ -37,7 +39,7 @@ public Document parse(InputStream inputStream) {

String text = new String(buffer.toByteArray(), charset);

return Document.from(text);
return Document.from(text, new Metadata().add(DOCUMENT_TYPE, documentType.toString()));
} catch (Exception e) {
throw new RuntimeException(e);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

import org.junit.jupiter.api.Test;

import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;

Expand All @@ -12,35 +15,46 @@
class FileSystemDocumentLoaderTest {

@Test
// TODO This test fails when running it directly in IDE, but works when running in maven
void should_load_text_document() {

Document document = loadDocument("src/test/resources/test-file-utf8.txt");
Document document = loadDocument(toPath("test-file-utf8.txt"));

assertThat(document.text()).isEqualTo("test\ncontent");
assertThat(document.text()).isEqualToIgnoringWhitespace("test content");
Metadata metadata = document.metadata();
assertThat(metadata.get("file_name")).isEqualTo("test-file-utf8.txt");
assertThat(Paths.get(metadata.get("absolute_directory_path"))).isAbsolute();
}

@Test
// TODO This test fails when running it directly in IDE, but works when running in maven
void should_load_pdf_document() {

Document document = loadDocument("src/test/resources/test-file.pdf");
Document document = loadDocument(toPath("test-file.pdf"));

assertThat(document.text()).isEqualToIgnoringWhitespace("test\ncontent");
assertThat(document.text()).isEqualToIgnoringWhitespace("test content");
Metadata metadata = document.metadata();
assertThat(metadata.get("file_name")).isEqualTo("test-file.pdf");
assertThat(Paths.get(metadata.get("absolute_directory_path"))).isAbsolute();
}

@Test
// TODO This test fails when running it directly in IDE, but works when running in maven
void should_load_documents_ignoring_unsupported_document_types() {

List<Document> documents = loadDocuments("src/test/resources");
String userDir = System.getProperty("user.dir");
Path resourceDirectory = Paths.get(userDir, "langchain4j/src/test/resources");
if (!Files.exists(resourceDirectory)) {
resourceDirectory = Paths.get(userDir, "src/test/resources");
}

List<Document> documents = loadDocuments(resourceDirectory);

assertThat(documents).hasSize(3);
}

/**
 * Resolves a classpath resource to a {@link Path} using this test class's class loader.
 *
 * @param fileName name of the resource to look up on the classpath
 * @return the path built from the resource's URI
 * @throws IllegalArgumentException if no resource with that name is found
 * @throws RuntimeException if the resource URL cannot be converted to a URI
 */
private Path toPath(String fileName) {
    // getResource returns null for a missing resource; fail with a message that names
    // the file instead of a bare NullPointerException from the chained toURI() call.
    java.net.URL resource = getClass().getClassLoader().getResource(fileName);
    if (resource == null) {
        throw new IllegalArgumentException("Test resource not found on classpath: " + fileName);
    }
    try {
        return Paths.get(resource.toURI());
    } catch (URISyntaxException e) {
        throw new RuntimeException(e);
    }
}
}
Original file line number Diff line number Diff line change
@@ -1,38 +1,37 @@
package dev.langchain4j.data.document.parser;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.source.FileSystemSource;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.nio.file.Paths;
import java.io.InputStream;

import static dev.langchain4j.data.document.DocumentType.TXT;
import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static org.assertj.core.api.Assertions.assertThat;

class TextDocumentParserTest {

@Test
// TODO This test fails when running it directly in IDE, but works when running in maven
void should_parse_with_utf8_charset_by_default() throws IOException {
void should_parse_with_utf8_charset_by_default() {

FileSystemSource source = FileSystemSource.from(Paths.get("src/test/resources/test-file-utf8.txt"));
TextDocumentParser parser = new TextDocumentParser();
TextDocumentParser parser = new TextDocumentParser(TXT);
InputStream inputStream = getClass().getClassLoader().getResourceAsStream("test-file-utf8.txt");

Document document = parser.parse(source.inputStream());
Document document = parser.parse(inputStream);

assertThat(document.text()).isEqualTo("test\ncontent");
assertThat(document.text()).isEqualToIgnoringWhitespace("test content");
assertThat(document.metadata().get("document_type")).isEqualTo("TXT");
}

@Test
// TODO This test fails when running it directly in IDE, but works when running in maven
void should_parse_with_specified_charset() throws IOException {
void should_parse_with_specified_charset() {

FileSystemSource source = FileSystemSource.from(Paths.get("src/test/resources/test-file-iso-8859-1.txt"));
TextDocumentParser parser = new TextDocumentParser(ISO_8859_1);
TextDocumentParser parser = new TextDocumentParser(TXT, ISO_8859_1);
InputStream inputStream = getClass().getClassLoader().getResourceAsStream("test-file-iso-8859-1.txt");

Document document = parser.parse(source.inputStream());
Document document = parser.parse(inputStream);

assertThat(document.text()).isEqualTo("test\ncontent");
assertThat(document.text()).isEqualToIgnoringWhitespace("test content");
assertThat(document.metadata().get("document_type")).isEqualTo("TXT");
}
}

0 comments on commit fd83555

Please sign in to comment.