Fixed tests that fail when run from an IDE (langchain4j#54)

Also added HTML document type.
AbdullahGheith · Jul 29, 2023 · fd83555 · fd83555
1 parent ba45d54
commit fd83555
Show file tree

Hide file tree

Showing 4 changed files with 58 additions and 36 deletions.
diff --git a/langchain4j/src/main/java/dev/langchain4j/data/document/DocumentLoaderUtils.java b/langchain4j/src/main/java/dev/langchain4j/data/document/DocumentLoaderUtils.java
@@ -5,7 +5,7 @@
 
 import java.io.InputStream;
 
-import static dev.langchain4j.data.document.DocumentType.TXT;
+import static dev.langchain4j.data.document.DocumentType.*;
 
 class DocumentLoaderUtils {
 
@@ -21,12 +21,18 @@ static Document load(DocumentSource source, DocumentParser parser) {
     }
 
     static DocumentType detectDocumentType(String pathToFile) {
-        if (pathToFile.endsWith("txt")) {
+        if (pathToFile.endsWith(".txt")) {
             return TXT;
         }
 
-        if (pathToFile.endsWith("pdf")) {
-            return DocumentType.PDF;
+        if (pathToFile.endsWith(".html")
+                || pathToFile.endsWith(".htm")
+                || pathToFile.endsWith(".xhtml")) {
+            return HTML;
+        }
+
+        if (pathToFile.endsWith(".pdf")) {
+            return PDF;
         }
 
         throw new UnsupportedDocumentTypeException(pathToFile);
@@ -35,7 +41,8 @@ static DocumentType detectDocumentType(String pathToFile) {
     static DocumentParser parserFor(DocumentType type) {
         switch (type) {
             case TXT:
-                return new TextDocumentParser();
+            case HTML:
+                return new TextDocumentParser(type);
             case PDF:
                 return new PdfDocumentParser();
             default:

diff --git a/langchain4j/src/main/java/dev/langchain4j/data/document/parser/TextDocumentParser.java b/langchain4j/src/main/java/dev/langchain4j/data/document/parser/TextDocumentParser.java
@@ -2,26 +2,28 @@
 
 import dev.langchain4j.data.document.Document;
 import dev.langchain4j.data.document.DocumentParser;
+import dev.langchain4j.data.document.DocumentType;
+import dev.langchain4j.data.document.Metadata;
 
 import java.io.ByteArrayOutputStream;
 import java.io.InputStream;
 import java.nio.charset.Charset;
 
+import static dev.langchain4j.internal.ValidationUtils.ensureNotNull;
 import static java.nio.charset.StandardCharsets.UTF_8;
 
 public class TextDocumentParser implements DocumentParser {
 
+    private final DocumentType documentType;
     private final Charset charset;
 
-    public TextDocumentParser() {
-        this(UTF_8);
+    public TextDocumentParser(DocumentType documentType) {
+        this(documentType, UTF_8);
     }
 
-    public TextDocumentParser(Charset charset) {
-        if (charset == null) {
-            throw new IllegalArgumentException("charset cannot be null");
-        }
-        this.charset = charset;
+    public TextDocumentParser(DocumentType documentType, Charset charset) {
+        this.documentType = ensureNotNull(documentType, "documentType");
+        this.charset = ensureNotNull(charset, "charset");
     }
 
     @Override
@@ -37,7 +39,7 @@ public Document parse(InputStream inputStream) {
 
             String text = new String(buffer.toByteArray(), charset);
 
-            return Document.from(text);
+            return Document.from(text, new Metadata().add(DOCUMENT_TYPE, documentType.toString()));
         } catch (Exception e) {
             throw new RuntimeException(e);
         }

diff --git a/langchain4j/src/test/java/dev/langchain4j/data/document/FileSystemDocumentLoaderTest.java b/langchain4j/src/test/java/dev/langchain4j/data/document/FileSystemDocumentLoaderTest.java
@@ -2,6 +2,9 @@
 
 import org.junit.jupiter.api.Test;
 
+import java.net.URISyntaxException;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.List;
 
@@ -12,35 +15,46 @@
 class FileSystemDocumentLoaderTest {
 
     @Test
-        // TODO This test fails when running it directly in IDE, but works when running in maven
     void should_load_text_document() {
 
-        Document document = loadDocument("src/test/resources/test-file-utf8.txt");
+        Document document = loadDocument(toPath("test-file-utf8.txt"));
 
-        assertThat(document.text()).isEqualTo("test\ncontent");
+        assertThat(document.text()).isEqualToIgnoringWhitespace("test content");
         Metadata metadata = document.metadata();
         assertThat(metadata.get("file_name")).isEqualTo("test-file-utf8.txt");
         assertThat(Paths.get(metadata.get("absolute_directory_path"))).isAbsolute();
     }
 
     @Test
-        // TODO This test fails when running it directly in IDE, but works when running in maven
     void should_load_pdf_document() {
 
-        Document document = loadDocument("src/test/resources/test-file.pdf");
+        Document document = loadDocument(toPath("test-file.pdf"));
 
-        assertThat(document.text()).isEqualToIgnoringWhitespace("test\ncontent");
+        assertThat(document.text()).isEqualToIgnoringWhitespace("test content");
         Metadata metadata = document.metadata();
         assertThat(metadata.get("file_name")).isEqualTo("test-file.pdf");
         assertThat(Paths.get(metadata.get("absolute_directory_path"))).isAbsolute();
     }
 
     @Test
-        // TODO This test fails when running it directly in IDE, but works when running in maven
     void should_load_documents_ignoring_unsupported_document_types() {
 
-        List<Document> documents = loadDocuments("src/test/resources");
+        String userDir = System.getProperty("user.dir");
+        Path resourceDirectory = Paths.get(userDir, "langchain4j/src/test/resources");
+        if (!Files.exists(resourceDirectory)) {
+            resourceDirectory = Paths.get(userDir, "src/test/resources");
+        }
+
+        List<Document> documents = loadDocuments(resourceDirectory);
 
         assertThat(documents).hasSize(3);
     }
+
+    private Path toPath(String fileName) {
+        try {
+            return Paths.get(getClass().getClassLoader().getResource(fileName).toURI());
+        } catch (URISyntaxException e) {
+            throw new RuntimeException(e);
+        }
+    }
 }
diff --git a/langchain4j/src/test/java/dev/langchain4j/data/document/parser/TextDocumentParserTest.java b/langchain4j/src/test/java/dev/langchain4j/data/document/parser/TextDocumentParserTest.java
@@ -1,38 +1,37 @@
 package dev.langchain4j.data.document.parser;
 
 import dev.langchain4j.data.document.Document;
-import dev.langchain4j.data.document.source.FileSystemSource;
 import org.junit.jupiter.api.Test;
 
-import java.io.IOException;
-import java.nio.file.Paths;
+import java.io.InputStream;
 
+import static dev.langchain4j.data.document.DocumentType.TXT;
 import static java.nio.charset.StandardCharsets.ISO_8859_1;
 import static org.assertj.core.api.Assertions.assertThat;
 
 class TextDocumentParserTest {
 
     @Test
-        // TODO This test fails when running it directly in IDE, but works when running in maven
-    void should_parse_with_utf8_charset_by_default() throws IOException {
+    void should_parse_with_utf8_charset_by_default() {
 
-        FileSystemSource source = FileSystemSource.from(Paths.get("src/test/resources/test-file-utf8.txt"));
-        TextDocumentParser parser = new TextDocumentParser();
+        TextDocumentParser parser = new TextDocumentParser(TXT);
+        InputStream inputStream = getClass().getClassLoader().getResourceAsStream("test-file-utf8.txt");
 
-        Document document = parser.parse(source.inputStream());
+        Document document = parser.parse(inputStream);
 
-        assertThat(document.text()).isEqualTo("test\ncontent");
+        assertThat(document.text()).isEqualToIgnoringWhitespace("test content");
+        assertThat(document.metadata().get("document_type")).isEqualTo("TXT");
     }
 
     @Test
-        // TODO This test fails when running it directly in IDE, but works when running in maven
-    void should_parse_with_specified_charset() throws IOException {
+    void should_parse_with_specified_charset() {
 
-        FileSystemSource source = FileSystemSource.from(Paths.get("src/test/resources/test-file-iso-8859-1.txt"));
-        TextDocumentParser parser = new TextDocumentParser(ISO_8859_1);
+        TextDocumentParser parser = new TextDocumentParser(TXT, ISO_8859_1);
+        InputStream inputStream = getClass().getClassLoader().getResourceAsStream("test-file-iso-8859-1.txt");
 
-        Document document = parser.parse(source.inputStream());
+        Document document = parser.parse(inputStream);
 
-        assertThat(document.text()).isEqualTo("test\ncontent");
+        assertThat(document.text()).isEqualToIgnoringWhitespace("test content");
+        assertThat(document.metadata().get("document_type")).isEqualTo("TXT");
     }
 }