diff --git a/langchain4j-core/src/main/java/dev/langchain4j/data/document/Document.java b/langchain4j-core/src/main/java/dev/langchain4j/data/document/Document.java index 5864b55a6b1..276d8b4b58d 100644 --- a/langchain4j-core/src/main/java/dev/langchain4j/data/document/Document.java +++ b/langchain4j-core/src/main/java/dev/langchain4j/data/document/Document.java @@ -4,6 +4,11 @@ import java.util.Objects; +/** + * Represents an unstructured piece of text that usually corresponds to the content of a single file. + * This text could originate from various sources such as a text file, PDF, DOCX, or a web page (HTML). + * Each document may have associated metadata including its source, owner, creation date, etc. + */ public class Document { private final String text; diff --git a/langchain4j-core/src/main/java/dev/langchain4j/data/document/DocumentParser.java b/langchain4j-core/src/main/java/dev/langchain4j/data/document/DocumentParser.java index 3aed556a1fd..fe1ecbfa621 100644 --- a/langchain4j-core/src/main/java/dev/langchain4j/data/document/DocumentParser.java +++ b/langchain4j-core/src/main/java/dev/langchain4j/data/document/DocumentParser.java @@ -2,7 +2,20 @@ import java.io.InputStream; +/** + * Defines the interface for parsing an InputStream into a Document. + * Different document types require specialized parsing logic. + */ public interface DocumentParser { + String DOCUMENT_TYPE = "document_type"; + + /** + * Parses an InputStream into a Document. + * The specific implementation of this method will depend on the type of the document being parsed. + * + * @param inputStream The InputStream that contains the content of the document. + * @return The parsed Document. 
+ */ Document parse(InputStream inputStream); } diff --git a/langchain4j-core/src/main/java/dev/langchain4j/data/document/DocumentSource.java b/langchain4j-core/src/main/java/dev/langchain4j/data/document/DocumentSource.java index 7abe84a143a..6c9e91c6deb 100644 --- a/langchain4j-core/src/main/java/dev/langchain4j/data/document/DocumentSource.java +++ b/langchain4j-core/src/main/java/dev/langchain4j/data/document/DocumentSource.java @@ -3,9 +3,26 @@ import java.io.IOException; import java.io.InputStream; +/** + * Defines the interface for a Document source. + * Documents can be loaded from various sources such as the file system, HTTP, FTP, etc. + */ public interface DocumentSource { + /** + * Provides an InputStream to read the content of the document. + * This method can be implemented to read from various sources like a local file or a network connection. + * + * @return An InputStream from which the document content can be read. + * @throws IOException If an I/O error occurs while creating the InputStream. + */ InputStream inputStream() throws IOException; - Metadata sourceMetadata(); + /** + * Returns the metadata associated with the source of the document. + * This could include details such as the source location, date of creation, owner, etc. + * + * @return A Metadata object containing information associated with the source of the document. + */ + Metadata metadata(); } diff --git a/langchain4j-core/src/main/java/dev/langchain4j/data/document/DocumentSplitter.java b/langchain4j-core/src/main/java/dev/langchain4j/data/document/DocumentSplitter.java index 3f812bd6d13..1ed091b5316 100644 --- a/langchain4j-core/src/main/java/dev/langchain4j/data/document/DocumentSplitter.java +++ b/langchain4j-core/src/main/java/dev/langchain4j/data/document/DocumentSplitter.java @@ -6,10 +6,30 @@ import static java.util.stream.Collectors.toList; +/** + * Defines the interface for splitting a document into text segments. 
+ * This is necessary as LLMs have a limited context window, making it impossible to send the entire document at once. + * Therefore, the document should first be split into segments, and only the relevant segments should be sent to the LLM. + */ public interface DocumentSplitter { + /** + * Splits a single Document into a list of TextSegment objects. + * The metadata is typically copied from the document and enriched with segment-specific information, + * such as position in the document, page number, etc. + * + * @param document The Document to be split. + * @return A list of TextSegment objects derived from the input Document. + */ List split(Document document); + /** + * Splits a list of Documents into a list of TextSegment objects. + * This is a convenience method that calls the split method for each Document in the list. + * + * @param documents The list of Documents to be split. + * @return A list of TextSegment objects derived from the input Documents. + */ default List splitAll(List documents) { return documents.stream() .flatMap(document -> split(document).stream()) diff --git a/langchain4j-core/src/main/java/dev/langchain4j/data/document/Metadata.java b/langchain4j-core/src/main/java/dev/langchain4j/data/document/Metadata.java index 6320aa52a3e..8b4afe2f86b 100644 --- a/langchain4j-core/src/main/java/dev/langchain4j/data/document/Metadata.java +++ b/langchain4j-core/src/main/java/dev/langchain4j/data/document/Metadata.java @@ -4,6 +4,14 @@ import java.util.Map; import java.util.Objects; +/** + * Represents metadata of a Document or a TextSegment. + * The metadata is stored in a key-value map, where both keys and values are strings. + * For a Document, the metadata could include information such as the source, creation date, + * owner, or any other relevant details. 
+ * For a TextSegment, in addition to metadata copied from a document, it can also include segment-specific information, + * such as the page number, the position of the segment within the document, chapter, etc. + */ public class Metadata { private final Map metadata; @@ -23,8 +31,8 @@ public String get(String key) { return metadata.get(key); } - public Metadata add(String key, String value) { - this.metadata.put(key, value); + public Metadata add(String key, Object value) { + this.metadata.put(key, value.toString()); return this; } @@ -55,4 +63,12 @@ public String toString() { " metadata = " + metadata + " }"; } + + public static Metadata from(String key, Object value) { + return new Metadata().add(key, value); + } + + public static Metadata metadata(String key, Object value) { + return from(key, value); + } } diff --git a/langchain4j-core/src/main/java/dev/langchain4j/data/embedding/Embedding.java b/langchain4j-core/src/main/java/dev/langchain4j/data/embedding/Embedding.java index e5ac7cf5981..d66fb3f9b6a 100644 --- a/langchain4j-core/src/main/java/dev/langchain4j/data/embedding/Embedding.java +++ b/langchain4j-core/src/main/java/dev/langchain4j/data/embedding/Embedding.java @@ -4,6 +4,13 @@ import java.util.Arrays; import java.util.List; +/** + * Represents a dense vector embedding of a text. + * This class encapsulates a float array that captures the "meaning" or semantic information of the text. + * Texts with similar meanings will have their vectors located close to each other in the embedding space. + * The embeddings are typically created by embedding models. 
+ * @see dev.langchain4j.model.embedding.EmbeddingModel + */ public class Embedding { private final float[] vector; diff --git a/langchain4j-core/src/main/java/dev/langchain4j/data/message/AiMessage.java b/langchain4j-core/src/main/java/dev/langchain4j/data/message/AiMessage.java index 30640201786..012ce0ccc00 100644 --- a/langchain4j-core/src/main/java/dev/langchain4j/data/message/AiMessage.java +++ b/langchain4j-core/src/main/java/dev/langchain4j/data/message/AiMessage.java @@ -4,6 +4,11 @@ import java.util.Objects; +/** + * Represents a response message from an AI (LLM). + * The message can contain either a textual response or a request to execute a tool. + * In the case of tool execution, the response to this message should be a {@link ToolExecutionResultMessage}. + */ public class AiMessage extends ChatMessage { private final ToolExecutionRequest toolExecutionRequest; diff --git a/langchain4j-core/src/main/java/dev/langchain4j/data/message/SystemMessage.java b/langchain4j-core/src/main/java/dev/langchain4j/data/message/SystemMessage.java index f2dd69a7c14..61f4f50ae48 100644 --- a/langchain4j-core/src/main/java/dev/langchain4j/data/message/SystemMessage.java +++ b/langchain4j-core/src/main/java/dev/langchain4j/data/message/SystemMessage.java @@ -2,6 +2,10 @@ import java.util.Objects; +/** + * Represents a system message, typically defined by a developer. + * This type of message usually provides instructions regarding the AI's actions, such as its behavior or response style. 
+ */ public class SystemMessage extends ChatMessage { public SystemMessage(String text) { diff --git a/langchain4j-core/src/main/java/dev/langchain4j/data/message/ToolExecutionResultMessage.java b/langchain4j-core/src/main/java/dev/langchain4j/data/message/ToolExecutionResultMessage.java index e4b43dcbbc7..de4fe2fb05d 100644 --- a/langchain4j-core/src/main/java/dev/langchain4j/data/message/ToolExecutionResultMessage.java +++ b/langchain4j-core/src/main/java/dev/langchain4j/data/message/ToolExecutionResultMessage.java @@ -2,6 +2,9 @@ import java.util.Objects; +/** + * Represents the result of a tool execution. Tool execution requests come from a previous AiMessage. + */ public class ToolExecutionResultMessage extends ChatMessage { private final String toolName; diff --git a/langchain4j-core/src/main/java/dev/langchain4j/data/message/UserMessage.java b/langchain4j-core/src/main/java/dev/langchain4j/data/message/UserMessage.java index 13dadfd7565..03a720303d2 100644 --- a/langchain4j-core/src/main/java/dev/langchain4j/data/message/UserMessage.java +++ b/langchain4j-core/src/main/java/dev/langchain4j/data/message/UserMessage.java @@ -2,6 +2,9 @@ import java.util.Objects; +/** + * Represents a message from a user, typically an end user of the application. + */ public class UserMessage extends ChatMessage { private final String name; diff --git a/langchain4j-core/src/main/java/dev/langchain4j/memory/ChatMemory.java b/langchain4j-core/src/main/java/dev/langchain4j/memory/ChatMemory.java index c114e40f7c8..0713ef860c0 100644 --- a/langchain4j-core/src/main/java/dev/langchain4j/memory/ChatMemory.java +++ b/langchain4j-core/src/main/java/dev/langchain4j/memory/ChatMemory.java @@ -4,11 +4,31 @@ import java.util.List; +/** + * Represents the memory of a chat (chat history). + * As LLMs are inherently stateless, this interface is useful for tracking the conversation. + */ public interface ChatMemory { + /** + * Adds a message to the chat memory. 
+ * + * @param message The ChatMessage to add. + */ void add(ChatMessage message); + /** + * Retrieves messages from the chat memory. + * Depending on the implementation, it may not return all previously added messages, + * but rather a subset, a summary, or a combination thereof, etc. + * + * @return A list of ChatMessage objects representing the portion of the chat memory that is currently retained. + */ List messages(); + + /** + * Clears the chat memory. + */ void clear(); } diff --git a/langchain4j-core/src/main/java/dev/langchain4j/model/chat/ChatLanguageModel.java b/langchain4j-core/src/main/java/dev/langchain4j/model/chat/ChatLanguageModel.java index 4eeb2d0edae..2ed81379aa3 100644 --- a/langchain4j-core/src/main/java/dev/langchain4j/model/chat/ChatLanguageModel.java +++ b/langchain4j-core/src/main/java/dev/langchain4j/model/chat/ChatLanguageModel.java @@ -17,35 +17,66 @@ public interface ChatLanguageModel { /** - * Sends a message from a user to the LLM and returns response. + * Sends a message from a user to the LLM and returns a response. * - * @param userMessage User message as a String. Will be wrapped into {@link dev.langchain4j.data.message.UserMessage UserMessage} under the hood. - * @return {@link dev.langchain4j.data.message.AiMessage AiMessage} + * @param userMessage A user message as a String. Will be wrapped into {@link dev.langchain4j.data.message.UserMessage UserMessage} under the hood. + * @return Response from the LLM. */ default AiMessage sendUserMessage(String userMessage) { return sendUserMessage(UserMessage.from(userMessage)); } + /** + * Sends a message from a user to the LLM and returns a response. + * + * @param userMessage A user message. + * @return Response from the LLM. + */ default AiMessage sendUserMessage(UserMessage userMessage) { return sendMessages(userMessage); } /** - * Sends a structured prompt as a user message to the LLM and returns response. 
+ * Sends a structured prompt as a user message to the LLM and returns a response. * - * @param structuredPrompt object annotated with {@link dev.langchain4j.model.input.structured.StructuredPrompt @StructuredPrompt} - * @return {@link dev.langchain4j.data.message.AiMessage AiMessage} + * @param structuredPrompt A user message as an object annotated with {@link dev.langchain4j.model.input.structured.StructuredPrompt @StructuredPrompt}. Will be converted into {@link dev.langchain4j.data.message.UserMessage UserMessage} under the hood. + * @return Response from the LLM. */ default AiMessage sendUserMessage(Object structuredPrompt) { Prompt prompt = toPrompt(structuredPrompt); return sendUserMessage(prompt.toUserMessage()); } + /** + * Sends a sequence of messages to the LLM and returns a response. + * Typically, the sequence contains messages in the following order: + * System (optional) -> User -> AI -> User -> AI -> User ... + * + * @param messages An array of messages to be sent. + * @return Response from the LLM. + */ default AiMessage sendMessages(ChatMessage... messages) { return sendMessages(asList(messages)); } + /** + * Sends a sequence of messages to the LLM and returns a response. + * Typically, the sequence contains messages in the following order: + * System (optional) -> User -> AI -> User -> AI -> User ... + * + * @param messages A list of messages to be sent. + * @return Response from the LLM. + */ AiMessage sendMessages(List messages); + /** + * Sends a sequence of messages to the LLM and returns a response. + * Typically, the sequence contains messages in the following order: + * System (optional) -> User -> AI -> User -> AI -> User ... + * + * @param messages A list of messages to be sent. + * @param toolSpecifications A list of tools that the LLM is allowed to execute. + * @return Response from the LLM. AiMessage can contain either a textual response or a request to execute a tool. 
+ */ AiMessage sendMessages(List messages, List toolSpecifications); } diff --git a/langchain4j-core/src/main/java/dev/langchain4j/model/chat/StreamingChatLanguageModel.java b/langchain4j-core/src/main/java/dev/langchain4j/model/chat/StreamingChatLanguageModel.java index df1e47e60a1..d661e845d52 100644 --- a/langchain4j-core/src/main/java/dev/langchain4j/model/chat/StreamingChatLanguageModel.java +++ b/langchain4j-core/src/main/java/dev/langchain4j/model/chat/StreamingChatLanguageModel.java @@ -12,6 +12,9 @@ import static dev.langchain4j.model.input.structured.StructuredPromptProcessor.toPrompt; import static java.util.Collections.singletonList; +/** + * Represents an LLM that has a chat interface and can stream responses one token at a time. + */ public interface StreamingChatLanguageModel { default void sendUserMessage(String userMessage, StreamingResponseHandler handler) { diff --git a/langchain4j-core/src/main/java/dev/langchain4j/model/chat/TokenCountEstimator.java b/langchain4j-core/src/main/java/dev/langchain4j/model/chat/TokenCountEstimator.java index 5675a347e4c..b3aafeff64a 100644 --- a/langchain4j-core/src/main/java/dev/langchain4j/model/chat/TokenCountEstimator.java +++ b/langchain4j-core/src/main/java/dev/langchain4j/model/chat/TokenCountEstimator.java @@ -1,9 +1,8 @@ package dev.langchain4j.model.chat; -import dev.langchain4j.MightChangeInTheFuture; -import dev.langchain4j.data.segment.TextSegment; import dev.langchain4j.data.message.ChatMessage; import dev.langchain4j.data.message.UserMessage; +import dev.langchain4j.data.segment.TextSegment; import dev.langchain4j.model.input.Prompt; import java.util.List; @@ -12,6 +11,10 @@ import static dev.langchain4j.model.input.structured.StructuredPromptProcessor.toPrompt; import static java.util.Collections.singletonList; +/** + * Represents an interface for estimating the count of tokens in various text types such as a text, message, prompt, text segment, etc. 
+ * This can be useful when it's necessary to know in advance the cost of processing a specified text by the LLM. + */ public interface TokenCountEstimator { default int estimateTokenCount(String text) { @@ -22,19 +25,17 @@ default int estimateTokenCount(UserMessage userMessage) { return estimateTokenCount(singletonList(userMessage)); } - @MightChangeInTheFuture("not sure this method is useful/needed") default int estimateTokenCount(Prompt prompt) { return estimateTokenCount(prompt.text()); } - @MightChangeInTheFuture("not sure this method is useful/needed") default int estimateTokenCount(Object structuredPrompt) { return estimateTokenCount(toPrompt(structuredPrompt)); } - int estimateTokenCount(List messages); - default int estimateTokenCount(TextSegment textSegment) { return estimateTokenCount(textSegment.text()); } + + int estimateTokenCount(List messages); } diff --git a/langchain4j-core/src/main/java/dev/langchain4j/model/embedding/EmbeddingModel.java b/langchain4j-core/src/main/java/dev/langchain4j/model/embedding/EmbeddingModel.java index d42ccb980eb..d8231267f23 100644 --- a/langchain4j-core/src/main/java/dev/langchain4j/model/embedding/EmbeddingModel.java +++ b/langchain4j-core/src/main/java/dev/langchain4j/model/embedding/EmbeddingModel.java @@ -7,6 +7,9 @@ import static java.util.Collections.singletonList; +/** + * Represents an LLM that generates an embedding for a given text. 
+ */ public interface EmbeddingModel { default Embedding embed(String text) { diff --git a/langchain4j-core/src/main/java/dev/langchain4j/model/embedding/TokenCountEstimator.java b/langchain4j-core/src/main/java/dev/langchain4j/model/embedding/TokenCountEstimator.java index 7ae45caee51..8ae4f5aa283 100644 --- a/langchain4j-core/src/main/java/dev/langchain4j/model/embedding/TokenCountEstimator.java +++ b/langchain4j-core/src/main/java/dev/langchain4j/model/embedding/TokenCountEstimator.java @@ -4,6 +4,10 @@ import java.util.List; +/** + * Represents an interface for estimating the count of tokens in various texts, text segments, etc. + * This can be useful when it's necessary to know in advance the cost of processing a specified text by the LLM. + */ public interface TokenCountEstimator { int estimateTokenCount(String text); diff --git a/langchain4j-core/src/main/java/dev/langchain4j/model/input/Prompt.java b/langchain4j-core/src/main/java/dev/langchain4j/model/input/Prompt.java index 26fb893db45..900689dacec 100644 --- a/langchain4j-core/src/main/java/dev/langchain4j/model/input/Prompt.java +++ b/langchain4j-core/src/main/java/dev/langchain4j/model/input/Prompt.java @@ -10,6 +10,11 @@ import static dev.langchain4j.data.message.SystemMessage.systemMessage; import static dev.langchain4j.data.message.UserMessage.userMessage; +/** + * Represents a prompt (an input text sent to the LLM). + * A prompt usually contains instructions, contextual information, end-user input, etc. + * A Prompt is typically created by applying one or multiple values to a PromptTemplate. 
+ */ public class Prompt { private final String text; diff --git a/langchain4j-core/src/main/java/dev/langchain4j/model/input/PromptTemplate.java b/langchain4j-core/src/main/java/dev/langchain4j/model/input/PromptTemplate.java index 4413ab62ed3..49c4b77ed10 100644 --- a/langchain4j-core/src/main/java/dev/langchain4j/model/input/PromptTemplate.java +++ b/langchain4j-core/src/main/java/dev/langchain4j/model/input/PromptTemplate.java @@ -17,6 +17,14 @@ import static dev.langchain4j.internal.Utils.isNullOrBlank; import static java.util.Collections.singletonMap; +/** + * Represents a template of a prompt that can be reused multiple times. + * A template typically contains one or more variables (placeholders) defined as {{variable_name}} that are + * replaced with actual values to produce a Prompt. + * Special variables {{current_date}}, {{current_time}}, and {{current_date_time}} are automatically + * filled with LocalDate.now(), LocalTime.now(), and LocalDateTime.now() respectively. + * This class uses the Mustache templating engine under the hood, so all Mustache syntax and features are supported. + */ public class PromptTemplate { private static final MustacheFactory MUSTACHE_FACTORY = new DefaultMustacheFactory(); @@ -29,10 +37,22 @@ private PromptTemplate(Mustache mustache, Clock clock) { this.clock = clock; } + /** + * Applies a value to a template containing a single variable. The single variable should have the name {{it}}. + * + * @param value The value that will be injected in place of the {{it}} placeholder in the template. + * @return A Prompt object where the {{it}} placeholder in the template has been replaced by the provided value. + */ public Prompt apply(Object value) { return apply(singletonMap("it", value)); } + /** + * Applies multiple values to a template containing multiple variables. + * + * @param variables A map of variable names to values that will be injected in place of the corresponding placeholders in the template. 
+ * @return A Prompt object where the placeholders in the template have been replaced by the provided values. + */ public Prompt apply(Map variables) { StringWriter writer = new StringWriter(); mustache.execute(writer, injectDateTimeVariables(variables)); @@ -51,7 +71,7 @@ public static PromptTemplate from(String template) { return from(template, Clock.systemDefaultZone()); } - public static PromptTemplate from(String template, Clock clock) { + static PromptTemplate from(String template, Clock clock) { if (isNullOrBlank(template)) { throw illegalArgument("Prompt template cannot be null or empty"); } diff --git a/langchain4j-core/src/main/java/dev/langchain4j/model/language/LanguageModel.java b/langchain4j-core/src/main/java/dev/langchain4j/model/language/LanguageModel.java index ab837eeaa7c..83d19eb3090 100644 --- a/langchain4j-core/src/main/java/dev/langchain4j/model/language/LanguageModel.java +++ b/langchain4j-core/src/main/java/dev/langchain4j/model/language/LanguageModel.java @@ -5,8 +5,8 @@ import static dev.langchain4j.model.input.structured.StructuredPromptProcessor.toPrompt; /** - * Represents a LLM with a simple text interface. - * It is recommended to use the ChatLanguageModel instead, as it offers greater capabilities. + * Represents an LLM that has a simple text interface (as opposed to a chat interface). + * It is recommended to use the ChatLanguageModel instead, as it offers better capabilities. 
* More details: https://openai.com/blog/gpt-4-api-general-availability */ public interface LanguageModel { diff --git a/langchain4j-core/src/main/java/dev/langchain4j/model/language/StreamingLanguageModel.java b/langchain4j-core/src/main/java/dev/langchain4j/model/language/StreamingLanguageModel.java index 1dcb9172f23..38281729e2e 100644 --- a/langchain4j-core/src/main/java/dev/langchain4j/model/language/StreamingLanguageModel.java +++ b/langchain4j-core/src/main/java/dev/langchain4j/model/language/StreamingLanguageModel.java @@ -5,6 +5,11 @@ import static dev.langchain4j.model.input.structured.StructuredPromptProcessor.toPrompt; +/** + * Represents an LLM that has a simple text interface (as opposed to a chat interface) and can stream responses one token at a time. + * It is recommended to use the StreamingChatLanguageModel instead, as it offers better capabilities. + * More details: https://openai.com/blog/gpt-4-api-general-availability + */ public interface StreamingLanguageModel { void process(String text, StreamingResponseHandler handler); diff --git a/langchain4j-core/src/main/java/dev/langchain4j/model/language/TokenCountEstimator.java b/langchain4j-core/src/main/java/dev/langchain4j/model/language/TokenCountEstimator.java index f50b28457d8..ede70f20b76 100644 --- a/langchain4j-core/src/main/java/dev/langchain4j/model/language/TokenCountEstimator.java +++ b/langchain4j-core/src/main/java/dev/langchain4j/model/language/TokenCountEstimator.java @@ -5,6 +5,10 @@ import static dev.langchain4j.model.input.structured.StructuredPromptProcessor.toPrompt; +/** + * Represents an interface for estimating the count of tokens in various text types such as a text, prompt, text segment, etc. + * This can be useful when it's necessary to know in advance the cost of processing a specified text by the LLM. 
+ */ public interface TokenCountEstimator { int estimateTokenCount(String text); diff --git a/langchain4j/src/main/java/dev/langchain4j/data/document/DocumentLoaderUtils.java b/langchain4j/src/main/java/dev/langchain4j/data/document/DocumentLoaderUtils.java index c6f7e602c9e..1714098532e 100644 --- a/langchain4j/src/main/java/dev/langchain4j/data/document/DocumentLoaderUtils.java +++ b/langchain4j/src/main/java/dev/langchain4j/data/document/DocumentLoaderUtils.java @@ -12,7 +12,7 @@ class DocumentLoaderUtils { static Document load(DocumentSource source, DocumentParser parser) { try (InputStream inputStream = source.inputStream()) { Document document = parser.parse(inputStream); - Metadata sourceMetadata = source.sourceMetadata(); + Metadata sourceMetadata = source.metadata(); document.metadata().mergeFrom(sourceMetadata); return document; } catch (Exception e) { diff --git a/langchain4j/src/main/java/dev/langchain4j/data/document/parser/PdfDocumentParser.java b/langchain4j/src/main/java/dev/langchain4j/data/document/parser/PdfDocumentParser.java index 77b8893447c..4498282cab0 100644 --- a/langchain4j/src/main/java/dev/langchain4j/data/document/parser/PdfDocumentParser.java +++ b/langchain4j/src/main/java/dev/langchain4j/data/document/parser/PdfDocumentParser.java @@ -8,6 +8,9 @@ import java.io.IOException; import java.io.InputStream; +import static dev.langchain4j.data.document.DocumentType.PDF; +import static dev.langchain4j.data.document.Metadata.metadata; + public class PdfDocumentParser implements DocumentParser { @Override @@ -17,7 +20,7 @@ public Document parse(InputStream inputStream) { PDFTextStripper stripper = new PDFTextStripper(); String content = stripper.getText(pdfDocument); pdfDocument.close(); - return Document.from(content); + return Document.from(content, metadata(DOCUMENT_TYPE, PDF)); } catch (IOException e) { throw new RuntimeException(e); } diff --git a/langchain4j/src/main/java/dev/langchain4j/data/document/source/FileSystemSource.java 
b/langchain4j/src/main/java/dev/langchain4j/data/document/source/FileSystemSource.java index 72be773859c..5c0247096a5 100644 --- a/langchain4j/src/main/java/dev/langchain4j/data/document/source/FileSystemSource.java +++ b/langchain4j/src/main/java/dev/langchain4j/data/document/source/FileSystemSource.java @@ -28,7 +28,7 @@ public InputStream inputStream() throws IOException { } @Override - public Metadata sourceMetadata() { + public Metadata metadata() { return new Metadata() .add(FILE_NAME, path.getFileName().toString()) .add(ABSOLUTE_DIRECTORY_PATH, path.getParent().toAbsolutePath().toString()); diff --git a/langchain4j/src/main/java/dev/langchain4j/data/document/source/UrlSource.java b/langchain4j/src/main/java/dev/langchain4j/data/document/source/UrlSource.java index b8f29e1fe80..02b512d7a1b 100644 --- a/langchain4j/src/main/java/dev/langchain4j/data/document/source/UrlSource.java +++ b/langchain4j/src/main/java/dev/langchain4j/data/document/source/UrlSource.java @@ -27,8 +27,8 @@ public InputStream inputStream() throws IOException { } @Override - public Metadata sourceMetadata() { - return new Metadata().add(URL, url.toString()); + public Metadata metadata() { + return Metadata.from(URL, url.toString()); } public static UrlSource from(String url) {