Add the arguments of trafilatura's `extract` function to `load_data` (#872)

* Add the arguments of trafilatura's `extract` function to `load_data`. Added args: - include_comments (bool, optional): Include comments in the output. Defaults to True. - output_format (str, optional): Output format. Defaults to 'txt'. - include_tables (bool, optional): Include tables in the output. Defaults to True. - include_images (bool, optional): Include images in the output. Defaults to False. - include_formatting (bool, optional): Include formatting in the output. Defaults to False. - include_links (bool, optional): Include links in the output. Defaults to False. * Reformatting base.py
run-llama · Jan 19, 2024 · 0db48a5 · 0db48a5
1 parent f6494ae
commit 0db48a5
Showing 1 changed file with 25 additions and 2 deletions.
diff --git a/llama_hub/web/trafilatura_web/base.py b/llama_hub/web/trafilatura_web/base.py
@@ -20,11 +20,26 @@ def __init__(self) -> None:
                 "Please `pip install trafilatura` to use this Reader"
             )
 
-    def load_data(self, urls: List[str]) -> List[Document]:
+    def load_data(
+        self,
+        urls: List[str],
+        include_comments=True,
+        output_format="txt",
+        include_tables=True,
+        include_images=False,
+        include_formatting=False,
+        include_links=False,
+    ) -> List[Document]:
         """Load data from the urls.
 
         Args:
             urls (List[str]): List of URLs to scrape.
+            include_comments (bool, optional): Include comments in the output. Defaults to True.
+            output_format (str, optional): Output format. Defaults to 'txt'.
+            include_tables (bool, optional): Include tables in the output. Defaults to True.
+            include_images (bool, optional): Include images in the output. Defaults to False.
+            include_formatting (bool, optional): Include formatting in the output. Defaults to False.
+            include_links (bool, optional): Include links in the output. Defaults to False.
 
         Returns:
             List[Document]: List of documents.
@@ -37,7 +52,15 @@ def load_data(self, urls: List[str]) -> List[Document]:
         documents = []
         for url in urls:
             downloaded = trafilatura.fetch_url(url)
-            response = trafilatura.extract(downloaded)
+            response = trafilatura.extract(
+                downloaded,
+                include_comments=include_comments,
+                output_format=output_format,
+                include_tables=include_tables,
+                include_images=include_images,
+                include_formatting=include_formatting,
+                include_links=include_links,
+            )
             documents.append(Document(text=response))
 
         return documents