Merge branch 'emptycrown:main' into main

run-llama · Sep 5, 2023 · 0a14de5 · 0a14de5
2 parents b3ef924 + 7e053f0
commit 0a14de5
Show file tree

Hide file tree

Showing 18 changed files with 433 additions and 4 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,21 @@
+
+## [v0.0.26] - 2023-08-31
+
+(includes v0.0.25)
+
+### New Feature Releases
+- Add Linear loader (#490)
+- Add PDF Table Reader (#476)
+- Bagel loader Added (#479)
+
+### Smaller Features + Bug Fixes
+- Database arg fix in Firestore client (#483)
+- Some update to prevent errors when transforming data in wordlift loader (#489)
+- UTF-8 encode and decode for gmail (#491)
+- iterate json data to Document object in unstructured loader (#485)
+- add custom user agent for metaphor llama index initialization (#480)
+- Fix Syntax in Docs (#478)
+
 ## [v0.0.24] - 2023-08-20
 
 ### New Feature Release

diff --git a/llama_hub/firestore/base.py b/llama_hub/firestore/base.py
@@ -37,7 +37,7 @@ def __init__(
             raise ImportError(IMPORT_ERROR_MSG)
 
         self.db = firestore.Client(project=project_id,
-                                   database_id=database_id,
+                                   database=database_id,
                                    client_info=CLIENT_INFO)
 
     def load_data(self, collection: str) -> List[Document]:

diff --git a/llama_hub/hangeul/README.md b/llama_hub/hangeul/README.md
@@ -0,0 +1,16 @@
+# HWP Loader
+
+This loader reads the HWP file, which is the format of many official documents in South Korea.
+
+## Usage
+
+To use this loader, pass in the path to an HWP file. Both compressed and uncompressed files are supported.
+
+```python
+from llama_hub.hangeul.base import HWPReader
+from pathlib import Path
+
+hwp_path = Path('/path/to/hwp')
+reader = HWPReader()
+documents = reader.load_data(file=hwp_path)
+```
diff --git a/llama_hub/hangeul/__init__.py b/llama_hub/hangeul/__init__.py
@@ -0,0 +1 @@
+"""Init file."""
diff --git a/llama_hub/hangeul/base.py b/llama_hub/hangeul/base.py
@@ -0,0 +1,112 @@
+import olefile
+import zlib
+import struct
+
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from llama_index.readers.base import BaseReader
+from llama_index.readers.schema.base import Document
+
+class HWPReader(BaseReader):
+    """Reader for Hangul Word Processor (HWP) documents.
+
+    The file is opened as an OLE compound file. Body text is read from the
+    ``BodyText/Section<N>`` streams, each of which is a sequence of binary
+    records that may be stored as a raw deflate stream.
+
+    Args:
+        *args: Forwarded unchanged to ``BaseReader``.
+        **kwargs: Forwarded unchanged to ``BaseReader``.
+    """
+
+    def __init__(
+            self,
+            *args: Any,
+            **kwargs: Any
+    ) -> None:
+        super().__init__(*args, **kwargs)
+        self.FILE_HEADER_SECTION = "FileHeader"
+        self.HWP_SUMMARY_SECTION = "\x05HwpSummaryInformation"
+        self.SECTION_NAME_LENGTH = len("Section")
+        self.BODYTEXT_SECTION = "BodyText"
+        # Record tag ids whose payload is decoded as text. 67 is presumably
+        # HWPTAG_PARA_TEXT (HWPTAG_BEGIN 0x10 + 51) -- confirm against the
+        # HWP 5.0 specification.
+        self.HWP_TEXT_TAGS = [67]
+        # Text of the most recently loaded file; empty until load_data() runs,
+        # so get_text() is safe to call on a fresh reader.
+        self.text = ""
+
+    def load_data(
+            self,
+            file: Path,
+    ) -> Document:
+        """Extract the body text of an HWP file into a single document.
+
+        Args:
+            file (Path): Path to the HWP file.
+
+        Returns:
+            Document: One document holding the text of every body section.
+
+        Raises:
+            Exception: If the file lacks the ``FileHeader`` or
+                ``HwpSummaryInformation`` stream.
+        """
+        load_file = olefile.OleFileIO(file)
+        try:
+            file_dir = load_file.listdir()
+
+            if not self.is_valid(file_dir):
+                raise Exception("Not Valid HwpFile")
+
+            result_text = self._get_text(load_file, file_dir)
+        finally:
+            # Release the underlying file handle even when parsing fails.
+            load_file.close()
+
+        return self._text_to_document(text=result_text)
+
+    def is_valid(self, dirs):
+        """Return True when ``dirs`` lists both streams required of an HWP file.
+
+        Args:
+            dirs: Directory listing of the OLE file, as a list of path
+                component lists.
+        """
+        return (
+            [self.FILE_HEADER_SECTION] in dirs
+            and [self.HWP_SUMMARY_SECTION] in dirs
+        )
+
+    def get_body_sections(self, dirs):
+        """Return the body section stream paths ordered by section number.
+
+        Args:
+            dirs: Directory listing of the OLE file, as a list of path
+                component lists.
+
+        Returns:
+            list: Paths of the form ``"BodyText/Section<N>"``, ascending in N.
+        """
+        section_numbers = sorted(
+            int(entry[1][self.SECTION_NAME_LENGTH:])
+            for entry in dirs
+            # Skip entries without a child component (e.g. the storage itself).
+            if len(entry) > 1 and entry[0] == self.BODYTEXT_SECTION
+        )
+        return ["BodyText/Section" + str(number) for number in section_numbers]
+
+    def _text_to_document(self, text: str, extra_info: Optional[Dict] = None) -> Document:
+        """Wrap ``text`` in a ``Document``, defaulting ``extra_info`` to ``{}``."""
+        return Document(
+            text=text,
+            extra_info=extra_info or {}
+        )
+
+    def get_text(self):
+        """Return the text extracted by the most recent ``load_data`` call."""
+        return self.text
+
+    def _get_text(self, load_file, file_dir):
+        """Extract the full text of the file and cache it on ``self.text``.
+
+        Args:
+            load_file: Open OLE file object.
+            file_dir: Directory listing of ``load_file``.
+
+        Returns:
+            str: Text of every body section, each followed by a newline.
+        """
+        sections = self.get_body_sections(file_dir)
+        self.text = "".join(
+            self.get_text_from_section(load_file, section) + "\n"
+            for section in sections
+        )
+        return self.text
+
+    def is_compressed(self, load_file):
+        """Return True when the file header flags the body streams as compressed.
+
+        Args:
+            load_file: Open OLE file object.
+        """
+        header_data = load_file.openstream(self.FILE_HEADER_SECTION).read()
+        # Bit 0 of byte 36 is the compression flag (per the HWP 5.0 layout,
+        # the property flags presumably follow a 32-byte signature and a
+        # 4-byte version).
+        return (header_data[36] & 1) == 1
+
+    def get_text_from_section(self, load_file, section):
+        """Decode every text record found in one body section stream.
+
+        Args:
+            load_file: Open OLE file object.
+            section: Stream path such as ``"BodyText/Section0"``.
+
+        Returns:
+            str: The decoded text records, each followed by a newline.
+        """
+        data = load_file.openstream(section).read()
+        if self.is_compressed(load_file):
+            # Negative wbits: raw deflate stream without a zlib header.
+            data = zlib.decompress(data, -15)
+        size = len(data)
+
+        pieces = []
+        i = 0
+        # A record header is 4 bytes; stop on a trailing partial header
+        # instead of letting struct raise.
+        while i + 4 <= size:
+            header = struct.unpack_from("<I", data, i)[0]
+            rec_type = header & 0x3ff
+            rec_len = (header >> 20) & 0xfff
+            i += 4
+
+            # A size field of 0xfff means the real length is stored in the
+            # DWORD that follows the header (records of 4095 bytes or more).
+            if rec_len == 0xfff:
+                if i + 4 > size:
+                    break
+                rec_len = struct.unpack_from("<I", data, i)[0]
+                i += 4
+
+            if rec_type in self.HWP_TEXT_TAGS:
+                # Text is little-endian UTF-16 without a BOM; be explicit so
+                # the decoder never sniffs a byte order from the payload.
+                pieces.append(data[i:i + rec_len].decode("utf-16-le"))
+
+            i += rec_len
+
+        return "".join(piece + "\n" for piece in pieces)
diff --git a/llama_hub/hangeul/requirements.txt b/llama_hub/hangeul/requirements.txt
@@ -0,0 +1 @@
+olefile
diff --git a/llama_hub/hwp/README.md b/llama_hub/hwp/README.md
@@ -0,0 +1,16 @@
+# HWP Loader
+
+This loader reads the HWP file, which is the format of many official documents in South Korea.
+
+## Usage
+
+To use this loader, pass in the path to an HWP file. Both compressed and uncompressed files are supported.
+
+```python
+from llama_hub.hwp.base import HWPReader
+from pathlib import Path
+
+hwp_path = Path('/path/to/hwp')
+reader = HWPReader()
+documents = reader.load_data(file=hwp_path)
+```
diff --git a/llama_hub/hwp/__init__.py b/llama_hub/hwp/__init__.py
@@ -0,0 +1 @@
+"""Init file."""
diff --git a/llama_hub/hwp/base.py b/llama_hub/hwp/base.py
@@ -0,0 +1,112 @@
+import olefile
+import zlib
+import struct
+
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from llama_index.readers.base import BaseReader
+from llama_index.readers.schema.base import Document
+
+class HWPReader(BaseReader):
+    """Reader for Hangul Word Processor (HWP) documents.
+
+    The file is opened as an OLE compound file. Body text is read from the
+    ``BodyText/Section<N>`` streams, each of which is a sequence of binary
+    records that may be stored as a raw deflate stream.
+
+    Args:
+        *args: Forwarded unchanged to ``BaseReader``.
+        **kwargs: Forwarded unchanged to ``BaseReader``.
+    """
+
+    def __init__(
+            self,
+            *args: Any,
+            **kwargs: Any
+    ) -> None:
+        super().__init__(*args, **kwargs)
+        self.FILE_HEADER_SECTION = "FileHeader"
+        self.HWP_SUMMARY_SECTION = "\x05HwpSummaryInformation"
+        self.SECTION_NAME_LENGTH = len("Section")
+        self.BODYTEXT_SECTION = "BodyText"
+        # Record tag ids whose payload is decoded as text. 67 is presumably
+        # HWPTAG_PARA_TEXT (HWPTAG_BEGIN 0x10 + 51) -- confirm against the
+        # HWP 5.0 specification.
+        self.HWP_TEXT_TAGS = [67]
+        # Text of the most recently loaded file; empty until load_data() runs,
+        # so get_text() is safe to call on a fresh reader.
+        self.text = ""
+
+    def load_data(
+            self,
+            file: Path,
+    ) -> Document:
+        """Extract the body text of an HWP file into a single document.
+
+        Args:
+            file (Path): Path to the HWP file.
+
+        Returns:
+            Document: One document holding the text of every body section.
+
+        Raises:
+            Exception: If the file lacks the ``FileHeader`` or
+                ``HwpSummaryInformation`` stream.
+        """
+        load_file = olefile.OleFileIO(file)
+        try:
+            file_dir = load_file.listdir()
+
+            if not self.is_valid(file_dir):
+                raise Exception("Not Valid HwpFile")
+
+            result_text = self._get_text(load_file, file_dir)
+        finally:
+            # Release the underlying file handle even when parsing fails.
+            load_file.close()
+
+        return self._text_to_document(text=result_text)
+
+    def is_valid(self, dirs):
+        """Return True when ``dirs`` lists both streams required of an HWP file.
+
+        Args:
+            dirs: Directory listing of the OLE file, as a list of path
+                component lists.
+        """
+        return (
+            [self.FILE_HEADER_SECTION] in dirs
+            and [self.HWP_SUMMARY_SECTION] in dirs
+        )
+
+    def get_body_sections(self, dirs):
+        """Return the body section stream paths ordered by section number.
+
+        Args:
+            dirs: Directory listing of the OLE file, as a list of path
+                component lists.
+
+        Returns:
+            list: Paths of the form ``"BodyText/Section<N>"``, ascending in N.
+        """
+        section_numbers = sorted(
+            int(entry[1][self.SECTION_NAME_LENGTH:])
+            for entry in dirs
+            # Skip entries without a child component (e.g. the storage itself).
+            if len(entry) > 1 and entry[0] == self.BODYTEXT_SECTION
+        )
+        return ["BodyText/Section" + str(number) for number in section_numbers]
+
+    def _text_to_document(self, text: str, extra_info: Optional[Dict] = None) -> Document:
+        """Wrap ``text`` in a ``Document``, defaulting ``extra_info`` to ``{}``."""
+        return Document(
+            text=text,
+            extra_info=extra_info or {}
+        )
+
+    def get_text(self):
+        """Return the text extracted by the most recent ``load_data`` call."""
+        return self.text
+
+    def _get_text(self, load_file, file_dir):
+        """Extract the full text of the file and cache it on ``self.text``.
+
+        Args:
+            load_file: Open OLE file object.
+            file_dir: Directory listing of ``load_file``.
+
+        Returns:
+            str: Text of every body section, each followed by a newline.
+        """
+        sections = self.get_body_sections(file_dir)
+        self.text = "".join(
+            self.get_text_from_section(load_file, section) + "\n"
+            for section in sections
+        )
+        return self.text
+
+    def is_compressed(self, load_file):
+        """Return True when the file header flags the body streams as compressed.
+
+        Args:
+            load_file: Open OLE file object.
+        """
+        header_data = load_file.openstream(self.FILE_HEADER_SECTION).read()
+        # Bit 0 of byte 36 is the compression flag (per the HWP 5.0 layout,
+        # the property flags presumably follow a 32-byte signature and a
+        # 4-byte version).
+        return (header_data[36] & 1) == 1
+
+    def get_text_from_section(self, load_file, section):
+        """Decode every text record found in one body section stream.
+
+        Args:
+            load_file: Open OLE file object.
+            section: Stream path such as ``"BodyText/Section0"``.
+
+        Returns:
+            str: The decoded text records, each followed by a newline.
+        """
+        data = load_file.openstream(section).read()
+        if self.is_compressed(load_file):
+            # Negative wbits: raw deflate stream without a zlib header.
+            data = zlib.decompress(data, -15)
+        size = len(data)
+
+        pieces = []
+        i = 0
+        # A record header is 4 bytes; stop on a trailing partial header
+        # instead of letting struct raise.
+        while i + 4 <= size:
+            header = struct.unpack_from("<I", data, i)[0]
+            rec_type = header & 0x3ff
+            rec_len = (header >> 20) & 0xfff
+            i += 4
+
+            # A size field of 0xfff means the real length is stored in the
+            # DWORD that follows the header (records of 4095 bytes or more).
+            if rec_len == 0xfff:
+                if i + 4 > size:
+                    break
+                rec_len = struct.unpack_from("<I", data, i)[0]
+                i += 4
+
+            if rec_type in self.HWP_TEXT_TAGS:
+                # Text is little-endian UTF-16 without a BOM; be explicit so
+                # the decoder never sniffs a byte order from the payload.
+                pieces.append(data[i:i + rec_len].decode("utf-16-le"))
+
+            i += rec_len
+
+        return "".join(piece + "\n" for piece in pieces)
diff --git a/llama_hub/hwp/requirements.txt b/llama_hub/hwp/requirements.txt
@@ -0,0 +1 @@
+olefile
diff --git a/llama_hub/library.json b/llama_hub/library.json
@@ -932,5 +932,12 @@
       "pdf",
       "pdf table"
     ]
+  },
+  "LinearReader": {
+    "id": "linear",
+    "author": "Sushmithamallesh",
+    "keywords": [
+      "linear"
+    ]
   }
 }
diff --git a/llama_hub/linear/README.md b/llama_hub/linear/README.md
@@ -0,0 +1,70 @@
+# Linear Reader
+
+The Linear loader returns issues based on the query.
+
+## Usage
+
+Here's an example of how to use it
+
+```python
+
+from llama_hub.linear.base import LinearReader
+
+reader = LinearReader(api_key=api_key)
+query = """
+    query Team {
+        team(id: "9cfb482a-81e3-4154-b5b9-2c805e70a02d") {
+            id
+            name
+            issues {
+                nodes {
+                    id
+                    title
+                    description
+                    assignee {
+                        id
+                        name
+                    }
+                    createdAt
+                    archivedAt
+                }
+            }
+        }
+    }
+"""
+
+documents = reader.load_data(query=query)
+```
+
+Alternately, you can also use download_loader from llama_index
+
+```python
+
+from llama_index import download_loader
+LinearReader = download_loader('LinearReader')
+
+reader = LinearReader(api_key=api_key)
+query = """
+    query Team {
+        team(id: "9cfb482a-81e3-4154-b5b9-2c805e70a02d") {
+            id
+            name
+            issues {
+                nodes {
+                    id
+                    title
+                    description
+                    assignee {
+                        id
+                        name
+                    }
+                    createdAt
+                    archivedAt
+                }
+            }
+        }
+    }
+"""
+documents = reader.load_data(query=query)
+
+```
diff --git a/llama_hub/linear/__init__.py b/llama_hub/linear/__init__.py
@@ -0,0 +1 @@
+"""Init file."""