Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Commit

Permalink
Merge branch 'emptycrown:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
guyyanko authored Sep 5, 2023
2 parents b3ef924 + 7e053f0 commit 0a14de5
Show file tree
Hide file tree
Showing 18 changed files with 433 additions and 4 deletions.
18 changes: 18 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,21 @@

## [v0.0.26] - 2023-08-31

(includes v0.0.25)

### New Feature Releases
- Add Linear loader (#490)
- Add PDF Table Reader (#476)
- Bagel loader Added (#479)

### Smaller Features + Bug Fixes
- Database arg fix in Firestore client (#483)
- Updates to prevent errors when transforming data in the WordLift loader (#489)
- UTF-8 encode and decode for gmail (#491)
- iterate json data to Document object in unstructured loader (#485)
- add custom user agent for metaphor llama index initialization (#480)
- Fix Syntax in Docs (#478)

## [v0.0.24] - 2023-08-20

### New Feature Release
Expand Down
2 changes: 1 addition & 1 deletion llama_hub/firestore/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def __init__(
raise ImportError(IMPORT_ERROR_MSG)

self.db = firestore.Client(project=project_id,
database_id=database_id,
database=database_id,
client_info=CLIENT_INFO)

def load_data(self, collection: str) -> List[Document]:
Expand Down
16 changes: 16 additions & 0 deletions llama_hub/hangeul/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# HWP Loader

This loader reads HWP (Hangul Word Processor) files, the format used for many official documents in South Korea.

## Usage

To use this loader, pass in the path to an HWP file. Both compressed and uncompressed HWP files are supported.

```python
from llama_hub.hangeul.base import HWPReader
from pathlib import Path

hwp_path = Path('/path/to/hwp')
reader = HWPReader()
documents = reader.load_data(file=hwp_path)
```
1 change: 1 addition & 0 deletions llama_hub/hangeul/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Init file."""
112 changes: 112 additions & 0 deletions llama_hub/hangeul/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import olefile
import zlib
import struct

from pathlib import Path
from typing import Any, Dict, List, Optional

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document

class HWPReader(BaseReader):
    """Reader for HWP (Hangul Word Processor) files.

    An HWP file is opened as an OLE compound file. The text is collected from
    the records tagged with one of ``HWP_TEXT_TAGS`` inside every
    ``BodyText/Section<N>`` stream, in ascending section order.

    Args: None
    """

    def __init__(
        self,
        *args: Any,
        **kwargs: Any
    ) -> None:
        super().__init__(*args, **kwargs)
        self.FILE_HEADER_SECTION = "FileHeader"
        self.HWP_SUMMARY_SECTION = "\x05HwpSummaryInformation"
        self.SECTION_NAME_LENGTH = len("Section")
        self.BODYTEXT_SECTION = "BodyText"
        # Record tag ids whose payload is paragraph text
        # (67 is HWPTAG_PARA_TEXT in the HWP 5.0 specification).
        self.HWP_TEXT_TAGS = [67]
        # Text of the most recently loaded file; initialised so that
        # get_text() is safe to call before the first load_data().
        self.text = ""

    def load_data(
        self,
        file: Path,
    ) -> Document:
        """Load an HWP file and extract its body text.

        Args:
            file (Path): Path to the HWP file.

        Returns:
            Document: A single document holding the text of all body sections.

        Raises:
            Exception: If the file lacks the streams expected of an HWP file.
        """
        load_file = olefile.OleFileIO(file)
        try:
            file_dir = load_file.listdir()

            if not self.is_valid(file_dir):
                raise Exception("Not Valid HwpFile")

            result_text = self._get_text(load_file, file_dir)
        finally:
            # Always release the underlying file handle, even on failure.
            load_file.close()

        return self._text_to_document(text=result_text)

    def is_valid(self, dirs):
        """Return True if ``dirs`` lists both the header and summary streams.

        ``dirs`` is the stream listing of the OLE file, where each entry is a
        list of path components.
        """
        if [self.FILE_HEADER_SECTION] not in dirs:
            return False

        return [self.HWP_SUMMARY_SECTION] in dirs

    def get_body_sections(self, dirs):
        """Return the ``BodyText/Section<N>`` stream names sorted by ``N``."""
        section_numbers = sorted(
            int(d[1][self.SECTION_NAME_LENGTH:])
            for d in dirs
            if d[0] == self.BODYTEXT_SECTION
        )
        return ["BodyText/Section" + str(x) for x in section_numbers]

    def _text_to_document(self, text: str, extra_info: Optional[Dict] = None) -> Document:
        """Wrap ``text`` (and optional metadata) in a ``Document``."""
        return Document(
            text=text,
            extra_info=extra_info or {}
        )

    def get_text(self):
        """Return the text extracted by the most recent load ("" if none)."""
        return self.text

    # Extract the entire text
    def _get_text(self, load_file, file_dir):
        """Concatenate the text of every body section, one per line group."""
        sections = self.get_body_sections(file_dir)
        # The compression flag lives in the file header, so read it once
        # instead of once per section.
        compressed = self.is_compressed(load_file)
        self.text = "".join(
            self.get_text_from_section(load_file, section, compressed) + "\n"
            for section in sections
        )
        return self.text

    def is_compressed(self, load_file):
        """Return True if bit 0 of byte 36 of the ``FileHeader`` stream is set.

        Per the HWP 5.0 specification this bit of the properties field marks
        the body streams as raw-deflate compressed.
        """
        header = load_file.openstream("FileHeader")
        header_data = header.read()
        return (header_data[36] & 1) == 1

    def get_text_from_section(self, load_file, section, compressed=None):
        """Extract the text records of one section stream.

        Args:
            load_file: Open ``olefile.OleFileIO`` instance.
            section: Stream name, e.g. ``"BodyText/Section0"``.
            compressed: Whether the stream is deflate-compressed. When None
                (the default) the flag is read from the file header.

        Returns:
            str: The decoded text records, each followed by a newline.
        """
        if compressed is None:
            compressed = self.is_compressed(load_file)

        data = load_file.openstream(section).read()
        # wbits=-15: raw deflate stream without a zlib header.
        unpacked_data = zlib.decompress(data, -15) if compressed else data
        size = len(unpacked_data)

        chunks = []
        i = 0
        # Each record starts with a 4-byte little-endian header:
        # bits 0-9 tag id, bits 10-19 level, bits 20-31 payload size.
        while i + 4 <= size:
            header = struct.unpack_from("<I", unpacked_data, i)[0]
            rec_type = header & 0x3ff
            rec_len = (header >> 20) & 0xfff
            i += 4

            if rec_len == 0xfff:
                # A size field of all ones means the real payload length
                # follows as an extra 4-byte integer (payload >= 4095 bytes).
                if i + 4 > size:
                    break
                rec_len = struct.unpack_from("<I", unpacked_data, i)[0]
                i += 4

            if rec_type in self.HWP_TEXT_TAGS:
                rec_data = unpacked_data[i:i + rec_len]
                # HWP stores text as little-endian UTF-16; name the byte
                # order explicitly so decoding never depends on the platform
                # or on BOM sniffing.
                chunks.append(rec_data.decode("utf-16-le"))

            i += rec_len

        return "".join(chunk + "\n" for chunk in chunks)
1 change: 1 addition & 0 deletions llama_hub/hangeul/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
olefile
16 changes: 16 additions & 0 deletions llama_hub/hwp/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# HWP Loader

This loader reads HWP (Hangul Word Processor) files, the format used for many official documents in South Korea.

## Usage

To use this loader, pass in the path to an HWP file. Both compressed and uncompressed HWP files are supported.

```python
from llama_hub.hwp.base import HWPReader
from pathlib import Path

hwp_path = Path('/path/to/hwp')
reader = HWPReader()
documents = reader.load_data(file=hwp_path)
```
1 change: 1 addition & 0 deletions llama_hub/hwp/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Init file."""
112 changes: 112 additions & 0 deletions llama_hub/hwp/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import olefile
import zlib
import struct

from pathlib import Path
from typing import Any, Dict, List, Optional

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document

class HWPReader(BaseReader):
    """Reader for HWP (Hangul Word Processor) files.

    An HWP file is opened as an OLE compound file. The text is collected from
    the records tagged with one of ``HWP_TEXT_TAGS`` inside every
    ``BodyText/Section<N>`` stream, in ascending section order.

    Args: None
    """

    def __init__(
        self,
        *args: Any,
        **kwargs: Any
    ) -> None:
        super().__init__(*args, **kwargs)
        self.FILE_HEADER_SECTION = "FileHeader"
        self.HWP_SUMMARY_SECTION = "\x05HwpSummaryInformation"
        self.SECTION_NAME_LENGTH = len("Section")
        self.BODYTEXT_SECTION = "BodyText"
        # Record tag ids whose payload is paragraph text
        # (67 is HWPTAG_PARA_TEXT in the HWP 5.0 specification).
        self.HWP_TEXT_TAGS = [67]
        # Text of the most recently loaded file; initialised so that
        # get_text() is safe to call before the first load_data().
        self.text = ""

    def load_data(
        self,
        file: Path,
    ) -> Document:
        """Load an HWP file and extract its body text.

        Args:
            file (Path): Path to the HWP file.

        Returns:
            Document: A single document holding the text of all body sections.

        Raises:
            Exception: If the file lacks the streams expected of an HWP file.
        """
        load_file = olefile.OleFileIO(file)
        try:
            file_dir = load_file.listdir()

            if not self.is_valid(file_dir):
                raise Exception("Not Valid HwpFile")

            result_text = self._get_text(load_file, file_dir)
        finally:
            # Always release the underlying file handle, even on failure.
            load_file.close()

        return self._text_to_document(text=result_text)

    def is_valid(self, dirs):
        """Return True if ``dirs`` lists both the header and summary streams.

        ``dirs`` is the stream listing of the OLE file, where each entry is a
        list of path components.
        """
        if [self.FILE_HEADER_SECTION] not in dirs:
            return False

        return [self.HWP_SUMMARY_SECTION] in dirs

    def get_body_sections(self, dirs):
        """Return the ``BodyText/Section<N>`` stream names sorted by ``N``."""
        section_numbers = sorted(
            int(d[1][self.SECTION_NAME_LENGTH:])
            for d in dirs
            if d[0] == self.BODYTEXT_SECTION
        )
        return ["BodyText/Section" + str(x) for x in section_numbers]

    def _text_to_document(self, text: str, extra_info: Optional[Dict] = None) -> Document:
        """Wrap ``text`` (and optional metadata) in a ``Document``."""
        return Document(
            text=text,
            extra_info=extra_info or {}
        )

    def get_text(self):
        """Return the text extracted by the most recent load ("" if none)."""
        return self.text

    # Extract the entire text
    def _get_text(self, load_file, file_dir):
        """Concatenate the text of every body section, one per line group."""
        sections = self.get_body_sections(file_dir)
        # The compression flag lives in the file header, so read it once
        # instead of once per section.
        compressed = self.is_compressed(load_file)
        self.text = "".join(
            self.get_text_from_section(load_file, section, compressed) + "\n"
            for section in sections
        )
        return self.text

    def is_compressed(self, load_file):
        """Return True if bit 0 of byte 36 of the ``FileHeader`` stream is set.

        Per the HWP 5.0 specification this bit of the properties field marks
        the body streams as raw-deflate compressed.
        """
        header = load_file.openstream("FileHeader")
        header_data = header.read()
        return (header_data[36] & 1) == 1

    def get_text_from_section(self, load_file, section, compressed=None):
        """Extract the text records of one section stream.

        Args:
            load_file: Open ``olefile.OleFileIO`` instance.
            section: Stream name, e.g. ``"BodyText/Section0"``.
            compressed: Whether the stream is deflate-compressed. When None
                (the default) the flag is read from the file header.

        Returns:
            str: The decoded text records, each followed by a newline.
        """
        if compressed is None:
            compressed = self.is_compressed(load_file)

        data = load_file.openstream(section).read()
        # wbits=-15: raw deflate stream without a zlib header.
        unpacked_data = zlib.decompress(data, -15) if compressed else data
        size = len(unpacked_data)

        chunks = []
        i = 0
        # Each record starts with a 4-byte little-endian header:
        # bits 0-9 tag id, bits 10-19 level, bits 20-31 payload size.
        while i + 4 <= size:
            header = struct.unpack_from("<I", unpacked_data, i)[0]
            rec_type = header & 0x3ff
            rec_len = (header >> 20) & 0xfff
            i += 4

            if rec_len == 0xfff:
                # A size field of all ones means the real payload length
                # follows as an extra 4-byte integer (payload >= 4095 bytes).
                if i + 4 > size:
                    break
                rec_len = struct.unpack_from("<I", unpacked_data, i)[0]
                i += 4

            if rec_type in self.HWP_TEXT_TAGS:
                rec_data = unpacked_data[i:i + rec_len]
                # HWP stores text as little-endian UTF-16; name the byte
                # order explicitly so decoding never depends on the platform
                # or on BOM sniffing.
                chunks.append(rec_data.decode("utf-16-le"))

            i += rec_len

        return "".join(chunk + "\n" for chunk in chunks)
1 change: 1 addition & 0 deletions llama_hub/hwp/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
olefile
7 changes: 7 additions & 0 deletions llama_hub/library.json
Original file line number Diff line number Diff line change
Expand Up @@ -932,5 +932,12 @@
"pdf",
"pdf table"
]
},
"LinearReader": {
"id": "linear",
"author": "Sushmithamallesh",
"keywords": [
"linear"
]
}
}
70 changes: 70 additions & 0 deletions llama_hub/linear/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Linear Reader

The Linear loader returns issues based on the query you provide.

## Usage

Here's an example of how to use it:

```python

from llama_hub.linear.base import LinearReader

reader = LinearReader(api_key=api_key)
query = """
query Team {
team(id: "9cfb482a-81e3-4154-b5b9-2c805e70a02d") {
id
name
issues {
nodes {
id
title
description
assignee {
id
name
}
createdAt
archivedAt
}
}
}
}
"""

documents = reader.load_data(query=query)
```

Alternatively, you can use `download_loader` from llama_index:

```python

from llama_index import download_loader
LinearReader = download_loader('LinearReader')

reader = LinearReader(api_key=api_key)
query = """
query Team {
team(id: "9cfb482a-81e3-4154-b5b9-2c805e70a02d") {
id
name
issues {
nodes {
id
title
description
assignee {
id
name
}
createdAt
archivedAt
}
}
}
}
"""
documents = reader.load_data(query=query)

```
1 change: 1 addition & 0 deletions llama_hub/linear/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Init file."""
Loading

0 comments on commit 0a14de5

Please sign in to comment.