This repository has been archived by the owner on Mar 1, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 735
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'emptycrown:main' into main
- Loading branch information
Showing
18 changed files
with
433 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# HWP Loader | ||
|
||
This loader reads HWP (Hangul Word Processor) files, the format used for many official documents in South Korea. | ||
|
||
## Usage | ||
|
||
To use this loader, pass in the path to an HWP file. Both compressed and uncompressed files are supported. | ||
|
||
```python | ||
from llama_hub.hangeul.base import HWPReader | ||
from pathlib import Path | ||
|
||
hwp_path = Path('/path/to/hwp') | ||
reader = HWPReader() | ||
documents = reader.load_data(file=hwp_path) | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"""Init file.""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
import olefile | ||
import zlib | ||
import struct | ||
|
||
from pathlib import Path | ||
from typing import Any, Dict, List, Optional | ||
|
||
from llama_index.readers.base import BaseReader | ||
from llama_index.readers.schema.base import Document | ||
|
||
class HWPReader(BaseReader):
    """Hangeul (HWP) reader. Extracts the body text of an HWP document.

    An HWP file is an OLE compound file. The paragraph text lives in the
    ``BodyText/Section<N>`` streams as a sequence of tagged records, which
    are raw-deflate compressed when bit 0 of the ``FileHeader`` property
    field is set.

    Args: None
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.FILE_HEADER_SECTION = "FileHeader"
        self.HWP_SUMMARY_SECTION = "\x05HwpSummaryInformation"
        self.SECTION_NAME_LENGTH = len("Section")
        self.BODYTEXT_SECTION = "BodyText"
        # Record tag ids whose payload is paragraph text
        # (67 is HWPTAG_PARA_TEXT in the HWP 5.0 specification).
        self.HWP_TEXT_TAGS = [67]
        # Text of the most recently loaded file; initialised here so that
        # get_text() is safe to call before the first load_data().
        self.text = ""

    def load_data(
        self,
        file: Path,
    ) -> Document:
        """Load an HWP file and extract its body text.

        Args:
            file (Path): Path to the HWP file.

        Returns:
            Document: A single document holding the text of all body sections.

        Raises:
            Exception: If the file lacks the streams expected of an HWP file.
        """
        load_file = olefile.OleFileIO(file)
        try:
            file_dir = load_file.listdir()

            if not self.is_valid(file_dir):
                raise Exception("Not Valid HwpFile")

            result_text = self._get_text(load_file, file_dir)
        finally:
            # Always release the underlying file handle, even on failure.
            load_file.close()

        return self._text_to_document(text=result_text)

    def is_valid(self, dirs):
        """Return True if the OLE listing has the mandatory HWP streams.

        Args:
            dirs: Stream listing from ``OleFileIO.listdir()``; each entry is
                a list of path components.
        """
        return (
            [self.FILE_HEADER_SECTION] in dirs
            and [self.HWP_SUMMARY_SECTION] in dirs
        )

    def get_body_sections(self, dirs):
        """Return the ``BodyText/Section<N>`` stream names in numeric order.

        Entries that are not of the form ``["BodyText", "Section<N>"]`` are
        skipped instead of raising.
        """
        section_numbers = []
        for entry in dirs:
            if len(entry) < 2 or entry[0] != self.BODYTEXT_SECTION:
                continue
            name = entry[1]
            suffix = name[self.SECTION_NAME_LENGTH:]
            if name.startswith("Section") and suffix.isdecimal():
                section_numbers.append(int(suffix))

        # Sort numerically so that Section10 comes after Section9.
        return [
            self.BODYTEXT_SECTION + "/Section" + str(number)
            for number in sorted(section_numbers)
        ]

    def _text_to_document(
        self, text: str, extra_info: Optional[Dict] = None
    ) -> Document:
        """Wrap extracted text (and optional metadata) in a Document."""
        return Document(text=text, extra_info=extra_info or {})

    def get_text(self):
        """Return the text of the most recently loaded file ("" if none)."""
        return self.text

    def _get_text(self, load_file, file_dir):
        """Extract the full text: every body section, each followed by a newline.

        The result is also cached on ``self.text``.
        """
        sections = self.get_body_sections(file_dir)
        self.text = "".join(
            self.get_text_from_section(load_file, section) + "\n"
            for section in sections
        )
        return self.text

    def is_compressed(self, load_file):
        """Return True if the FileHeader flags mark the body as compressed."""
        header_data = load_file.openstream(self.FILE_HEADER_SECTION).read()
        # Byte 36 is the low byte of the property field; bit 0 = compressed.
        return (header_data[36] & 1) == 1

    def get_text_from_section(self, load_file, section):
        """Return the paragraph text contained in one body section stream.

        Each text record contributes its decoded payload followed by a newline.

        Args:
            load_file: An open ``olefile.OleFileIO`` instance.
            section: Stream name such as ``"BodyText/Section0"``.
        """
        data = load_file.openstream(section).read()
        if self.is_compressed(load_file):
            # Negative wbits: raw deflate stream without a zlib header.
            data = zlib.decompress(data, -15)
        size = len(data)

        pieces = []
        offset = 0
        # Require a full 4-byte header so a trailing fragment cannot make
        # struct.unpack_from raise.
        while offset + 4 <= size:
            header = struct.unpack_from("<I", data, offset)[0]
            rec_type = header & 0x3FF
            rec_len = (header >> 20) & 0xFFF
            offset += 4

            if rec_len == 0xFFF:
                # An all-ones size field means the real length follows the
                # header as an extra little-endian DWORD.
                if offset + 4 > size:
                    break
                rec_len = struct.unpack_from("<I", data, offset)[0]
                offset += 4

            if rec_type in self.HWP_TEXT_TAGS:
                rec_data = data[offset:offset + rec_len]
                # HWP stores text as little-endian UTF-16 without a BOM.
                pieces.append(rec_data.decode("utf-16-le") + "\n")

            offset += rec_len

        return "".join(pieces)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
olefile |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# HWP Loader | ||
|
||
This loader reads HWP (Hangul Word Processor) files, the format used for many official documents in South Korea. | ||
|
||
## Usage | ||
|
||
To use this loader, pass in the path to an HWP file. Both compressed and uncompressed files are supported. | ||
|
||
```python | ||
from llama_hub.hangeul.base import HWPReader | ||
from pathlib import Path | ||
|
||
hwp_path = Path('/path/to/hwp') | ||
reader = HWPReader() | ||
documents = reader.load_data(file=hwp_path) | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"""Init file.""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
import olefile | ||
import zlib | ||
import struct | ||
|
||
from pathlib import Path | ||
from typing import Any, Dict, List, Optional | ||
|
||
from llama_index.readers.base import BaseReader | ||
from llama_index.readers.schema.base import Document | ||
|
||
class HWPReader(BaseReader):
    """Hangeul (HWP) reader. Extracts the body text of an HWP document.

    An HWP file is an OLE compound file. The paragraph text lives in the
    ``BodyText/Section<N>`` streams as a sequence of tagged records, which
    are raw-deflate compressed when bit 0 of the ``FileHeader`` property
    field is set.

    Args: None
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.FILE_HEADER_SECTION = "FileHeader"
        self.HWP_SUMMARY_SECTION = "\x05HwpSummaryInformation"
        self.SECTION_NAME_LENGTH = len("Section")
        self.BODYTEXT_SECTION = "BodyText"
        # Record tag ids whose payload is paragraph text
        # (67 is HWPTAG_PARA_TEXT in the HWP 5.0 specification).
        self.HWP_TEXT_TAGS = [67]
        # Text of the most recently loaded file; initialised here so that
        # get_text() is safe to call before the first load_data().
        self.text = ""

    def load_data(
        self,
        file: Path,
    ) -> Document:
        """Load an HWP file and extract its body text.

        Args:
            file (Path): Path to the HWP file.

        Returns:
            Document: A single document holding the text of all body sections.

        Raises:
            Exception: If the file lacks the streams expected of an HWP file.
        """
        load_file = olefile.OleFileIO(file)
        try:
            file_dir = load_file.listdir()

            if not self.is_valid(file_dir):
                raise Exception("Not Valid HwpFile")

            result_text = self._get_text(load_file, file_dir)
        finally:
            # Always release the underlying file handle, even on failure.
            load_file.close()

        return self._text_to_document(text=result_text)

    def is_valid(self, dirs):
        """Return True if the OLE listing has the mandatory HWP streams.

        Args:
            dirs: Stream listing from ``OleFileIO.listdir()``; each entry is
                a list of path components.
        """
        return (
            [self.FILE_HEADER_SECTION] in dirs
            and [self.HWP_SUMMARY_SECTION] in dirs
        )

    def get_body_sections(self, dirs):
        """Return the ``BodyText/Section<N>`` stream names in numeric order.

        Entries that are not of the form ``["BodyText", "Section<N>"]`` are
        skipped instead of raising.
        """
        section_numbers = []
        for entry in dirs:
            if len(entry) < 2 or entry[0] != self.BODYTEXT_SECTION:
                continue
            name = entry[1]
            suffix = name[self.SECTION_NAME_LENGTH:]
            if name.startswith("Section") and suffix.isdecimal():
                section_numbers.append(int(suffix))

        # Sort numerically so that Section10 comes after Section9.
        return [
            self.BODYTEXT_SECTION + "/Section" + str(number)
            for number in sorted(section_numbers)
        ]

    def _text_to_document(
        self, text: str, extra_info: Optional[Dict] = None
    ) -> Document:
        """Wrap extracted text (and optional metadata) in a Document."""
        return Document(text=text, extra_info=extra_info or {})

    def get_text(self):
        """Return the text of the most recently loaded file ("" if none)."""
        return self.text

    def _get_text(self, load_file, file_dir):
        """Extract the full text: every body section, each followed by a newline.

        The result is also cached on ``self.text``.
        """
        sections = self.get_body_sections(file_dir)
        self.text = "".join(
            self.get_text_from_section(load_file, section) + "\n"
            for section in sections
        )
        return self.text

    def is_compressed(self, load_file):
        """Return True if the FileHeader flags mark the body as compressed."""
        header_data = load_file.openstream(self.FILE_HEADER_SECTION).read()
        # Byte 36 is the low byte of the property field; bit 0 = compressed.
        return (header_data[36] & 1) == 1

    def get_text_from_section(self, load_file, section):
        """Return the paragraph text contained in one body section stream.

        Each text record contributes its decoded payload followed by a newline.

        Args:
            load_file: An open ``olefile.OleFileIO`` instance.
            section: Stream name such as ``"BodyText/Section0"``.
        """
        data = load_file.openstream(section).read()
        if self.is_compressed(load_file):
            # Negative wbits: raw deflate stream without a zlib header.
            data = zlib.decompress(data, -15)
        size = len(data)

        pieces = []
        offset = 0
        # Require a full 4-byte header so a trailing fragment cannot make
        # struct.unpack_from raise.
        while offset + 4 <= size:
            header = struct.unpack_from("<I", data, offset)[0]
            rec_type = header & 0x3FF
            rec_len = (header >> 20) & 0xFFF
            offset += 4

            if rec_len == 0xFFF:
                # An all-ones size field means the real length follows the
                # header as an extra little-endian DWORD.
                if offset + 4 > size:
                    break
                rec_len = struct.unpack_from("<I", data, offset)[0]
                offset += 4

            if rec_type in self.HWP_TEXT_TAGS:
                rec_data = data[offset:offset + rec_len]
                # HWP stores text as little-endian UTF-16 without a BOM.
                pieces.append(rec_data.decode("utf-16-le") + "\n")

            offset += rec_len

        return "".join(pieces)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
olefile |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
# Linear Reader | ||
|
||
The Linear loader returns the issues matched by a GraphQL query. | ||
|
||
## Usage | ||
|
||
Here's an example of how to use it: | ||
|
||
```python | ||
|
||
from llama_hub.linear.base import LinearReader | ||
|
||
reader = LinearReader(api_key=api_key) | ||
query = """ | ||
query Team { | ||
team(id: "9cfb482a-81e3-4154-b5b9-2c805e70a02d") { | ||
id | ||
name | ||
issues { | ||
nodes { | ||
id | ||
title | ||
description | ||
assignee { | ||
id | ||
name | ||
} | ||
createdAt | ||
archivedAt | ||
} | ||
} | ||
} | ||
} | ||
""" | ||
|
||
documents = reader.load_data(query=query) | ||
``` | ||
|
||
Alternatively, you can use `download_loader` from llama_index: | ||
|
||
```python | ||
|
||
from llama_index import download_loader | ||
LinearReader = download_loader('LinearReader') | ||
|
||
reader = LinearReader(api_key=api_key) | ||
query = """ | ||
query Team { | ||
team(id: "9cfb482a-81e3-4154-b5b9-2c805e70a02d") { | ||
id | ||
name | ||
issues { | ||
nodes { | ||
id | ||
title | ||
description | ||
assignee { | ||
id | ||
name | ||
} | ||
createdAt | ||
archivedAt | ||
} | ||
} | ||
} | ||
} | ||
""" | ||
documents = reader.load_data(query=query) | ||
|
||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"""Init file.""" |
Oops, something went wrong.