hybrid file extractor #70

Merged
merged 1 commit on Apr 1, 2023
246 changes: 246 additions & 0 deletions examples/hybrid_extractor_example.ipynb
@@ -0,0 +1,246 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import openai\n",
"from example_config import SECRET_KEY\n",
"openai.api_key = SECRET_KEY"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from knowledgegpt.extractors.hybrid_extractor import HybridFileExtractpr"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"query = \"How to cook a golden lentil soup? Explain the whole process from the ingredients to serve.\"\n",
"file_path = \"your_path_to_the_folder_containing_the_files\"\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing PDF file...\n",
"Extracting paragraphs...\n",
"Error in file: /home/eren/Documents/Codes/GPTScripts/transfer_version/knowledge-gpt/examples/Beneficial_Perturbations_Network_for_Defending_Adversarial_Examples.pdf\n",
"Computing embeddings...\n",
"model_lang en\n",
"Selected 5 document sections:\n",
"179\n",
"107\n",
"168\n",
"177\n",
"170\n",
"Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say \"I don't know.\"\n",
"\n",
"Context:\n",
"\n",
"* Golden\tLentil\tSoup Ingredients \t 1\tcup\tred\tlentil 6\tcups\twater 4\tchicken\tstock\tcubes 4\tcarrots,\tChopped 1\tonion,\tChopped 2\tgarlic\tcloves,\tMinced 2\tteaspoons\tcurry\tpowder \t cayenne\t(\tTo\tTaste)\t(optional) \t Directions Heat\tall\tof\tthe\titems\tto\tboiling\tin\tpot. Then\tlower\tthe\ttemperature. Allow\tto\tsimmer\tfor\t½\thour\tor\ttill\tlentils\tbecome\tsoft. After\tthis,\tpuree\tthem\ttill\tbecome\tsmooth. \t\n",
"* Lentil\tand\tPea\tSoup Ingredients \t 1\ttablespoon\tvegetable\toil 1\t1/2\tcups\tchopped\tSpanish\tonions \t 1\t1/2\tteaspoons\tgarlic,\tminced \t 12\tcups\twater 1\tcup\tdried\tbrown\tlentils 1\tcup\tdried\tsplit\tpeas 4\tsmoked\tham\thocks\t(about\t3\t3/4\tlbs) \t 1\ttablespoon\tsalt 1\ttablespoon\tlemon\tjuice 1/2\tteaspoon\tdried\tthyme\tleaves \t 1/2\tteaspoon\tdried\tsage 1/2\tteaspoon\tdried\tmarjoram 1\t-2\tbay\tleaf 1\t1/2\tcups\tdiced\tcarrots 1\t1/4\tcups\tchopped\tcelery lemon\tslice \t Directions Fry\tthe\tgarlic\tand\tonion\tin\toil\tin\tdutch\toven\tover\tmoderate\ttemperature\tfor FIVE\tmin,\tmixing\ttill\ttender. Add\tthe\trest\tof\titems\tbesides\tlemon\tslices,\tcarrots\tand\tcelery. Heat\tto\tboiling. Lower\tthe\ttemperature\tand\tcover\tthe\tpot.\n",
"* Cream\tof\tFish\tSoup Ingredients \t White\tSauce 25\tg\tbutter 25\tg\tflour 550\tml\tfish\tstock salt white\tpepper 1\tgarlic\tclove,\tcrushed \t Soup 450\tg\tcooked\twhite\tfish\tfillets,\tskinned\tde\tboned,\tflaked \t 175\tg\tpeeled\tprawns\t( shrimp) \t salt white\tpepper 75\tml\tcream 50\tg\tbutter \t Directions Then\tblend\tin\tflour\tin\tmelted\tbutter\tand\tcook\tfor\t60\tseconds,\tafter\tthis,\tpour\tin stock\tand\tblend\tway\tfrom\theat.\n",
"* Easy\tBroccoli\tSoup Ingredients \t 1\t-2\ttablespoon\toil 1\tonion\t(\tdiced) 2\tgarlic\tcloves\t(\tminced\tor\tuse\t2teasp\tfrom\ta\tjar) \t 1\tlarge\thead\tbroccoli\t(\ttrimmed and\tchopped,\tabout\t500g) \t 1\tpotato\t(\tpeeled\t&\tdiced\tor\tleave\tskin\ton\tif\tyou prefer) \t 6\tcups\tchicken\tstock salt\t&\tfreshly\tground\tblack\tpepper \t 2\t-3\tslices\tbacon\t(\tdiced) \t Directions Fry\tthe\tgarlic\tand\tonion\tin\toil\tin\tpan\tover\tmoderate\ttemperature\tfor\tTHREE\tmin or\ttill\ttender. Take\tstock,\tbroccoli,\tpotato\tand\tadd\tthem\tand\theat\tto\tboiling. Lower\tthe\ttemperature\tand\tallow\tto\tsimmer\tfor\t1/3\thour\tor\ttill\tveggies\tare cooked. Blend\tin\tfood\tprocessor\tfor\tmaking\tsmooth\tmixture. Take\tsoup\tand\tadd\tit\tback\tto\tpan\tand\theat\tslightly. Then\tadd\tthe\tchopped\tbacon\tand\tallow\tto\tsimmer\ttill\tbacon\tis\tcooked,\n",
"* Avocado\tBanana\tChilled\tSoup Ingredients \t 2\tripe\tHass\tavocadoes,\tpeeled,\tpitted\tand\tdiced \t 2\tlarge\tripe\tbananas,\tpeeled\tand sliced \t 1\tcup\tmilk 3/4-1\tcup\tsugar 4\ttablespoons\tlemon\tjuice \t 1/2\tteaspoon\tcinnamon 1/8\tteaspoon\tnutmeg 1\tquart\tplain\tyogurt salt,\tif\tneeded,\tto\ttaste \t Directions First\tof\tall,\tpuree\tthe\tbananas\tand\tavocados\tin\tblender. Take\tmilk,\tnutmeg,\tyogurt,\tcinnamon,\tlemon\tjuice,\tsugar\tand\tadd\tthem\tand\tmix till\tbecome\tsmooth. Then\tadd\tin\tsalt. Allow\tto\tchill\tin\trefrigerator\tfor\t180\tmin. \t\n",
"\n",
" Q: How to cook a golden lentil soup? Explain the whole process from the ingredients to serve.\n",
" A:\n",
"all_done!\n"
]
}
],
"source": [
"hybrid_extractor = HybridFileExtractpr( file_path, extraction_type=\"page\", embedding_extractor=\"hf\", model_lang=\"en\", )\n",
"answer, prompt, messages = hybrid_extractor.extract(query, max_tokens=1500)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>index</th>\n",
" <th>page_number</th>\n",
" <th>content</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>Improved Network Robustness\\nwith Adversary Cr...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>F1\\nF2^x2\\n^x1\\nx1\\nx2\\nFigure 1: Adversarial ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>Defenses against adversarial attacks Adversari...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>where riis the adversarial perturbation genera...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>4 Robust Learning with Adversary Critic\\nAs we...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>419</th>\n",
" <td>213</td>\n",
" <td>NaN</td>\n",
" <td>UP-DETR: Unsupervised Pre-training for Object ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>420</th>\n",
" <td>214</td>\n",
" <td>NaN</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>421</th>\n",
" <td>215</td>\n",
" <td>NaN</td>\n",
" <td>Paper:</td>\n",
" </tr>\n",
" <tr>\n",
" <th>422</th>\n",
" <td>216</td>\n",
" <td>NaN</td>\n",
" <td>Code: \\n\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>423</th>\n",
" <td>217</td>\n",
" <td>NaN</td>\n",
" <td>Deformable DETR: Deformable Transformers for E...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>424 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" index page_number content\n",
"0 0 1 Improved Network Robustness\\nwith Adversary Cr...\n",
"1 1 2 F1\\nF2^x2\\n^x1\\nx1\\nx2\\nFigure 1: Adversarial ...\n",
"2 2 3 Defenses against adversarial attacks Adversari...\n",
"3 3 4 where riis the adversarial perturbation genera...\n",
"4 4 5 4 Robust Learning with Adversary Critic\\nAs we...\n",
".. ... ... ...\n",
"419 213 NaN UP-DETR: Unsupervised Pre-training for Object ...\n",
"420 214 NaN \n",
"421 215 NaN Paper: \n",
"422 216 NaN Code: \\n\\n\n",
"423 217 NaN Deformable DETR: Deformable Transformers for E...\n",
"\n",
"[424 rows x 3 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hybrid_extractor.df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"424"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(hybrid_extractor.embeddings)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "knowledgegpt-env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
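
Note: the notebook's first cell imports SECRET_KEY from an example_config module that is not part of this diff. A minimal sketch of what that file is assumed to contain (the variable name comes from the notebook; the key value is a placeholder):

# example_config.py -- a minimal sketch; this file is not included in the PR.
# SECRET_KEY is assumed to hold the OpenAI API key read by the notebook's first cell.
SECRET_KEY = "sk-..."  # placeholder; keep real keys out of version control
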
72 changes: 72 additions & 0 deletions knowledgegpt/extractors/hybrid_extractor.py
@@ -0,0 +1,72 @@
from knowledgegpt.extractors.base_extractor import BaseExtractor
from knowledgegpt.utils.utils_pdf import process_pdf, process_pdf_page
from knowledgegpt.utils.utils_powerpoint import process_pptx
from knowledgegpt.utils.utils_docs import extract_paragraphs

from io import BytesIO
import os

import pandas as pd


class HybridFileExtractpr(BaseExtractor):
    def __init__(self, directory_path: str, extraction_type: str = "page", embedding_extractor: str = "hf",
                 model_lang: str = "en", is_turbo: bool = False, verbose: bool = False, index_path: str = None,
                 index_type: str = "basic"):
        """
        Extracts text from every PDF, Word, and PowerPoint file in a directory,
        computes embeddings for each extracted page or paragraph, and answers
        queries against those embeddings.
        """
        super().__init__(embedding_extractor=embedding_extractor, model_lang=model_lang, is_turbo=is_turbo,
                         verbose=verbose, index_path=index_path, index_type=index_type)

        self.directory_path = directory_path
        self.extraction_type = extraction_type

    def prepare_df(self):
        if self.df is None:
            if not self.verbose:
                print("Processing PDF file...")
                print("Extracting paragraphs...")

            if os.path.isdir(self.directory_path):
                self.df = pd.DataFrame()

                # PDF files: extract per page or per document depending on extraction_type.
                pdf_files = [os.path.join(self.directory_path, f) for f in os.listdir(self.directory_path)
                             if f.endswith(".pdf")]
                for pdf_file in pdf_files:
                    try:
                        if self.extraction_type == "page":
                            self.df = pd.concat([self.df, process_pdf_page(pdf_file)])
                        else:
                            self.df = pd.concat([self.df, process_pdf(pdf_file)])
                    except Exception:
                        print("Error in file: ", pdf_file)
                        continue

                # Word files: read into memory and extract paragraphs.
                doc_files = [os.path.join(self.directory_path, f) for f in os.listdir(self.directory_path)
                             if f.endswith(".doc") or f.endswith(".docx")]
                for doc_file in doc_files:
                    # doc_files is already filtered to Word extensions; keep the check as a defensive skip.
                    _, ext = os.path.splitext(doc_file)
                    if ext not in (".doc", ".docx"):
                        continue
                    try:
                        with open(doc_file, "rb") as f:
                            docs_buffer = BytesIO(f.read())

                        self.df = pd.concat([self.df, extract_paragraphs(docs_buffer)])
                    except Exception:
                        print("Error in file: ", doc_file)
                        continue

                # PowerPoint files: read into memory and extract slide text.
                pptx_files = [os.path.join(self.directory_path, f) for f in os.listdir(self.directory_path)
                              if f.endswith(".pptx")]
                for pptx_file in pptx_files:
                    try:
                        with open(pptx_file, "rb") as f:
                            pptx_buffer = BytesIO(f.read())

                        self.df = pd.concat([self.df, process_pptx(pptx_buffer)])
                    except Exception:
                        print("Error in file: ", pptx_file)
                        continue

                self.df = self.df.reset_index()
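
For reference, a minimal end-to-end sketch of driving the new extractor from a plain script, mirroring the notebook above; the directory path and query are placeholders, and the extract(query, max_tokens=...) call and the df/embeddings attributes are taken from the example cells:

from knowledgegpt.extractors.hybrid_extractor import HybridFileExtractpr

# Folder that mixes .pdf, .doc/.docx and .pptx files (placeholder path).
extractor = HybridFileExtractpr(
    "path/to/your/documents",
    extraction_type="page",      # per-page PDF extraction; any other value falls back to process_pdf
    embedding_extractor="hf",
    model_lang="en",
)

# Returns the answer, the constructed prompt, and the message history.
answer, prompt, messages = extractor.extract(
    "How to cook a golden lentil soup?",
    max_tokens=1500,
)

print(answer)
print(len(extractor.df), "rows extracted,", len(extractor.embeddings), "embeddings")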