Skip to content

Commit

Permalink
hybrid file extractor
Browse files Browse the repository at this point in the history
  • Loading branch information
eren23 committed Apr 1, 2023
1 parent 979ba77 commit 6182918
Show file tree
Hide file tree
Showing 2 changed files with 318 additions and 0 deletions.
246 changes: 246 additions & 0 deletions examples/hybrid_extractor_example.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import openai\n",
"from example_config import SECRET_KEY\n",
"openai.api_key = SECRET_KEY"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from knowledgegpt.extractors.hybrid_extractor import HybridFileExtractpr"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"query = \"How to cook a golden lentil soup? Explain the whole process from the ingredients to serve.\"\n",
"file_path = \"your_path_to_the_folder_containing_the_files\"\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing PDF file...\n",
"Extracting paragraphs...\n",
"Error in file: /home/eren/Documents/Codes/GPTScripts/transfer_version/knowledge-gpt/examples/Beneficial_Perturbations_Network_for_Defending_Adversarial_Examples.pdf\n",
"Computing embeddings...\n",
"model_lang en\n",
"Selected 5 document sections:\n",
"179\n",
"107\n",
"168\n",
"177\n",
"170\n",
"Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say \"I don't know.\"\n",
"\n",
"Context:\n",
"\n",
"* Golden\tLentil\tSoup Ingredients \t 1\tcup\tred\tlentil 6\tcups\twater 4\tchicken\tstock\tcubes 4\tcarrots,\tChopped 1\tonion,\tChopped 2\tgarlic\tcloves,\tMinced 2\tteaspoons\tcurry\tpowder \t cayenne\t(\tTo\tTaste)\t(optional) \t Directions Heat\tall\tof\tthe\titems\tto\tboiling\tin\tpot. Then\tlower\tthe\ttemperature. Allow\tto\tsimmer\tfor\t½\thour\tor\ttill\tlentils\tbecome\tsoft. After\tthis,\tpuree\tthem\ttill\tbecome\tsmooth. \t\n",
"* Lentil\tand\tPea\tSoup Ingredients \t 1\ttablespoon\tvegetable\toil 1\t1/2\tcups\tchopped\tSpanish\tonions \t 1\t1/2\tteaspoons\tgarlic,\tminced \t 12\tcups\twater 1\tcup\tdried\tbrown\tlentils 1\tcup\tdried\tsplit\tpeas 4\tsmoked\tham\thocks\t(about\t3\t3/4\tlbs) \t 1\ttablespoon\tsalt 1\ttablespoon\tlemon\tjuice 1/2\tteaspoon\tdried\tthyme\tleaves \t 1/2\tteaspoon\tdried\tsage 1/2\tteaspoon\tdried\tmarjoram 1\t-2\tbay\tleaf 1\t1/2\tcups\tdiced\tcarrots 1\t1/4\tcups\tchopped\tcelery lemon\tslice \t Directions Fry\tthe\tgarlic\tand\tonion\tin\toil\tin\tdutch\toven\tover\tmoderate\ttemperature\tfor FIVE\tmin,\tmixing\ttill\ttender. Add\tthe\trest\tof\titems\tbesides\tlemon\tslices,\tcarrots\tand\tcelery. Heat\tto\tboiling. Lower\tthe\ttemperature\tand\tcover\tthe\tpot.\n",
"* Cream\tof\tFish\tSoup Ingredients \t White\tSauce 25\tg\tbutter 25\tg\tflour 550\tml\tfish\tstock salt white\tpepper 1\tgarlic\tclove,\tcrushed \t Soup 450\tg\tcooked\twhite\tfish\tfillets,\tskinned\tde\tboned,\tflaked \t 175\tg\tpeeled\tprawns\t( shrimp) \t salt white\tpepper 75\tml\tcream 50\tg\tbutter \t Directions Then\tblend\tin\tflour\tin\tmelted\tbutter\tand\tcook\tfor\t60\tseconds,\tafter\tthis,\tpour\tin stock\tand\tblend\tway\tfrom\theat.\n",
"* Easy\tBroccoli\tSoup Ingredients \t 1\t-2\ttablespoon\toil 1\tonion\t(\tdiced) 2\tgarlic\tcloves\t(\tminced\tor\tuse\t2teasp\tfrom\ta\tjar) \t 1\tlarge\thead\tbroccoli\t(\ttrimmed and\tchopped,\tabout\t500g) \t 1\tpotato\t(\tpeeled\t&\tdiced\tor\tleave\tskin\ton\tif\tyou prefer) \t 6\tcups\tchicken\tstock salt\t&\tfreshly\tground\tblack\tpepper \t 2\t-3\tslices\tbacon\t(\tdiced) \t Directions Fry\tthe\tgarlic\tand\tonion\tin\toil\tin\tpan\tover\tmoderate\ttemperature\tfor\tTHREE\tmin or\ttill\ttender. Take\tstock,\tbroccoli,\tpotato\tand\tadd\tthem\tand\theat\tto\tboiling. Lower\tthe\ttemperature\tand\tallow\tto\tsimmer\tfor\t1/3\thour\tor\ttill\tveggies\tare cooked. Blend\tin\tfood\tprocessor\tfor\tmaking\tsmooth\tmixture. Take\tsoup\tand\tadd\tit\tback\tto\tpan\tand\theat\tslightly. Then\tadd\tthe\tchopped\tbacon\tand\tallow\tto\tsimmer\ttill\tbacon\tis\tcooked,\n",
"* Avocado\tBanana\tChilled\tSoup Ingredients \t 2\tripe\tHass\tavocadoes,\tpeeled,\tpitted\tand\tdiced \t 2\tlarge\tripe\tbananas,\tpeeled\tand sliced \t 1\tcup\tmilk 3/4-1\tcup\tsugar 4\ttablespoons\tlemon\tjuice \t 1/2\tteaspoon\tcinnamon 1/8\tteaspoon\tnutmeg 1\tquart\tplain\tyogurt salt,\tif\tneeded,\tto\ttaste \t Directions First\tof\tall,\tpuree\tthe\tbananas\tand\tavocados\tin\tblender. Take\tmilk,\tnutmeg,\tyogurt,\tcinnamon,\tlemon\tjuice,\tsugar\tand\tadd\tthem\tand\tmix till\tbecome\tsmooth. Then\tadd\tin\tsalt. Allow\tto\tchill\tin\trefrigerator\tfor\t180\tmin. \t\n",
"\n",
" Q: How to cook a golden lentil soup? Explain the whole process from the ingredients to serve.\n",
" A:\n",
"all_done!\n"
]
}
],
"source": [
"hybrid_extractor = HybridFileExtractpr( file_path, extraction_type=\"page\", embedding_extractor=\"hf\", model_lang=\"en\", )\n",
"answer, prompt, messages = hybrid_extractor.extract(query, max_tokens=1500)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>index</th>\n",
" <th>page_number</th>\n",
" <th>content</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>Improved Network Robustness\\nwith Adversary Cr...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>F1\\nF2^x2\\n^x1\\nx1\\nx2\\nFigure 1: Adversarial ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>Defenses against adversarial attacks Adversari...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>where riis the adversarial perturbation genera...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>4 Robust Learning with Adversary Critic\\nAs we...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>419</th>\n",
" <td>213</td>\n",
" <td>NaN</td>\n",
" <td>UP-DETR: Unsupervised Pre-training for Object ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>420</th>\n",
" <td>214</td>\n",
" <td>NaN</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>421</th>\n",
" <td>215</td>\n",
" <td>NaN</td>\n",
" <td>Paper:</td>\n",
" </tr>\n",
" <tr>\n",
" <th>422</th>\n",
" <td>216</td>\n",
" <td>NaN</td>\n",
" <td>Code: \\n\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>423</th>\n",
" <td>217</td>\n",
" <td>NaN</td>\n",
" <td>Deformable DETR: Deformable Transformers for E...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>424 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" index page_number content\n",
"0 0 1 Improved Network Robustness\\nwith Adversary Cr...\n",
"1 1 2 F1\\nF2^x2\\n^x1\\nx1\\nx2\\nFigure 1: Adversarial ...\n",
"2 2 3 Defenses against adversarial attacks Adversari...\n",
"3 3 4 where riis the adversarial perturbation genera...\n",
"4 4 5 4 Robust Learning with Adversary Critic\\nAs we...\n",
".. ... ... ...\n",
"419 213 NaN UP-DETR: Unsupervised Pre-training for Object ...\n",
"420 214 NaN \n",
"421 215 NaN Paper: \n",
"422 216 NaN Code: \\n\\n\n",
"423 217 NaN Deformable DETR: Deformable Transformers for E...\n",
"\n",
"[424 rows x 3 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hybrid_extractor.df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"424"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(hybrid_extractor.embeddings)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "knowledgegpt-env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
72 changes: 72 additions & 0 deletions knowledgegpt/extractors/hybrid_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import os
from io import BytesIO

import pandas as pd

from knowledgegpt.extractors.base_extractor import BaseExtractor
from knowledgegpt.utils.utils_docs import extract_paragraphs
from knowledgegpt.utils.utils_pdf import process_pdf, process_pdf_page
from knowledgegpt.utils.utils_powerpoint import process_pptx


class HybridFileExtractpr(BaseExtractor):
    """Extract text from every PDF, Word and PowerPoint file in a directory.

    Builds a single DataFrame (``self.df``) with one row per extracted
    section; embedding computation and query answering are inherited from
    :class:`BaseExtractor`.

    NOTE(review): the class name carries a historical typo; the correctly
    spelled alias ``HybridFileExtractor`` is defined below for new callers.
    """

    def __init__(self, directory_path: str, extraction_type: str = "page", embedding_extractor: str = "hf",
                 model_lang: str = "en", is_turbo: bool = False, verbose: bool = False, index_path: str = None,
                 index_type: str = "basic"):
        """
        :param directory_path: Folder whose ``.pdf``/``.doc``/``.docx``/``.pptx``
            files are processed.
        :param extraction_type: ``"page"`` extracts PDFs page-by-page; any
            other value extracts paragraph-wise.
        :param embedding_extractor: Embedding backend name (passed to base).
        :param model_lang: Language code for the embedding model.
        :param is_turbo: Use the turbo chat model (passed to base).
        :param verbose: Print progress messages while extracting.
        :param index_path: Optional path to a prebuilt index (passed to base).
        :param index_type: Index flavor (passed to base).
        """
        super().__init__(embedding_extractor=embedding_extractor, model_lang=model_lang, is_turbo=is_turbo,
                         verbose=verbose, index_path=index_path, index_type=index_type)

        self.directory_path = directory_path
        self.extraction_type = extraction_type

    def _matching_paths(self, listing, *suffixes):
        """Return full paths of the directory entries ending in *suffixes*."""
        return [os.path.join(self.directory_path, name) for name in listing
                if name.endswith(suffixes)]

    def prepare_df(self):
        """Populate ``self.df`` from all supported files in the directory.

        Files that fail to parse are reported and skipped; the remaining
        frames are concatenated and the index reset (the old per-file index
        is kept as an ``index`` column, matching previous behavior).
        No-op if ``self.df`` is already set or the path is not a directory.
        """
        if self.df is not None:
            return

        # Bug fix: the original printed only when verbose was False.
        if self.verbose:
            print("Processing PDF file...")
            print("Extracting paragraphs...")

        # Bug fix: originally a non-directory path left self.df as None and
        # then crashed on self.df.reset_index(); bail out early instead.
        if not os.path.isdir(self.directory_path):
            return

        # List the directory once instead of once per file type.
        listing = os.listdir(self.directory_path)
        frames = []

        for pdf_file in self._matching_paths(listing, ".pdf"):
            try:
                extractor = process_pdf_page if self.extraction_type == "page" else process_pdf
                frames.append(extractor(pdf_file))
            except Exception:  # narrow from bare except: don't swallow KeyboardInterrupt
                print("Error in file: ", pdf_file)

        for doc_file in self._matching_paths(listing, ".doc", ".docx"):
            try:
                with open(doc_file, "rb") as f:
                    frames.append(extract_paragraphs(BytesIO(f.read())))
            except Exception:
                print("Error in file: ", doc_file)

        for pptx_file in self._matching_paths(listing, ".pptx"):
            try:
                with open(pptx_file, "rb") as f:
                    frames.append(process_pptx(BytesIO(f.read())))
            except Exception:
                print("Error in file: ", pptx_file)

        # DataFrame.append was removed in pandas 2.0; concat preserves the
        # per-file indices exactly as the old append-in-a-loop did.
        self.df = pd.concat(frames) if frames else pd.DataFrame()
        self.df = self.df.reset_index()


# Backward-compatible, correctly spelled alias for new code.
HybridFileExtractor = HybridFileExtractpr

0 comments on commit 6182918

Please sign in to comment.