-
Notifications
You must be signed in to change notification settings - Fork 53
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
318 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,246 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import openai\n", | ||
"from example_config import SECRET_KEY\n", | ||
"openai.api_key = SECRET_KEY" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from knowledgegpt.extractors.hybrid_extractor import HybridFileExtractpr" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"query = \"How to cook a golden lentil soup? Explain the whole process from the ingredients to serve.\"\n", | ||
"file_path = \"your_path_to_the_folder_containing_the_files\"\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Processing PDF file...\n", | ||
"Extracting paragraphs...\n", | ||
"Error in file: /home/eren/Documents/Codes/GPTScripts/transfer_version/knowledge-gpt/examples/Beneficial_Perturbations_Network_for_Defending_Adversarial_Examples.pdf\n", | ||
"Computing embeddings...\n", | ||
"model_lang en\n", | ||
"Selected 5 document sections:\n", | ||
"179\n", | ||
"107\n", | ||
"168\n", | ||
"177\n", | ||
"170\n", | ||
"Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say \"I don't know.\"\n", | ||
"\n", | ||
"Context:\n", | ||
"\n", | ||
"* Golden\tLentil\tSoup Ingredients \t 1\tcup\tred\tlentil 6\tcups\twater 4\tchicken\tstock\tcubes 4\tcarrots,\tChopped 1\tonion,\tChopped 2\tgarlic\tcloves,\tMinced 2\tteaspoons\tcurry\tpowder \t cayenne\t(\tTo\tTaste)\t(optional) \t Directions Heat\tall\tof\tthe\titems\tto\tboiling\tin\tpot. Then\tlower\tthe\ttemperature. Allow\tto\tsimmer\tfor\t½\thour\tor\ttill\tlentils\tbecome\tsoft. After\tthis,\tpuree\tthem\ttill\tbecome\tsmooth. \t\n", | ||
"* Lentil\tand\tPea\tSoup Ingredients \t 1\ttablespoon\tvegetable\toil 1\t1/2\tcups\tchopped\tSpanish\tonions \t 1\t1/2\tteaspoons\tgarlic,\tminced \t 12\tcups\twater 1\tcup\tdried\tbrown\tlentils 1\tcup\tdried\tsplit\tpeas 4\tsmoked\tham\thocks\t(about\t3\t3/4\tlbs) \t 1\ttablespoon\tsalt 1\ttablespoon\tlemon\tjuice 1/2\tteaspoon\tdried\tthyme\tleaves \t 1/2\tteaspoon\tdried\tsage 1/2\tteaspoon\tdried\tmarjoram 1\t-2\tbay\tleaf 1\t1/2\tcups\tdiced\tcarrots 1\t1/4\tcups\tchopped\tcelery lemon\tslice \t Directions Fry\tthe\tgarlic\tand\tonion\tin\toil\tin\tdutch\toven\tover\tmoderate\ttemperature\tfor FIVE\tmin,\tmixing\ttill\ttender. Add\tthe\trest\tof\titems\tbesides\tlemon\tslices,\tcarrots\tand\tcelery. Heat\tto\tboiling. Lower\tthe\ttemperature\tand\tcover\tthe\tpot.\n", | ||
"* Cream\tof\tFish\tSoup Ingredients \t White\tSauce 25\tg\tbutter 25\tg\tflour 550\tml\tfish\tstock salt white\tpepper 1\tgarlic\tclove,\tcrushed \t Soup 450\tg\tcooked\twhite\tfish\tfillets,\tskinned\tde\tboned,\tflaked \t 175\tg\tpeeled\tprawns\t( shrimp) \t salt white\tpepper 75\tml\tcream 50\tg\tbutter \t Directions Then\tblend\tin\tflour\tin\tmelted\tbutter\tand\tcook\tfor\t60\tseconds,\tafter\tthis,\tpour\tin stock\tand\tblend\tway\tfrom\theat.\n", | ||
"* Easy\tBroccoli\tSoup Ingredients \t 1\t-2\ttablespoon\toil 1\tonion\t(\tdiced) 2\tgarlic\tcloves\t(\tminced\tor\tuse\t2teasp\tfrom\ta\tjar) \t 1\tlarge\thead\tbroccoli\t(\ttrimmed and\tchopped,\tabout\t500g) \t 1\tpotato\t(\tpeeled\t&\tdiced\tor\tleave\tskin\ton\tif\tyou prefer) \t 6\tcups\tchicken\tstock salt\t&\tfreshly\tground\tblack\tpepper \t 2\t-3\tslices\tbacon\t(\tdiced) \t Directions Fry\tthe\tgarlic\tand\tonion\tin\toil\tin\tpan\tover\tmoderate\ttemperature\tfor\tTHREE\tmin or\ttill\ttender. Take\tstock,\tbroccoli,\tpotato\tand\tadd\tthem\tand\theat\tto\tboiling. Lower\tthe\ttemperature\tand\tallow\tto\tsimmer\tfor\t1/3\thour\tor\ttill\tveggies\tare cooked. Blend\tin\tfood\tprocessor\tfor\tmaking\tsmooth\tmixture. Take\tsoup\tand\tadd\tit\tback\tto\tpan\tand\theat\tslightly. Then\tadd\tthe\tchopped\tbacon\tand\tallow\tto\tsimmer\ttill\tbacon\tis\tcooked,\n", | ||
"* Avocado\tBanana\tChilled\tSoup Ingredients \t 2\tripe\tHass\tavocadoes,\tpeeled,\tpitted\tand\tdiced \t 2\tlarge\tripe\tbananas,\tpeeled\tand sliced \t 1\tcup\tmilk 3/4-1\tcup\tsugar 4\ttablespoons\tlemon\tjuice \t 1/2\tteaspoon\tcinnamon 1/8\tteaspoon\tnutmeg 1\tquart\tplain\tyogurt salt,\tif\tneeded,\tto\ttaste \t Directions First\tof\tall,\tpuree\tthe\tbananas\tand\tavocados\tin\tblender. Take\tmilk,\tnutmeg,\tyogurt,\tcinnamon,\tlemon\tjuice,\tsugar\tand\tadd\tthem\tand\tmix till\tbecome\tsmooth. Then\tadd\tin\tsalt. Allow\tto\tchill\tin\trefrigerator\tfor\t180\tmin. \t\n", | ||
"\n", | ||
" Q: How to cook a golden lentil soup? Explain the whole process from the ingredients to serve.\n", | ||
" A:\n", | ||
"all_done!\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"hybrid_extractor = HybridFileExtractpr( file_path, extraction_type=\"page\", embedding_extractor=\"hf\", model_lang=\"en\", )\n", | ||
"answer, prompt, messages = hybrid_extractor.extract(query, max_tokens=1500)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<div>\n", | ||
"<style scoped>\n", | ||
" .dataframe tbody tr th:only-of-type {\n", | ||
" vertical-align: middle;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe tbody tr th {\n", | ||
" vertical-align: top;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe thead th {\n", | ||
" text-align: right;\n", | ||
" }\n", | ||
"</style>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>index</th>\n", | ||
" <th>page_number</th>\n", | ||
" <th>content</th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <th>0</th>\n", | ||
" <td>0</td>\n", | ||
" <td>1</td>\n", | ||
" <td>Improved Network Robustness\\nwith Adversary Cr...</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>1</th>\n", | ||
" <td>1</td>\n", | ||
" <td>2</td>\n", | ||
" <td>F1\\nF2^x2\\n^x1\\nx1\\nx2\\nFigure 1: Adversarial ...</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>2</th>\n", | ||
" <td>2</td>\n", | ||
" <td>3</td>\n", | ||
" <td>Defenses against adversarial attacks Adversari...</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>3</th>\n", | ||
" <td>3</td>\n", | ||
" <td>4</td>\n", | ||
" <td>where riis the adversarial perturbation genera...</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>4</th>\n", | ||
" <td>4</td>\n", | ||
" <td>5</td>\n", | ||
" <td>4 Robust Learning with Adversary Critic\\nAs we...</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>...</th>\n", | ||
" <td>...</td>\n", | ||
" <td>...</td>\n", | ||
" <td>...</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>419</th>\n", | ||
" <td>213</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>UP-DETR: Unsupervised Pre-training for Object ...</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>420</th>\n", | ||
" <td>214</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td></td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>421</th>\n", | ||
" <td>215</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>Paper:</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>422</th>\n", | ||
" <td>216</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>Code: \\n\\n</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>423</th>\n", | ||
" <td>217</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>Deformable DETR: Deformable Transformers for E...</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"<p>424 rows × 3 columns</p>\n", | ||
"</div>" | ||
], | ||
"text/plain": [ | ||
" index page_number content\n", | ||
"0 0 1 Improved Network Robustness\\nwith Adversary Cr...\n", | ||
"1 1 2 F1\\nF2^x2\\n^x1\\nx1\\nx2\\nFigure 1: Adversarial ...\n", | ||
"2 2 3 Defenses against adversarial attacks Adversari...\n", | ||
"3 3 4 where riis the adversarial perturbation genera...\n", | ||
"4 4 5 4 Robust Learning with Adversary Critic\\nAs we...\n", | ||
".. ... ... ...\n", | ||
"419 213 NaN UP-DETR: Unsupervised Pre-training for Object ...\n", | ||
"420 214 NaN \n", | ||
"421 215 NaN Paper: \n", | ||
"422 216 NaN Code: \\n\\n\n", | ||
"423 217 NaN Deformable DETR: Deformable Transformers for E...\n", | ||
"\n", | ||
"[424 rows x 3 columns]" | ||
] | ||
}, | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"hybrid_extractor.df" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"424" | ||
] | ||
}, | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"len(hybrid_extractor.embeddings)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "knowledgegpt-env", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.16" | ||
}, | ||
"orig_nbformat": 4 | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
from knowledgegpt.extractors.base_extractor import BaseExtractor | ||
from knowledgegpt.utils.utils_pdf import process_pdf, process_pdf_page | ||
from knowledgegpt.utils.utils_powerpoint import process_pptx | ||
from knowledgegpt.utils.utils_docs import extract_paragraphs | ||
|
||
from io import BytesIO | ||
|
||
|
||
class HybridFileExtractpr(BaseExtractor):
    """Directory-level extractor: pulls text from every PDF, Word (.doc/.docx)
    and PowerPoint (.pptx) file in a directory into one DataFrame, so that a
    query can be answered against the combined embeddings (via BaseExtractor).

    NOTE(review): the class name carries a typo ("Extractpr"); it is kept
    unchanged because callers (e.g. the example notebook) import it by this
    exact name.
    """

    def __init__(self, directory_path: str, extraction_type: str = "page", embedding_extractor: str = "hf",
                 model_lang: str = "en", is_turbo: bool = False, verbose: bool = False, index_path: str = None, index_type: str = "basic"):
        """
        :param directory_path: folder whose PDF/Word/PowerPoint files are processed
        :param extraction_type: "page" extracts PDFs per page; any other value per paragraph
        :param embedding_extractor: embedding backend id, forwarded to BaseExtractor
        :param model_lang: language code for the embedding model
        :param is_turbo: forwarded to BaseExtractor
        :param verbose: when True, progress messages are printed
        :param index_path: optional prebuilt index path, forwarded to BaseExtractor
        :param index_type: index flavour, forwarded to BaseExtractor
        """
        super().__init__(embedding_extractor=embedding_extractor, model_lang=model_lang, is_turbo=is_turbo,
                         verbose=verbose, index_path=index_path, index_type=index_type)

        self.directory_path = directory_path
        self.extraction_type = extraction_type

    def prepare_df(self):
        """Populate ``self.df`` from all supported files in ``self.directory_path``.

        No-op when ``self.df`` is already set or the path is not a directory.
        Unreadable/corrupt files are skipped with a message (best-effort).
        """
        if self.df is not None:
            return

        # BUGFIX: the original printed when verbose was *False* (inverted check).
        if self.verbose:
            print("Processing PDF file...")
            print("Extracting paragraphs...")

        import os
        import pandas as pd

        # Guard: original would reach reset_index() with self.df still None
        # when given a non-directory path and crash.
        if not os.path.isdir(self.directory_path):
            return

        # List the directory once, sorted for a deterministic row order
        # (original called os.listdir three times, unsorted).
        entries = sorted(os.listdir(self.directory_path))

        def _matching(suffixes):
            # Full paths of directory entries ending with any given suffix.
            return [os.path.join(self.directory_path, name)
                    for name in entries if name.endswith(suffixes)]

        # Collect partial frames and concatenate once: DataFrame.append was
        # removed in pandas 2.0 and was quadratic when used in a loop.
        frames = []

        for pdf_file in _matching((".pdf",)):
            try:
                parser = process_pdf_page if self.extraction_type == "page" else process_pdf
                frames.append(parser(pdf_file))
            except Exception:
                # best-effort: skip bad files, keep processing the rest
                print("Error in file: ", pdf_file)
                continue

        for doc_file in _matching((".doc", ".docx")):
            try:
                with open(doc_file, "rb") as f:
                    frames.append(extract_paragraphs(BytesIO(f.read())))
            except Exception:
                print("Error in file: ", doc_file)
                continue

        for pptx_file in _matching((".pptx",)):
            try:
                with open(pptx_file, "rb") as f:
                    frames.append(process_pptx(BytesIO(f.read())))
            except Exception:
                print("Error in file: ", pptx_file)
                continue

        self.df = pd.concat(frames) if frames else pd.DataFrame()
        self.df = self.df.reset_index()