Skip to content

Commit

Permalink
hybrid file extractor
Browse files Browse the repository at this point in the history
  • Loading branch information
eren23 committed Apr 1, 2023
1 parent 979ba77 commit 6182918
Show file tree
Hide file tree
Showing 2 changed files with 318 additions and 0 deletions.
246 changes: 246 additions & 0 deletions examples/hybrid_extractor_example.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import openai\n",
"from example_config import SECRET_KEY\n",
"openai.api_key = SECRET_KEY"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from knowledgegpt.extractors.hybrid_extractor import HybridFileExtractpr"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"query = \"How to cook a golden lentil soup? Explain the whole process from the ingredients to serve.\"\n",
"file_path = \"your_path_to_the_folder_containing_the_files\"\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing PDF file...\n",
"Extracting paragraphs...\n",
"Error in file: /home/eren/Documents/Codes/GPTScripts/transfer_version/knowledge-gpt/examples/Beneficial_Perturbations_Network_for_Defending_Adversarial_Examples.pdf\n",
"Computing embeddings...\n",
"model_lang en\n",
"Selected 5 document sections:\n",
"179\n",
"107\n",
"168\n",
"177\n",
"170\n",
"Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say \"I don't know.\"\n",
"\n",
"Context:\n",
"\n",
"* Golden\tLentil\tSoup Ingredients \t 1\tcup\tred\tlentil 6\tcups\twater 4\tchicken\tstock\tcubes 4\tcarrots,\tChopped 1\tonion,\tChopped 2\tgarlic\tcloves,\tMinced 2\tteaspoons\tcurry\tpowder \t cayenne\t(\tTo\tTaste)\t(optional) \t Directions Heat\tall\tof\tthe\titems\tto\tboiling\tin\tpot. Then\tlower\tthe\ttemperature. Allow\tto\tsimmer\tfor\t½\thour\tor\ttill\tlentils\tbecome\tsoft. After\tthis,\tpuree\tthem\ttill\tbecome\tsmooth. \t\n",
"* Lentil\tand\tPea\tSoup Ingredients \t 1\ttablespoon\tvegetable\toil 1\t1/2\tcups\tchopped\tSpanish\tonions \t 1\t1/2\tteaspoons\tgarlic,\tminced \t 12\tcups\twater 1\tcup\tdried\tbrown\tlentils 1\tcup\tdried\tsplit\tpeas 4\tsmoked\tham\thocks\t(about\t3\t3/4\tlbs) \t 1\ttablespoon\tsalt 1\ttablespoon\tlemon\tjuice 1/2\tteaspoon\tdried\tthyme\tleaves \t 1/2\tteaspoon\tdried\tsage 1/2\tteaspoon\tdried\tmarjoram 1\t-2\tbay\tleaf 1\t1/2\tcups\tdiced\tcarrots 1\t1/4\tcups\tchopped\tcelery lemon\tslice \t Directions Fry\tthe\tgarlic\tand\tonion\tin\toil\tin\tdutch\toven\tover\tmoderate\ttemperature\tfor FIVE\tmin,\tmixing\ttill\ttender. Add\tthe\trest\tof\titems\tbesides\tlemon\tslices,\tcarrots\tand\tcelery. Heat\tto\tboiling. Lower\tthe\ttemperature\tand\tcover\tthe\tpot.\n",
"* Cream\tof\tFish\tSoup Ingredients \t White\tSauce 25\tg\tbutter 25\tg\tflour 550\tml\tfish\tstock salt white\tpepper 1\tgarlic\tclove,\tcrushed \t Soup 450\tg\tcooked\twhite\tfish\tfillets,\tskinned\tde\tboned,\tflaked \t 175\tg\tpeeled\tprawns\t( shrimp) \t salt white\tpepper 75\tml\tcream 50\tg\tbutter \t Directions Then\tblend\tin\tflour\tin\tmelted\tbutter\tand\tcook\tfor\t60\tseconds,\tafter\tthis,\tpour\tin stock\tand\tblend\tway\tfrom\theat.\n",
"* Easy\tBroccoli\tSoup Ingredients \t 1\t-2\ttablespoon\toil 1\tonion\t(\tdiced) 2\tgarlic\tcloves\t(\tminced\tor\tuse\t2teasp\tfrom\ta\tjar) \t 1\tlarge\thead\tbroccoli\t(\ttrimmed and\tchopped,\tabout\t500g) \t 1\tpotato\t(\tpeeled\t&\tdiced\tor\tleave\tskin\ton\tif\tyou prefer) \t 6\tcups\tchicken\tstock salt\t&\tfreshly\tground\tblack\tpepper \t 2\t-3\tslices\tbacon\t(\tdiced) \t Directions Fry\tthe\tgarlic\tand\tonion\tin\toil\tin\tpan\tover\tmoderate\ttemperature\tfor\tTHREE\tmin or\ttill\ttender. Take\tstock,\tbroccoli,\tpotato\tand\tadd\tthem\tand\theat\tto\tboiling. Lower\tthe\ttemperature\tand\tallow\tto\tsimmer\tfor\t1/3\thour\tor\ttill\tveggies\tare cooked. Blend\tin\tfood\tprocessor\tfor\tmaking\tsmooth\tmixture. Take\tsoup\tand\tadd\tit\tback\tto\tpan\tand\theat\tslightly. Then\tadd\tthe\tchopped\tbacon\tand\tallow\tto\tsimmer\ttill\tbacon\tis\tcooked,\n",
"* Avocado\tBanana\tChilled\tSoup Ingredients \t 2\tripe\tHass\tavocadoes,\tpeeled,\tpitted\tand\tdiced \t 2\tlarge\tripe\tbananas,\tpeeled\tand sliced \t 1\tcup\tmilk 3/4-1\tcup\tsugar 4\ttablespoons\tlemon\tjuice \t 1/2\tteaspoon\tcinnamon 1/8\tteaspoon\tnutmeg 1\tquart\tplain\tyogurt salt,\tif\tneeded,\tto\ttaste \t Directions First\tof\tall,\tpuree\tthe\tbananas\tand\tavocados\tin\tblender. Take\tmilk,\tnutmeg,\tyogurt,\tcinnamon,\tlemon\tjuice,\tsugar\tand\tadd\tthem\tand\tmix till\tbecome\tsmooth. Then\tadd\tin\tsalt. Allow\tto\tchill\tin\trefrigerator\tfor\t180\tmin. \t\n",
"\n",
" Q: How to cook a golden lentil soup? Explain the whole process from the ingredients to serve.\n",
" A:\n",
"all_done!\n"
]
}
],
"source": [
"hybrid_extractor = HybridFileExtractpr( file_path, extraction_type=\"page\", embedding_extractor=\"hf\", model_lang=\"en\", )\n",
"answer, prompt, messages = hybrid_extractor.extract(query, max_tokens=1500)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>index</th>\n",
" <th>page_number</th>\n",
" <th>content</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>Improved Network Robustness\\nwith Adversary Cr...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>F1\\nF2^x2\\n^x1\\nx1\\nx2\\nFigure 1: Adversarial ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>Defenses against adversarial attacks Adversari...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>where riis the adversarial perturbation genera...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>4 Robust Learning with Adversary Critic\\nAs we...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>419</th>\n",
" <td>213</td>\n",
" <td>NaN</td>\n",
" <td>UP-DETR: Unsupervised Pre-training for Object ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>420</th>\n",
" <td>214</td>\n",
" <td>NaN</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>421</th>\n",
" <td>215</td>\n",
" <td>NaN</td>\n",
" <td>Paper:</td>\n",
" </tr>\n",
" <tr>\n",
" <th>422</th>\n",
" <td>216</td>\n",
" <td>NaN</td>\n",
" <td>Code: \\n\\n</td>\n",
" </tr>\n",
" <tr>\n",
" <th>423</th>\n",
" <td>217</td>\n",
" <td>NaN</td>\n",
" <td>Deformable DETR: Deformable Transformers for E...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>424 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" index page_number content\n",
"0 0 1 Improved Network Robustness\\nwith Adversary Cr...\n",
"1 1 2 F1\\nF2^x2\\n^x1\\nx1\\nx2\\nFigure 1: Adversarial ...\n",
"2 2 3 Defenses against adversarial attacks Adversari...\n",
"3 3 4 where riis the adversarial perturbation genera...\n",
"4 4 5 4 Robust Learning with Adversary Critic\\nAs we...\n",
".. ... ... ...\n",
"419 213 NaN UP-DETR: Unsupervised Pre-training for Object ...\n",
"420 214 NaN \n",
"421 215 NaN Paper: \n",
"422 216 NaN Code: \\n\\n\n",
"423 217 NaN Deformable DETR: Deformable Transformers for E...\n",
"\n",
"[424 rows x 3 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hybrid_extractor.df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"424"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(hybrid_extractor.embeddings)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "knowledgegpt-env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
72 changes: 72 additions & 0 deletions knowledgegpt/extractors/hybrid_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import os
from io import BytesIO

import pandas as pd

from knowledgegpt.extractors.base_extractor import BaseExtractor
from knowledgegpt.utils.utils_docs import extract_paragraphs
from knowledgegpt.utils.utils_pdf import process_pdf, process_pdf_page
from knowledgegpt.utils.utils_powerpoint import process_pptx


class HybridFileExtractpr(BaseExtractor):
    """Extract text from every PDF, Word and PowerPoint file in a directory.

    Builds a single DataFrame (``self.df``) with one row per extracted
    section; embedding computation and query answering are inherited from
    :class:`BaseExtractor`.

    NOTE(review): the class name carries a historical typo; the correctly
    spelled alias ``HybridFileExtractor`` is defined below for new callers.
    """

    def __init__(self, directory_path: str, extraction_type: str = "page", embedding_extractor: str = "hf",
                 model_lang: str = "en", is_turbo: bool = False, verbose: bool = False, index_path: str = None,
                 index_type: str = "basic"):
        """
        :param directory_path: Folder whose ``.pdf``/``.doc``/``.docx``/``.pptx``
            files are processed.
        :param extraction_type: ``"page"`` extracts PDFs page-by-page; any
            other value extracts paragraph-wise.
        :param embedding_extractor: Embedding backend name (passed to base).
        :param model_lang: Language code for the embedding model.
        :param is_turbo: Use the turbo chat model (passed to base).
        :param verbose: Print progress messages while extracting.
        :param index_path: Optional path to a prebuilt index (passed to base).
        :param index_type: Index flavor (passed to base).
        """
        super().__init__(embedding_extractor=embedding_extractor, model_lang=model_lang, is_turbo=is_turbo,
                         verbose=verbose, index_path=index_path, index_type=index_type)

        self.directory_path = directory_path
        self.extraction_type = extraction_type

    def _matching_paths(self, listing, *suffixes):
        """Return full paths of the directory entries ending in *suffixes*."""
        return [os.path.join(self.directory_path, name) for name in listing
                if name.endswith(suffixes)]

    def prepare_df(self):
        """Populate ``self.df`` from all supported files in the directory.

        Files that fail to parse are reported and skipped; the remaining
        frames are concatenated and the index reset (the old per-file index
        is kept as an ``index`` column, matching previous behavior).
        No-op if ``self.df`` is already set or the path is not a directory.
        """
        if self.df is not None:
            return

        # Bug fix: the original printed only when verbose was False.
        if self.verbose:
            print("Processing PDF file...")
            print("Extracting paragraphs...")

        # Bug fix: originally a non-directory path left self.df as None and
        # then crashed on self.df.reset_index(); bail out early instead.
        if not os.path.isdir(self.directory_path):
            return

        # List the directory once instead of once per file type.
        listing = os.listdir(self.directory_path)
        frames = []

        for pdf_file in self._matching_paths(listing, ".pdf"):
            try:
                extractor = process_pdf_page if self.extraction_type == "page" else process_pdf
                frames.append(extractor(pdf_file))
            except Exception:  # narrow from bare except: don't swallow KeyboardInterrupt
                print("Error in file: ", pdf_file)

        for doc_file in self._matching_paths(listing, ".doc", ".docx"):
            try:
                with open(doc_file, "rb") as f:
                    frames.append(extract_paragraphs(BytesIO(f.read())))
            except Exception:
                print("Error in file: ", doc_file)

        for pptx_file in self._matching_paths(listing, ".pptx"):
            try:
                with open(pptx_file, "rb") as f:
                    frames.append(process_pptx(BytesIO(f.read())))
            except Exception:
                print("Error in file: ", pptx_file)

        # DataFrame.append was removed in pandas 2.0; concat preserves the
        # per-file indices exactly as the old append-in-a-loop did.
        self.df = pd.concat(frames) if frames else pd.DataFrame()
        self.df = self.df.reset_index()


# Backward-compatible, correctly spelled alias for new code.
HybridFileExtractor = HybridFileExtractpr

0 comments on commit 6182918

Please sign in to comment.