{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting en-core-web-sm==3.5.0\n",
" Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)\n",
"\u001b[2K \u001b[91m━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.6/12.8 MB\u001b[0m \u001b[31m162.8 kB/s\u001b[0m eta \u001b[36m0:01:15\u001b[0m^C\n",
"\u001b[2K \u001b[91m━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.6/12.8 MB\u001b[0m \u001b[31m162.8 kB/s\u001b[0m eta \u001b[36m0:01:15\u001b[0m\n",
"\u001b[?25h\u001b[31mERROR: Operation cancelled by user\u001b[0m\u001b[31m\n",
"\u001b[0m\n",
"\u001b[31mAborted.\u001b[0m\n"
]
}
],
"source": [
"# !python3 -m spacy download en_core_web_sm"
]
},
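{
"cell_type": "markdown",
"metadata": {},
"source": [
"The spaCy model download above is left commented out: run it once if `en_core_web_sm` is not already installed. The captured output is from an earlier run in which the download was interrupted."
]
},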
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import openai\n", | ||
"from example_config import SECRET_KEY\n", | ||
"openai.api_key = SECRET_KEY" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"/Users/eren/opt/anaconda3/envs/knowledgegpt-env/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", | ||
" from .autonotebook import tqdm as notebook_tqdm\n", | ||
"/Users/eren/opt/anaconda3/envs/knowledgegpt-env/lib/python3.9/site-packages/pydub/utils.py:170: RuntimeWarning: Couldn't find ffmpeg or avconv - defaulting to ffmpeg, but may not work\n", | ||
" warn(\"Couldn't find ffmpeg or avconv - defaulting to ffmpeg, but may not work\", RuntimeWarning)\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from knowledgegpt.extractors.pdf_extractor import PDFExtractor" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"query = \"How does the VOS work, what makes it different from other methods?\"\n", | ||
"pdf_file_path = \"2202.01197.pdf\"\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Processing PDF file...\n", | ||
"Extracting paragraphs...\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"FloatObject (b'0.00-70') invalid; use 0.0 instead\n", | ||
"FloatObject (b'0.00-70') invalid; use 0.0 instead\n", | ||
"FloatObject (b'0.00-70') invalid; use 0.0 instead\n", | ||
"FloatObject (b'0.00-70') invalid; use 0.0 instead\n", | ||
"FloatObject (b'0.00-40') invalid; use 0.0 instead\n", | ||
"FloatObject (b'0.00-40') invalid; use 0.0 instead\n", | ||
"FloatObject (b'0.00-40') invalid; use 0.0 instead\n", | ||
"FloatObject (b'0.00-40') invalid; use 0.0 instead\n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Computing embeddings...\n", | ||
"model_lang en\n", | ||
"Selected 12 document sections:\n", | ||
"3\n", | ||
"12\n", | ||
"141\n", | ||
"146\n", | ||
"86\n", | ||
"14\n", | ||
"62\n", | ||
"129\n", | ||
"148\n", | ||
"67\n", | ||
"75\n", | ||
"37\n", | ||
"Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say \"I don't know.\"\n", | ||
"\n", | ||
"Context:\n", | ||
"\n", | ||
"* VOS achieves competitive per- formance on both object detection and image classification models, reducing the FPR95 by up to 9.36% compared to the previous best method on object detectors. Code is available at https://github.com/deeplearning-wisc/vos . 1 I NTRODUCTION Modern deep neural networks have achieved unprecedented success in known contexts for which they are trained, yet they often struggle to handle the unknowns.\n", | ||
"* During training, VOS simultaneously performs the ID task ( e.g., classification or object detection) as well as the OOD un- certainty regularization. During inference time, the uncertainty estimation branch produces a larger probabilistic score for ID data and vice versa, which enables effective OOD detection (Section 3.3). VOS offers several compelling advantages compared to existing solutions.\n", | ||
"* Models FPR95#AUROC\"mAP\" PASCAL VOC VOS-final 47.53 88.70 48.9 VOS-earlier 50.24 88.24 48.6 BDD-100k VOS-final 44.27 86.87 31.3 VOS-earlier 49.66 86.08 30.6 Table 10: Performance comparison of employing VOS on different layers. COCO is the OOD data. of training objects of that class in Figure 9. We use the BDD-100k dataset (Yu et al., 2020) as the in- distribution dataset and the RegNetX-4.0GF (Radosavovic et al., 2020) as the backbone network.\n", | ||
"* From Figure 10, the virtual outliers reside in the near-boundary region of the in-distribution feature cluster, which helps the model to learn a compact decision boundary between ID and OOD objects. I D ISCUSSION ON THE DETECTED ,REJECTED AND IGNORED OOD OBJECTS DURING INFERENCE The focus of VOS is to mitigate the undesirable cases when an OOD object is detected and classified as in-distribution with high confidence.\n", | ||
"* Different from methods that require real outlier data, VOS adaptively synthesizes outliers during training by sampling virtual outliers from the low-likelihood region of the class-conditional distri- butions. The synthesized outliers meaningfully improve the decision boundary between the ID data and OOD data, resulting in superior OOD detection performance while preserving the performance of the ID task. VOS is effective and suitable for both object detection and classification tasks.\n", | ||
"* (2)VOS enables adaptive outlier synthesis, which can be flexibly and conveniently used for any ID data without manual data collection or cleaning. In contrast, previous methods us- ing outlier exposure (Hendrycks et al., 2019) require an auxiliary image dataset that is sufficiently diverse, which can be arguably prohibitive to obtain. Moreover, one needs to perform careful data cleaning to ensure the auxiliary outlier dataset does not overlap with ID data.\n", | ||
"* The results are summarized in Table 2, where VOS outperforms alternative synthesis approaches both in the feature space ( |,\\) or the pixel space ( \u0005). Generating outliers in the pixel space ( \u0005) is either unstable (GAN) or harmful for the object detection performance (mixup). Introducing noise (\\), especially using Gaussian noise as outliers is promising.\n", | ||
"* See Table 9 for the effect of starting iterationZ. We useZ= 12;000for the PASCAL-VOC model, which is trained for a total of 18,000 iterations. Z mAP\"FPR95#AUROC\"AUPR\" 2000 48.5 60.01 78.55 87.62 4000 48.4 61.47 79.85 89.41 6000 48.5 59.62 79.97 89.74 8000 48.7 56.85 80.64 90.71 10000 48.6 49.55 83.22 92.49 12000 48.7 54.69 83.41 92.56 14000 49.0 55.39 81.37 93.00 16000 48.9 59.36 82.70 92.62 Table 9: Ablation study on the starting iteration Z. Model is trained for a total of 18,000 iterations. \n", | ||
"* Hence, we found it more meaningful to compare relatively with the vanilla Faster-RCNN under the same default thresholds. Using BDD100K as the in-distribution dataset and the ResNet as the backbone, VOS can improve the number of detected OOD boxes by 25% (compared to vanilla object detector). VOS also improves the number of rejected OOD samples by 63%. 21\n", | ||
"* Comparison with different regularization loss functions (on backbone of ResNet-50, COCO is the OOD data). an additional class increases the difficulty of object classification, which does not outperform either. This ablation demonstrates the superiority of the uncertainty loss employed by VOS. VOS is effective on alternative architecture. Lastly, we demonstrate that VOS is effective on alternative neural network architectures.\n", | ||
"* Moreover, the confidence score of the false-positive objects of VOS is lower than that of the vanilla model (see the truck in the 3rd column). Additional visualizations are in Appendix D and H. 5 R ELATED WORK OOD detection for classification can be broadly categorized into post hoc and regularization-based approaches. In Bendale & Boult (2016), the OpenMax score is developed for OOD detection based 8\n", | ||
"* Moreover, VOS produces probabilistic score for OOD detection, whereas Liu et al. (2020a) relies on non-probabilistic energy score. Object-level energy score. In case of object detection, we can replace the image-level energy with object-level energy score. For ID object (x;b), the energy is defined as: E(x;b;\u0012) =\u0000logKX k=1wk\u0001expfk((x;b);\u0012); (6) wherefk((x;b);\u0012) =W> clsh(x;b)is the logit output for class kin the classification branch.\n", | ||
"\n", | ||
" Q: How does the VOS work, what makes it different from other methods?\n", | ||
" A:\n", | ||
"all_done!\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"pdf_extractor = PDFExtractor( pdf_file_path, extraction_type=\"paragraph\", embedding_extractor=\"hf\", model_lang=\"en\", is_turbo=True, index_type=\"basic\")\n", | ||
"answer, prompt, messages = pdf_extractor.extract(query, max_tokens=1500)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'VOS simultaneously performs the ID task (e.g., classification or object detection) as well as the OOD uncertainty regularization during training. During inference time, the uncertainty estimation branch produces a larger probabilistic score for ID data and vice versa, which enables effective OOD detection. VOS adaptively synthesizes outliers during training by sampling virtual outliers from the low-likelihood region of the class-conditional distributions, which meaningfully improves the decision boundary between the ID data and OOD data, resulting in superior OOD detection performance while preserving the performance of the ID task. VOS is effective and suitable for both object detection and classification tasks. Additionally, VOS produces a probabilistic score for OOD detection, which is different from other methods that rely on non-probabilistic energy scores or require real outlier data.'" | ||
] | ||
}, | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"answer" | ||
] | ||
}, | ||
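{
"cell_type": "markdown",
"metadata": {},
"source": [
"The same query is run again below with a single change, `strict_context=True`, which presumably constrains the answer more tightly to the retrieved context passages."
]
},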
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing PDF file...\n",
"Extracting paragraphs...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"FloatObject (b'0.00-70') invalid; use 0.0 instead\n",
"FloatObject (b'0.00-70') invalid; use 0.0 instead\n",
"FloatObject (b'0.00-70') invalid; use 0.0 instead\n",
"FloatObject (b'0.00-70') invalid; use 0.0 instead\n",
"FloatObject (b'0.00-40') invalid; use 0.0 instead\n",
"FloatObject (b'0.00-40') invalid; use 0.0 instead\n",
"FloatObject (b'0.00-40') invalid; use 0.0 instead\n",
"FloatObject (b'0.00-40') invalid; use 0.0 instead\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Computing embeddings...\n",
"model_lang en\n",
"Selected 13 document sections:\n",
"12\n",
"86\n",
"129\n",
"132\n",
"61\n",
"32\n",
"55\n",
"8\n",
"7\n",
"134\n",
"2\n",
"91\n",
"24\n",
"Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say \"I don't know.\"\n",
"\n",
"Context:\n",
"\n",
"* During training, VOS simultaneously performs the ID task ( e.g., classification or object detection) as well as the OOD un- certainty regularization. During inference time, the uncertainty estimation branch produces a larger probabilistic score for ID data and vice versa, which enables effective OOD detection (Section 3.3). VOS offers several compelling advantages compared to existing solutions.\n",
"* Different from methods that require real outlier data, VOS adaptively synthesizes outliers during training by sampling virtual outliers from the low-likelihood region of the class-conditional distri- butions. The synthesized outliers meaningfully improve the decision boundary between the ID data and OOD data, resulting in superior OOD detection performance while preserving the performance of the ID task. VOS is effective and suitable for both object detection and classification tasks.\n",
"* See Table 9 for the effect of starting iterationZ. We useZ= 12;000for the PASCAL-VOC model, which is trained for a total of 18,000 iterations. Z mAP\"FPR95#AUROC\"AUPR\" 2000 48.5 60.01 78.55 87.62 4000 48.4 61.47 79.85 89.41 6000 48.5 59.62 79.97 89.74 8000 48.7 56.85 80.64 90.71 10000 48.6 49.55 83.22 92.49 12000 48.7 54.69 83.41 92.56 14000 49.0 55.39 81.37 93.00 16000 48.9 59.36 82.70 92.62 Table 9: Ablation study on the starting iteration Z. Model is trained for a total of 18,000 iterations. \n",
"* Blue : Objects detected and classified as one of the ID classes. Green : OOD objects detected by VOS, which reduce false positives among detected objects. \n",
"* We consider three variants: randomly sampling n negative proposals ( nis the number of positive proposals), sampling nnegative proposals with a larger probability, and using all the negative proposals. All methods are trained under the same setup, with PASCAL-VOC as in-distribution data and ResNet-50 as the backbone. The loss function is the same as Equation 7 for all variants, with the only difference being the synthesis method. \n",
"* The regularization loss should ideally optimize for the sepa- rability between the ID vs. OOD data under some function that captures the data density.\n",
"* Published as a conference paper at ICLR 2022 Method AUROC \"mAP\" Image synthesis\u0005GAN (Lee et al., 2018a) 83.67 48.5 \u0005Mixup (Zhang et al., 2018) (mixing ratio 0:4) 61.23 44.3 \u0005Mixup (Zhang et al., 2018) (mixing ratio 1) 63.99 46.9 Noise as outliers\\Additive Gaussian noise to ID features 68.02 48.7 \\Trainable noise added to the ID features 66.67 48.6 \\Gaussian noise 85.98 48.5 Negative proposals|All negative proposals 63.45 48.1 |Random negative proposals 66.03 48.5 |Proposals with large background prob (Joseph et al., 2021) 77.26 48.5 VOS (ours) 88.70 48.9 Table 2: Ablation on outlier synthesis approaches (on backbone of ResNet-50, COCO is the OOD data). \n",
"* In a nutshell, VOS consists of three components tackling challenges of outlier synthesis and ef- fective model regularization with synthesized outliers. To synthesize the outliers, we estimate the class-conditional distribution in the feature space , and sample outliers from the low-likelihood re- gion of ID classes (Section 3.1). Key to our method, we show that sampling in the feature space is more tractable than synthesizing images in the high-dimensional pixel space (Lee et al., 2018a). \n",
"* However, achieving this goal is non-trivial due to the lack of supervision signal of unknowns. This motivates the question: Can we synthesize virtual outliers for effective model regularization? In this paper, we propose a novel unknown-aware learning framework dubbed VOS (Virtual Outlier Synthesis), which optimizes the dual objectives of both ID task and OOD detection performance. \n",
"* For both ODIN and Mahalanobis distance Lee et al. (2018b), the noise magnitude is set to 0because the region-based object detector is not end-to-end differentiable given the existence of region cropping and ROIAlign. For GAN (Lee et al., 2018a), we follow the original paper and use a GAN to generate OOD images. The prediction of the OOD images/objects is regularized to be close to a uniform distribution, through a KL diver- gence loss with a weight of 0.1.\n",
"* Specifically, VOS sam- ples virtual outliers from the low-likelihood region of the class-conditional distri- bution estimated in the feature space. Alongside, we introduce a novel unknown- aware training objective, which contrastively shapes the uncertainty space be- tween the ID data and synthesized outlier data.\n",
"* Through our study and releasing our code, we hope to raise stronger research and societal awareness towards the problem of out-of-distribution detection in real-world settings. ACKNOWLEDGEMENT Research is supported by Wisconsin Alumni Research Foundation (WARF). We sincerely thank Ziyang (Jack) Cai for helping with inspect the OOD datasets, and members in Li’s lab for valuable discussions. 10\n",
"* While a straightforward idea is to train generative models such as GANs (Goodfellow et al., 2014; Lee et al., 2018a), synthesizing images in the high-dimensional pixel space can be difficult to optimize. Instead, our key idea is to synthesize virtual outliers in the feature space , which is more tractable given lower dimensionality.\n",
"\n",
" Q: How does the VOS work, what makes it different from other methods?\n",
" A:\n",
"all_done!\n"
]
}
],
"source": [
"pdf_extractor = PDFExtractor( pdf_file_path, extraction_type=\"paragraph\", embedding_extractor=\"hf\", model_lang=\"en\", is_turbo=True, index_type=\"basic\", strict_context=True)\n", | ||
"answer, prompt, messages = pdf_extractor.extract(query, max_tokens=1500)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'VOS (Virtual Outlier Synthesis) performs the ID task and OOD uncertainty regularization simultaneously during training, and synthesizes outliers adaptively by sampling virtual outliers from the low-likelihood region of the class-conditional distributions. This improves the decision boundary between ID data and OOD data, resulting in superior OOD detection performance while preserving the performance of the ID task. VOS is effective and suitable for both object detection and classification tasks. Sampling in the feature space is more tractable than synthesizing images in the high-dimensional pixel space, which is a key difference from other methods. Additionally, VOS introduces a novel unknown-aware training objective, which contrastively shapes the uncertainty space between the ID data and synthesized outlier data.'" | ||
] | ||
}, | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"answer" | ||
] | ||
} | ||
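{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a final sanity check, the other two values returned by `extract` can be inspected directly. This is a minimal sketch assuming, as the cell outputs above suggest, that `prompt` is the constructed prompt string and `messages` is the list of chat messages."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect what was actually sent to the model (assumes prompt is a string\n",
"# and messages is a list, as the printed output above suggests).\n",
"print(prompt[:500])  # first 500 characters of the retrieval-augmented prompt\n",
"print(f\"{len(messages)} chat messages accumulated\")"
]
}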
],
"metadata": {
"kernelspec": {
"display_name": "knowledgegpt-env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
} |