basic arxiv pdf example
eren23 committed Apr 6, 2023
1 parent 492c390 commit 41471c4
Showing 1 changed file with 329 additions and 0 deletions: examples/arxiv_pdf_example.ipynb
@@ -0,0 +1,329 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import openai\n",
"from local_example_config import SECRET_KEY\n",
"openai.api_key = SECRET_KEY"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"\n",
"def download_arxiv_pdf(arxiv_id):\n",
" url = f\"https://arxiv.org/pdf/{arxiv_id}.pdf\"\n",
" \n",
" response = requests.get(url)\n",
" \n",
" if response.status_code == 200:\n",
" with open(f\"{arxiv_id}.pdf\", \"wb\") as f:\n",
" f.write(response.content)\n",
" print(f\"PDF downloaded successfully to {arxiv_id}.pdf\")\n",
" else:\n",
" print(f\"Failed to download PDF for {arxiv_id}: HTTP status code {response.status_code}\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"arxiv_id = \"2202.01197\"\n",
"pdf_file_path = f\"{arxiv_id}.pdf\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PDF downloaded successfully to 2202.01197.pdf\n"
]
}
],
"source": [
"download_arxiv_pdf(arxiv_id)"
]
},
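{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check, not part of the original example: confirm the PDF\n",
"# actually landed on disk before handing it to the extractor. getsize raises\n",
"# FileNotFoundError if the download failed entirely.\n",
"import os\n",
"\n",
"assert os.path.getsize(pdf_file_path) > 0, f\"{pdf_file_path} is empty\""
]
},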
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from knowledgegpt.extractors.pdf_extractor import PDFExtractor"
]
},
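{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Added for reference, not in the original example: help() should show the\n",
"# full constructor signature; the cells below use the extraction_type,\n",
"# embedding_extractor and model_lang arguments.\n",
"help(PDFExtractor)"
]
},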
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"query = \"What is the main contribution of this paper?\""
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing PDF file...\n",
"Extracting paragraphs...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"FloatObject (b'0.00-70') invalid; use 0.0 instead\n",
"FloatObject (b'0.00-70') invalid; use 0.0 instead\n",
"FloatObject (b'0.00-70') invalid; use 0.0 instead\n",
"FloatObject (b'0.00-70') invalid; use 0.0 instead\n",
"FloatObject (b'0.00-40') invalid; use 0.0 instead\n",
"FloatObject (b'0.00-40') invalid; use 0.0 instead\n",
"FloatObject (b'0.00-40') invalid; use 0.0 instead\n",
"FloatObject (b'0.00-40') invalid; use 0.0 instead\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Computing embeddings...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading (…)e9125/.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 63.6kB/s]\n",
"Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 51.1kB/s]\n",
"Downloading (…)7e55de9125/README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 1.51MB/s]\n",
"Downloading (…)55de9125/config.json: 100%|██████████| 612/612 [00:00<00:00, 141kB/s]\n",
"Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 28.7kB/s]\n",
"Downloading (…)125/data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 531kB/s]\n",
"Downloading pytorch_model.bin: 100%|██████████| 90.9M/90.9M [05:25<00:00, 279kB/s]\n",
"Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 6.48kB/s]\n",
"Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 14.7kB/s]\n",
"Downloading (…)e9125/tokenizer.json: 100%|██████████| 466k/466k [00:01<00:00, 312kB/s]\n",
"Downloading (…)okenizer_config.json: 100%|██████████| 350/350 [00:00<00:00, 46.4kB/s]\n",
"Downloading (…)9125/train_script.py: 100%|██████████| 13.2k/13.2k [00:00<00:00, 1.69MB/s]\n",
"Downloading (…)7e55de9125/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 329kB/s]\n",
"Downloading (…)5de9125/modules.json: 100%|██████████| 349/349 [00:00<00:00, 41.4kB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"model_lang en\n",
"Selected 14 document sections:\n",
"90\n",
"119\n",
"129\n",
"52\n",
"64\n",
"54\n",
"44\n",
"53\n",
"142\n",
"33\n",
"37\n",
"139\n",
"124\n",
"127\n",
"Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say \"I don't know.\"\n",
"\n",
"Context:\n",
"\n",
"* Code is available at https://github.com/deeplearning-wisc/vos . ETHICS STATEMENT Our project aims to improve the reliability and safety of modern machine learning models. Our study can lead to direct benefits and societal impacts, particularly for safety-critical applications such as autonomous driving. Our study does not involve any human subjects or violation of legal compliance. We do not anticipate any potentially harmful consequences to our work.\n",
"* IEEE Transactions Pattern Analysis and Machine Intelligence. , 40(6):1452–1464, 2018. doi: 10.1109/TPAMI.2017.2723009. 14\n",
"* See Table 9 for the effect of starting iterationZ. We useZ= 12;000for the PASCAL-VOC model, which is trained for a total of 18,000 iterations. Z mAP\"FPR95#AUROC\"AUPR\" 2000 48.5 60.01 78.55 87.62 4000 48.4 61.47 79.85 89.41 6000 48.5 59.62 79.97 89.74 8000 48.7 56.85 80.64 90.71 10000 48.6 49.55 83.22 92.49 12000 48.7 54.69 83.41 92.56 14000 49.0 55.39 81.37 93.00 16000 48.9 59.36 82.70 92.62 Table 9: Ablation study on the starting iteration Z. Model is trained for a total of 18,000 iterations. \n",
"* For a fair comparison, all the methods only use ID data without using auxiliary outlier dataset. Our proposed method, VOS, outperforms competitive baselines, including Maximum Softmax Probability (Hendrycks & Gimpel, 2017), ODIN (Liang et al., 2018), energy 1PASCAL-VOC consists of the following ID labels: Person, Car, Bicycle, Boat, Bus, Motorbike, Train, Airplane, Chair, Bottle, Dining Table, Potted Plant, TV , Sofa, Bird, Cat, Cow, Dog, Horse, Sheep. \n",
"* Particularly, we consider: (1) using the squared hinge loss for regularization as in Liu et al., (2) using constant weight w= [1;1;:::;1]>for energy score in Equa- tion 6, and (3) classifying the virtual outliers as an additional K+1class in the classification branch. The performance comparison is summarized in Table 3. Compared to the hinge loss, our proposed logistic loss reduces the FPR95 by 10.02% on BDD-100k.\n",
"* \n",
"* \n",
"* 2BDD-100k consists of ID labels: Pedestrian, Rider, Car, Truck, Bus, Train, Motorcycle, Bicycle, Traffic light, Traffic sign. 6\n",
"* As can be observed, the learned weight coefficient displays a consistent trend with the number of train- ing objects per class, which indicates the advantage of using learnable weights rather than constant weight vector with all 1s. 19\n",
"* However, directly estimating logp(x)can be computationally intractable as it requires sampling from the en- tire spaceX. We note that the log partition function E(x;\u0012) :=\u0000logPK k=1efk(x;\u0012)is proportional tologp(x)with some unknown factor, which can be seen from the following: p(yjx) =p(x;y) p(x)=efy(x;\u0012) PK k=1efk(x;\u0012); wherefy(x;\u0012)denotes the y-th element of logit output corresponding to the label y.\n",
"* Moreover, VOS produces probabilistic score for OOD detection, whereas Liu et al. (2020a) relies on non-probabilistic energy score. Object-level energy score. In case of object detection, we can replace the image-level energy with object-level energy score. For ID object (x;b), the energy is defined as: E(x;b;\u0012) =\u0000logKX k=1wk\u0001expfk((x;b);\u0012); (6) wherefk((x;b);\u0012) =W> clsh(x;b)is the logit output for class kin the classification branch.\n",
"* G V ISUALIZATION OF THE LEARNABLE WEIGHT COEFFICIENT wIN GENERALIZED ENERGY SCORE To observe whether the learnable weight coefficient wkin Equation 6 captures dataset-specific statis- tics during uncertainty regularization, we visualize wkw.r.t each in-distribution class and the number 18\n",
"* tmAP\"FPR95#AUROC\"AUPR\" 148.7 54.69 83.41 92.56 248.2 57.96 82.31 88.52 348.3 62.39 82.20 88.05 448.8 69.72 80.86 89.54 548.7 57.57 78.66 88.20 648.7 74.03 78.06 91.17 848.8 60.12 79.53 92.53 10 47.2 76.25 74.33 90.42 Table 6: Ablation study on the number of selected outliers t(per class). Effect of queue size jQkj.\n",
"* Published as a conference paper at ICLR 2022 jQkjmAP\"FPR95#AUROC\"AUPR\" 50 48.6 68.42 77.04 92.30 100 48.9 59.77 79.96 89.18 200 48.8 57.80 80.20 89.92 400 48.9 66.85 77.68 89.83 600 48.5 57.32 81.99 91.07 800 48.7 51.43 82.26 91.80 1000 48.7 54.69 83.41 92.56 Table 7: Ablation study on the ID queue size jQkj. \f\n",
"\n",
" Q: What is the main contribution of this paper?\n",
" A:\n",
"all_done!\n"
]
}
],
"source": [
"pdf_extractor = PDFExtractor( pdf_file_path, extraction_type=\"paragraph\", embedding_extractor=\"hf\", model_lang=\"en\", )\n",
"answer, prompt, messages = pdf_extractor.extract(query, max_tokens=1500)"
]
},
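{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# extract() also returns the prompt it sent to the model (printed above).\n",
"# Assuming it comes back as a plain string, a slice gives a quick look at\n",
"# which document sections were selected as context. Not in the original example.\n",
"print(prompt[:500])"
]
},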
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'The main contribution of this paper is to improve the reliability and safety of modern machine learning models by introducing a novel method, VOS, for out-of-distribution detection.'"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"answer"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing PDF file...\n",
"Extracting paragraphs...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"FloatObject (b'0.00-70') invalid; use 0.0 instead\n",
"FloatObject (b'0.00-70') invalid; use 0.0 instead\n",
"FloatObject (b'0.00-70') invalid; use 0.0 instead\n",
"FloatObject (b'0.00-70') invalid; use 0.0 instead\n",
"FloatObject (b'0.00-40') invalid; use 0.0 instead\n",
"FloatObject (b'0.00-40') invalid; use 0.0 instead\n",
"FloatObject (b'0.00-40') invalid; use 0.0 instead\n",
"FloatObject (b'0.00-40') invalid; use 0.0 instead\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Computing embeddings...\n",
"model_lang en\n",
"Selected 13 document sections:\n",
"3\n",
"12\n",
"141\n",
"146\n",
"86\n",
"75\n",
"62\n",
"14\n",
"136\n",
"13\n",
"132\n",
"67\n",
"37\n",
"Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say \"I don't know.\"\n",
"\n",
"Context:\n",
"\n",
"* VOS achieves competitive per- formance on both object detection and image classification models, reducing the FPR95 by up to 9.36% compared to the previous best method on object detectors. Code is available at https://github.com/deeplearning-wisc/vos . 1 I NTRODUCTION Modern deep neural networks have achieved unprecedented success in known contexts for which they are trained, yet they often struggle to handle the unknowns.\n",
"* During training, VOS simultaneously performs the ID task ( e.g., classification or object detection) as well as the OOD un- certainty regularization. During inference time, the uncertainty estimation branch produces a larger probabilistic score for ID data and vice versa, which enables effective OOD detection (Section 3.3). VOS offers several compelling advantages compared to existing solutions.\n",
"* Models FPR95#AUROC\"mAP\" PASCAL VOC VOS-final 47.53 88.70 48.9 VOS-earlier 50.24 88.24 48.6 BDD-100k VOS-final 44.27 86.87 31.3 VOS-earlier 49.66 86.08 30.6 Table 10: Performance comparison of employing VOS on different layers. COCO is the OOD data. of training objects of that class in Figure 9. We use the BDD-100k dataset (Yu et al., 2020) as the in- distribution dataset and the RegNetX-4.0GF (Radosavovic et al., 2020) as the backbone network.\n",
"* From Figure 10, the virtual outliers reside in the near-boundary region of the in-distribution feature cluster, which helps the model to learn a compact decision boundary between ID and OOD objects. I D ISCUSSION ON THE DETECTED ,REJECTED AND IGNORED OOD OBJECTS DURING INFERENCE The focus of VOS is to mitigate the undesirable cases when an OOD object is detected and classified as in-distribution with high confidence.\n",
"* Different from methods that require real outlier data, VOS adaptively synthesizes outliers during training by sampling virtual outliers from the low-likelihood region of the class-conditional distri- butions. The synthesized outliers meaningfully improve the decision boundary between the ID data and OOD data, resulting in superior OOD detection performance while preserving the performance of the ID task. VOS is effective and suitable for both object detection and classification tasks.\n",
"* Moreover, the confidence score of the false-positive objects of VOS is lower than that of the vanilla model (see the truck in the 3rd column). Additional visualizations are in Appendix D and H. 5 R ELATED WORK OOD detection for classification can be broadly categorized into post hoc and regularization-based approaches. In Bendale & Boult (2016), the OpenMax score is developed for OOD detection based 8\n",
"* The results are summarized in Table 2, where VOS outperforms alternative synthesis approaches both in the feature space ( |,\\) or the pixel space ( \u0005). Generating outliers in the pixel space ( \u0005) is either unstable (GAN) or harmful for the object detection performance (mixup). Introducing noise (\\), especially using Gaussian noise as outliers is promising.\n",
"* (2)VOS enables adaptive outlier synthesis, which can be flexibly and conveniently used for any ID data without manual data collection or cleaning. In contrast, previous methods us- ing outlier exposure (Hendrycks et al., 2019) require an auxiliary image dataset that is sufficiently diverse, which can be arguably prohibitive to obtain. Moreover, one needs to perform careful data cleaning to ensure the auxiliary outlier dataset does not overlap with ID data.\n",
"* Published as a conference paper at ICLR 2022 Figure 6: Additional visualization of detected objects on the OOD images (from OpenImages) by a vanilla Faster-RCNN ( top) and VOS (bottom ). The in-distribution is Pascal VOC dataset. Blue : Objects detected and classified as one of the ID classes. Green : OOD objects detected by VOS, which reduce false positives among detected objects. to perform contrastive learning.\n",
"* (1)VOS is ageneral learning framework that is effective for both object detection and image classification tasks, whereas previous methods were primarily driven by image classification. Image-level detection can be lim- iting as an image could be OOD in certain regions while being in-distribution elsewhere. Our work bridges a critical research gap since OOD detection for object detection is timely yet underexplored in literature.\n",
"* Blue : Objects detected and classified as one of the ID classes. Green : OOD objects detected by VOS, which reduce false positives among detected objects. \n",
"* Comparison with different regularization loss functions (on backbone of ResNet-50, COCO is the OOD data). an additional class increases the difficulty of object classification, which does not outperform either. This ablation demonstrates the superiority of the uncertainty loss employed by VOS. VOS is effective on alternative architecture. Lastly, we demonstrate that VOS is effective on alternative neural network architectures.\n",
"* Moreover, VOS produces probabilistic score for OOD detection, whereas Liu et al. (2020a) relies on non-probabilistic energy score. Object-level energy score. In case of object detection, we can replace the image-level energy with object-level energy score. For ID object (x;b), the energy is defined as: E(x;b;\u0012) =\u0000logKX k=1wk\u0001expfk((x;b);\u0012); (6) wherefk((x;b);\u0012) =W> clsh(x;b)is the logit output for class kin the classification branch.\n",
"\n",
" Q: How does the VOS work?\n",
" A:\n",
"all_done!\n"
]
}
],
"source": [
"pdf_extractor = PDFExtractor( pdf_file_path, extraction_type=\"paragraph\", embedding_extractor=\"hf\", model_lang=\"en\")\n",
"answer, prompt, messages = pdf_extractor.extract(\"How does the VOS work?\", max_tokens=1500)"
]
},
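{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The third return value, messages, carries the chat-style payload behind the\n",
"# answer; displaying it is an easy way to debug the full exchange. This\n",
"# inspection cell is an addition to the original example.\n",
"messages"
]
},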
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'VOS works by simultaneously performing the ID task (e.g., classification or object detection) and the OOD uncertainty regularization during training. During inference time, the uncertainty estimation branch produces a larger probabilistic score for ID data and vice versa, which enables effective OOD detection. VOS also offers advantages compared to existing solutions, such as adaptive outlier synthesis and object-level energy score.'"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"answer"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "knowledgegpt-env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "d0285ce201951a37668925b1b7de032ac1583adb61d048d8a5dd45351727e09e"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
