added the ragcode

dame-cell · Aug 9, 2024 · 7b280da · 7b280da
1 parent 08a6775
commit 7b280da
Show file tree

Hide file tree

Showing 2 changed files with 228 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@
 VisionRAG is an innovative implementation of MULTI-MODALITY-RAG, leveraging the novel approach introduced in [ColPali: Efficient Document Retrieval with Vision Language Models](https://arxiv.org/abs/2407.01449).
 
 <p align="center">
-  <img src="VisionRAG/images/colpali.jpeg" alt="ColPali Architecture" width="80%">
+  <img src="images/colpali.jpeg" alt="ColPali Architecture" width="80%">
 </p>
 
 ## 🔍 Overview

diff --git a/vision_rag/RAG_colpali.ipynb b/vision_rag/RAG_colpali.ipynb
@@ -22,7 +22,7 @@
     "## We first start with the offline Indexing \n",
     "\n",
     "<p align=\"center\">\n",
-    "  <img src=\"/teamspace/studios/this_studio/VisionRAG/images/pdf_retriever.png\" alt=\"Offline indexing\" width=\"40%\">\n",
+    "  <img src=\"images/pdf_retriever.png\" alt=\"Offline indexing\" width=\"40%\">\n",
     "</p>\n"
    ]
   },
@@ -33,10 +33,235 @@
     "Offline indexing with ColPali is much simpler and faster compared to standard retrieval methods\n",
     "\n",
     "<p align=\"center\">\n",
-    "  <img src=\"/teamspace/studios/this_studio/VisionRAG/images/indexing.png\" alt=\"Offline indexing\" width=\"50%\">\n",
+    "  <img src=\"images/indexing.png\" alt=\"Offline indexing\" width=\"50%\">\n",
     "</p>"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Steps to do: \n",
+    "\n",
+    "1) first we download or maybe we have our own pdf locally \n",
+    "2) we then save each page in that pdf as images and store them \n",
+    "3) we then pass each image to ColPali and store the resulting embeddings in a vector database (here we use FAISS, which is very simple) \n",
+    "4) we also pass the query to the colpali\n",
+    "5) get the embeddings of the images from the database and compare it with the query embeddings using MaxSim \n",
+    "6) we then get the image (or images) with the highest similarity to the query \n",
+    "7) we then pass the image and a question to any vision language model: closed source (GPT-4V, Gemini Flash) or open source (IDEFICS-2)\n",
+    "\n",
+    "### MaxSim Operation: \n",
+    "For each query token, it computes the maximum similarity score with any document token. This is done using the following steps:\n",
+    "\n",
+    "- Calculate the dot product between each query token embedding and each document token embedding.\n",
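+    "- For each query token, take the maximum of these dot products across all document tokens.\n",
+    "- Sum these per-token maxima over all query tokens to obtain the final query-document score.\n"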
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Clone ColPali and install its dependencies into the kernel's environment\n",
+    "!git clone https://github.com/illuin-tech/colpali.git\n",
+    "%cd colpali\n",
+    "%pip install -r requirements.txt\n",
+    "%pip install einops"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Using ColPali requires a Hugging Face access token, so log in to the Hub first\n",
+    "from huggingface_hub import notebook_login\n",
+    "notebook_login()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Local filename of the PDF to index; set this before running the cells below\n",
+    "PDF_NAME = \"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Download the PDF to PDF_NAME, unless that file is already present locally\n",
+    "import os\n",
+    "import requests\n",
+    "\n",
+    "if not os.path.exists(PDF_NAME):\n",
+    "    print(\"File doesn't exist, downloading...\")\n",
+    "\n",
+    "    # The URL of the PDF you want to download\n",
+    "    url = \"provide-your-pdf-download-link\"\n",
+    "\n",
+    "    # Save under PDF_NAME so the existence check above finds the file on re-runs\n",
+    "    filename = PDF_NAME\n",
+    "\n",
+    "    # Send a GET request; the timeout keeps the cell from hanging on an unresponsive server\n",
+    "    response = requests.get(url, timeout=60)\n",
+    "\n",
+    "    # Only write the file when the request was successful\n",
+    "    if response.status_code == 200:\n",
+    "        # Open a file in binary write mode and save the response body to it\n",
+    "        with open(filename, \"wb\") as file:\n",
+    "            file.write(response.content)\n",
+    "        print(f\"The file has been downloaded and saved as {filename}\")\n",
+    "    else:\n",
+    "        print(f\"Failed to download the file. Status code: {response.status_code}\")\n",
+    "else:\n",
+    "    print(f\"File {PDF_NAME} exists.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save each page of the PDF as an image so the pages can be embedded individually\n",
+    "import os\n",
+    "from pdf2image import convert_from_path\n",
+    "\n",
+    "# Convert the same file the download step produced (or found locally)\n",
+    "pdf_path = PDF_NAME\n",
+    "\n",
+    "# Folder to save images\n",
+    "output_folder = 'images'\n",
+    "\n",
+    "# Create the folder if it doesn't exist; exist_ok makes the cell safe to re-run\n",
+    "os.makedirs(output_folder, exist_ok=True)\n",
+    "\n",
+    "# Convert PDF pages to images\n",
+    "pages = convert_from_path(pdf_path)\n",
+    "\n",
+    "# Save each page as page_<index>.jpg (zero-based index) in the output folder\n",
+    "for i, page in enumerate(pages):\n",
+    "    image_path = os.path.join(output_folder, f'page_{i}.jpg')\n",
+    "    page.save(image_path, 'JPEG')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "\n",
+    "# Report the visible CUDA devices, or warn when CUDA cannot be used\n",
+    "if not torch.cuda.is_available():\n",
+    "    print(\"CUDA is not available. Please check your GPU configuration.\")\n",
+    "else:\n",
+    "    print(\"CUDA is available. Here are the details of the GPU(s) present:\")\n",
+    "\n",
+    "    # Print one summary per device index\n",
+    "    for i in range(torch.cuda.device_count()):\n",
+    "        print(f\"\\nGPU {i}:\")\n",
+    "        print(f\"Name: {torch.cuda.get_device_name(i)}\")\n",
+    "        print(f\"Memory Allocated: {torch.cuda.memory_allocated(i) / 1024 ** 3:.2f} GB\")\n",
+    "        print(f\"Total Memory: {torch.cuda.get_device_properties(i).total_memory / 1024 ** 3:.2f} GB\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import torch\n",
+    "from torch.utils.data import Dataset, DataLoader\n",
+    "from tqdm import tqdm\n",
+    "from transformers import AutoProcessor\n",
+    "from PIL import Image\n",
+    "import numpy as np\n",
+    "try:\n",
+    "    from colpali_engine.models.paligemma_colbert_architecture import ColPali\n",
+    "    from colpali_engine.trainer.retrieval_evaluator import CustomEvaluator\n",
+    "    from colpali_engine.utils.colpali_processing_utils import process_images, process_queries\n",
+    "    from colpali_engine.interpretability.processor import ColPaliProcessor\n",
+    "except ImportError as e:\n",
+    "    # Stop here with an actionable message; continuing would only fail later\n",
+    "    # with a NameError on ColPali that hides the real cause.\n",
+    "    raise ImportError(\n",
+    "        f\"{e}. Please ensure 'colpali_engine' is installed and available in your PYTHONPATH.\"\n",
+    "    ) from e\n",
+    "\n",
+    "\n",
+    "# Load the base checkpoint in float16 on the GPU, then attach the ColPali adapter\n",
+    "model_name = \"vidore/colpali\"\n",
+    "model = ColPali.from_pretrained(\"google/paligemma-3b-mix-448\", torch_dtype=torch.float16, device_map=\"cuda\").eval()\n",
+    "model.load_adapter(model_name)\n",
+    "# The processor is loaded from the same repository as the adapter\n",
+    "processor = AutoProcessor.from_pretrained(model_name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class ImageDataset(Dataset):\n",
+    "    \"\"\"Dataset over the '.jpg' files found directly inside one directory.\n",
+    "\n",
+    "    Each item is a tuple ``(image, img_path)``: the image opened with PIL and\n",
+    "    converted to RGB, and the path it was read from.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, image_dir):\n",
+    "        \"\"\"Remember ``image_dir`` and collect the names of its '.jpg' files.\"\"\"\n",
+    "        self.image_dir = image_dir\n",
+    "        # os.listdir returns entries in arbitrary order; sort so that item\n",
+    "        # indices (and the order of the resulting embeddings) are stable across runs.\n",
+    "        self.image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.jpg'))\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        \"\"\"Return the number of '.jpg' files that were found.\"\"\"\n",
+    "        return len(self.image_files)\n",
+    "\n",
+    "    def __getitem__(self, idx):\n",
+    "        \"\"\"Return ``(image, img_path)`` for the file at position ``idx``.\"\"\"\n",
+    "        img_path = os.path.join(self.image_dir, self.image_files[idx])\n",
+    "        image = Image.open(img_path).convert('RGB')\n",
+    "        return image, img_path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def indexing_images(image_dir: str, k: int = 5) -> tuple:\n",
+    "    \"\"\"Embed every image that ImageDataset finds in ``image_dir`` with the loaded model.\n",
+    "\n",
+    "    Args:\n",
+    "        image_dir: Directory holding the page images.\n",
+    "        k: Currently unused; kept so existing calls keep working.\n",
+    "\n",
+    "    Returns:\n",
+    "        A tuple ``(ds, img_paths)``: the list of embedding tensors (moved to\n",
+    "        the CPU) and the list of image paths, in matching order.\n",
+    "    \"\"\"\n",
+    "    image_dataset = ImageDataset(image_dir)\n",
+    "    dataloader = DataLoader(\n",
+    "        image_dataset,\n",
+    "        batch_size=4,\n",
+    "        shuffle=False,\n",
+    "        # Each dataset item is (image, path): preprocess the images, pass the paths through\n",
+    "        collate_fn=lambda x: (process_images(processor, [item[0] for item in x]), [item[1] for item in x])\n",
+    "    )\n",
+    "\n",
+    "    ds = []\n",
+    "    img_paths = []\n",
+    "    for batch_images, batch_img_paths in tqdm(dataloader):\n",
+    "        with torch.no_grad():\n",
+    "            # Move every input tensor to the model's device before the forward pass\n",
+    "            batch_images = {name: tensor.to(model.device) for name, tensor in batch_images.items()}\n",
+    "            embeddings_doc = model(**batch_images)\n",
+    "        # Split the batch output along its first dimension into separate CPU tensors\n",
+    "        ds.extend(list(torch.unbind(embeddings_doc.to(\"cpu\"))))\n",
+    "        img_paths.extend(batch_img_paths)\n",
+    "\n",
+    "    # Hand the results back; without this the function returned None despite its annotation\n",
+    "    return ds, img_paths\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": null,