
Commit

add web search, retrain routing mlops, preprocessing pdf
mrzaizai2k committed Jun 25, 2024
1 parent e164ec9 commit 9afcddf
Showing 9 changed files with 30,284 additions and 30,092 deletions.
3 changes: 3 additions & 0 deletions Makefile
@@ -26,4 +26,7 @@ test:
 	curl -X POST -H "Content-Type: application/json" -d '{"query": "who is karger"}' http://localhost:8083/query
 	curl -X POST http://localhost:8083/update
 
+rout_train:
+	python src/push_dataset.py
+	python src/train_gpt_routing.py
 # python3 src/test_api.py
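The new rout_train target chains the two retraining steps, so running make rout_train from the repo root first pushes the local Excel dataset to the Hugging Face Hub (src/push_dataset.py) and then retrains and publishes the routing model (src/train_gpt_routing.py).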
3 changes: 2 additions & 1 deletion config/gpt_routing_train_config.yaml
@@ -3,7 +3,8 @@ sentence_model: sentence-transformers/all-MiniLM-L6-v2
 test_size: 0.25
 batch_size: 4
 num_epochs: 4
-push_to_hub: False
+push_to_hub: True
+huggingface_out_dir: chibao24/model_routing_few_shot
 text_col_name: text
 label_col_name: label
 local_data_path: data/gpt_routing.xlsx
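With push_to_hub flipped to True, the new huggingface_out_dir key tells the training script which Hub repo to publish the retrained router to. A minimal sketch of consuming these keys, assuming plain PyYAML (the repo's own config_parser in src/utils may wrap this differently):

    import yaml

    with open("config/gpt_routing_train_config.yaml") as f:
        config = yaml.safe_load(f)

    if config["push_to_hub"]:
        print("Model will be pushed to:", config["huggingface_out_dir"])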
60,075 changes: 30,043 additions & 30,032 deletions notebook/data_visualize.ipynb

Large diffs are not rendered by default.

67 changes: 49 additions & 18 deletions notebook/math_equation_extract.ipynb

Large diffs are not rendered by default.

91 changes: 60 additions & 31 deletions notebook/train_model_routing.ipynb
@@ -53,7 +53,22 @@
 "cell_type": "code",
 "execution_count": 4,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"data": {
+"application/vnd.jupyter.widget-view+json": {
+"model_id": "b50bcf5379da424a9c7944d8404f22af",
+"version_major": 2,
+"version_minor": 0
+},
+"text/plain": [
+"Casting the dataset: 0%| | 0/42 [00:00<?, ? examples/s]"
+]
+},
+"metadata": {},
+"output_type": "display_data"
+}
+],
 "source": [
 "# Assuming the loaded dataset is named \"datasets\"\n",
 "datasets = load_dataset(dataset_name, split=\"train\", cache_dir=True)\n",
@@ -78,7 +93,7 @@
 "text/plain": [
 "Dataset({\n",
 " features: ['ID', 'text', 'Language', 'Class_text', 'label'],\n",
-" num_rows: 21\n",
+" num_rows: 31\n",
 "})"
 ]
 },
@@ -101,7 +116,7 @@
 "text/plain": [
 "Dataset({\n",
 " features: ['ID', 'text', 'Language', 'Class_text', 'label'],\n",
-" num_rows: 4\n",
+" num_rows: 5\n",
 "})"
 ]
 },
@@ -124,7 +139,7 @@
 "text/plain": [
 "Dataset({\n",
 " features: ['ID', 'text', 'Language', 'Class_text', 'label'],\n",
-" num_rows: 4\n",
+" num_rows: 6\n",
 "})"
 ]
 },
@@ -175,12 +190,12 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "7867fd2a8b1e45d6aad6f0e8ffd6a57a",
+"model_id": "8ffd54936a7249389cbade794d7643eb",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
-"Map: 0%| | 0/21 [00:00<?, ? examples/s]"
+"Map: 0%| | 0/31 [00:00<?, ? examples/s]"
 ]
 },
 "metadata": {},
@@ -191,10 +206,10 @@
 "output_type": "stream",
 "text": [
 "***** Running training *****\n",
-" Num unique pairs = 242\n",
+" Num unique pairs = 512\n",
 " Batch size = 4\n",
 " Num epochs = 4\n",
-" Total optimization steps = 244\n"
+" Total optimization steps = 512\n"
 ]
 },
 {
@@ -203,8 +218,8 @@
 "\n",
 " <div>\n",
 " \n",
-" <progress value='244' max='244' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
-" [244/244 00:10, Epoch 4/0]\n",
+" <progress value='512' max='512' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+" [512/512 00:45, Epoch 4/0]\n",
 " </div>\n",
 " <table border=\"1\" class=\"dataframe\">\n",
 " <thead>\n",
@@ -221,28 +236,28 @@
 " <td>1</td>\n",
 " <td>No log</td>\n",
 " <td>No log</td>\n",
-" <td>0.146300</td>\n",
+" <td>0.232800</td>\n",
 " <td>0.000017</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>2</td>\n",
 " <td>No log</td>\n",
 " <td>No log</td>\n",
-" <td>0.037400</td>\n",
+" <td>0.331500</td>\n",
 " <td>0.000011</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>3</td>\n",
 " <td>No log</td>\n",
 " <td>No log</td>\n",
-" <td>0.050700</td>\n",
+" <td>0.236400</td>\n",
 " <td>0.000006</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>4</td>\n",
 " <td>No log</td>\n",
 " <td>No log</td>\n",
-" <td>0.044300</td>\n",
+" <td>0.333300</td>\n",
 " <td>0.000000</td>\n",
 " </tr>\n",
 " </tbody>\n",
@@ -258,12 +273,12 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "53b85ab659214c20b96d120d0679414a",
+"model_id": "65c5c471b2104aeb9ca03674f7cf99aa",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
-" 0%| | 0/3 [00:00<?, ?it/s]"
+" 0%| | 0/5 [00:00<?, ?it/s]"
 ]
 },
 "metadata": {},
@@ -272,12 +287,12 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "d4f792754dfc415cb7d5cb95762d0bde",
+"model_id": "6f48ce725d5d48e5b0afc44ec7400165",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
-" 0%| | 0/3 [00:00<?, ?it/s]"
+" 0%| | 0/5 [00:00<?, ?it/s]"
 ]
 },
 "metadata": {},
@@ -286,12 +301,12 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "df16f3db79d544fda937bf2b8c26ebd2",
+"model_id": "e3cda8e3e09d46c6b75022f10ede405e",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
-" 0%| | 0/3 [00:00<?, ?it/s]"
+" 0%| | 0/5 [00:00<?, ?it/s]"
 ]
 },
 "metadata": {},
@@ -300,12 +315,12 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "2b705cfb1f704a98ab3ac5a965f3cc0d",
+"model_id": "cdb9733a56274d3da03fb3a2f4101c33",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
-" 0%| | 0/3 [00:00<?, ?it/s]"
+" 0%| | 0/5 [00:00<?, ?it/s]"
 ]
 },
 "metadata": {},
@@ -315,7 +330,7 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"Loading best SentenceTransformer model from step 122.\n"
+"Loading best SentenceTransformer model from step 128.\n"
 ]
 }
 ],
@@ -375,12 +390,26 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "0a48868717d745f6aa4800e6eab9d9c2",
+"model_id": "d7a26df039244bc6aa6aac13ba4defd4",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
+"model.safetensors: 0%| | 0.00/539M [00:00<?, ?B/s]"
+]
+},
+"metadata": {},
+"output_type": "display_data"
+},
+{
+"data": {
+"application/vnd.jupyter.widget-view+json": {
+"model_id": "8a26c1eeef614cacbca080bc25638f8f",
+"version_major": 2,
+"version_minor": 0
+},
+"text/plain": [
-"model_head.pkl: 0%| | 0.00/3.94k [00:00<?, ?B/s]"
+"model_head.pkl: 0%| | 0.00/4.96k [00:00<?, ?B/s]"
 ]
 },
 "metadata": {},
@@ -389,12 +418,12 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "392c7ae04d7a48f8981511a4aec78df7",
+"model_id": "15abfb23d94a4d14ab21e6cd9e8ee6d2",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
-"model.safetensors: 0%| | 0.00/90.9M [00:00<?, ?B/s]"
+"Upload 3 LFS files: 0%| | 0/3 [00:00<?, ?it/s]"
 ]
 },
 "metadata": {},
@@ -403,12 +432,12 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "390f3478f86b4dcdbb35030757dc1264",
+"model_id": "8307e8f0389a48568cecd705905a8416",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
-"Upload 2 LFS files: 0%| | 0/2 [00:00<?, ?it/s]"
+"model.safetensors: 0%| | 0.00/1.58M [00:00<?, ?B/s]"
 ]
 },
 "metadata": {},
@@ -422,7 +451,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 12,
+"execution_count": 13,
 "metadata": {},
 "outputs": [
 {
@@ -438,7 +467,7 @@
 " \"The birthday of Kerger\", \n",
 " \"explain in detail the karger min cut and it's complexity\", \n",
 " \"giải thích chi tiết các nguyên lý của kiểm thử phần mềm\", \n",
-" \"Độ phức tạp của thuật toán karger min cut\"])\n",
+" \"Độ phức tạp của thuật toán karger min cut là gì và so sánh với các thuật toán cùng loại khác\"])\n",
 "print(\"Prediction:\", preds)"
 ]
 }
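The uploaded artifacts (a SentenceTransformer body in model.safetensors plus a model_head.pkl classifier head) match the layout SetFit publishes. A minimal sketch of consuming the pushed router, assuming chibao24/model_routing_few_shot is a SetFit-compatible checkpoint:

    from setfit import SetFitModel

    # Download the routing model published by the training run above.
    model = SetFitModel.from_pretrained("chibao24/model_routing_few_shot")

    # One routing label per query, as in the notebook's final cell.
    preds = model.predict([
        "The birthday of Kerger",
        "explain in detail the karger min cut and it's complexity",
    ])
    print("Prediction:", preds)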
12 changes: 8 additions & 4 deletions src/ingest.py
@@ -114,8 +114,12 @@ def read_file_data(self):
 try:
     if file.endswith(".pdf"):
         loader = PyMuPDFLoader(file)
-        ori_text, _ = remove_duplicate_documents(loader.load())
-        processed_docs = self.math_processor.recover_math(documents=ori_text)
+
+        ori_docs, _ = remove_duplicate_documents(loader.load())
+        ori_docs = remove_common_prefix_from_documents(ori_docs)
+        for doc in ori_docs:
+            doc.page_content = remove_repetitive_patterns(text = doc.page_content)
+        processed_docs = self.math_processor.recover_math(documents=ori_docs)
         combined_docs = combine_short_doc(processed_docs, threshold=100)
         documents.extend(combined_docs)
         file_path_list.append(file)
@@ -226,8 +230,8 @@ def load_vector_db(self):
 
 
 def main():
-    vector_db = VectorDatabase(data_index_path = 'data/data_index_3.csv',
-                               db_faiss_path = 'data/vectorstores/db_faiss_3/', )
+    vector_db = VectorDatabase(data_index_path = 'data/data_index_4.csv',
+                               db_faiss_path = 'data/vectorstores/db_faiss_4/', )
     vector_db.load_vector_db()
     documents, file_path_list = vector_db.create_vector_db()
     print('file_path_list', file_path_list)
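The new preprocessing pass cleans per-page boilerplate out of PDFs before math recovery. remove_common_prefix_from_documents and remove_repetitive_patterns are defined elsewhere in the repo; the hypothetical sketch below only illustrates the intent and is not the committed implementation:

    import os
    import re

    def remove_common_prefix_from_documents(docs):
        # Hypothetical: strip the longest prefix shared by every page,
        # e.g. a running header repeated by the PDF export.
        prefix = os.path.commonprefix([d.page_content for d in docs])
        if prefix:
            for d in docs:
                d.page_content = d.page_content[len(prefix):]
        return docs

    def remove_repetitive_patterns(text: str) -> str:
        # Hypothetical: collapse runs of a repeated character (dot leaders,
        # horizontal rules) that PDF extraction often leaves behind.
        return re.sub(r"(.)\1{4,}", r"\1", text)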
21 changes: 21 additions & 0 deletions src/push_dataset.py
@@ -0,0 +1,21 @@
+
+import sys
+sys.path.append("")
+
+import os
+from src.utils import config_parser
+import pandas as pd
+
+config = config_parser(data_config_path = 'config/gpt_routing_train_config.yaml')
+dataset_name = config['dataset_name']
+local_data_path = config['local_data_path']
+
+file_name = os.path.basename(local_data_path).split(".")[0]
+df = pd.read_excel(local_data_path)
+
+print(df.head(10))
+try:
+    df.to_csv(f"hf://datasets/{dataset_name}/train.csv", index=False)
+    print(f"Done pushing data to {dataset_name}")
+except Exception as e:
+    print(f"Error pushing data to {dataset_name}: {e}")
