
Commit

add web search, retrain routing mlops, preprocessing pdf
mrzaizai2k committed Jun 25, 2024
1 parent e164ec9 commit 9afcddf
Showing 9 changed files with 30,284 additions and 30,092 deletions.
3 changes: 3 additions & 0 deletions Makefile
@@ -26,4 +26,7 @@ test:
 	curl -X POST -H "Content-Type: application/json" -d '{"query": "who is karger"}' http://localhost:8083/query
 	curl -X POST http://localhost:8083/update
 
+rout_train:
+	python src/push_dataset.py
+	python src/train_gpt_routing.py
 # python3 src/test_api.py
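The new rout_train target chains the two retraining steps, so running make rout_train from the repo root first pushes the local Excel dataset to the Hugging Face Hub (src/push_dataset.py) and then retrains and publishes the routing model (src/train_gpt_routing.py).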
3 changes: 2 additions & 1 deletion config/gpt_routing_train_config.yaml
@@ -3,7 +3,8 @@ sentence_model: sentence-transformers/all-MiniLM-L6-v2
 test_size: 0.25
 batch_size: 4
 num_epochs: 4
-push_to_hub: False
+push_to_hub: True
+huggingface_out_dir: chibao24/model_routing_few_shot
 text_col_name: text
 label_col_name: label
 local_data_path: data/gpt_routing.xlsx
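With push_to_hub flipped to True, the new huggingface_out_dir key tells the training script which Hub repo to publish the retrained router to. A minimal sketch of consuming these keys, assuming plain PyYAML (the repo's own config_parser in src/utils may wrap this differently):

    import yaml

    with open("config/gpt_routing_train_config.yaml") as f:
        config = yaml.safe_load(f)

    if config["push_to_hub"]:
        print("Model will be pushed to:", config["huggingface_out_dir"])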
60,075 changes: 30,043 additions & 30,032 deletions notebook/data_visualize.ipynb

Large diffs are not rendered by default.

67 changes: 49 additions & 18 deletions notebook/math_equation_extract.ipynb

Large diffs are not rendered by default.

91 changes: 60 additions & 31 deletions notebook/train_model_routing.ipynb
@@ -53,7 +53,22 @@
 "cell_type": "code",
 "execution_count": 4,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"data": {
+"application/vnd.jupyter.widget-view+json": {
+"model_id": "b50bcf5379da424a9c7944d8404f22af",
+"version_major": 2,
+"version_minor": 0
+},
+"text/plain": [
+"Casting the dataset: 0%| | 0/42 [00:00<?, ? examples/s]"
+]
+},
+"metadata": {},
+"output_type": "display_data"
+}
+],
 "source": [
 "# Assuming the loaded dataset is named \"datasets\"\n",
 "datasets = load_dataset(dataset_name, split=\"train\", cache_dir=True)\n",
@@ -78,7 +93,7 @@
 "text/plain": [
 "Dataset({\n",
 " features: ['ID', 'text', 'Language', 'Class_text', 'label'],\n",
-" num_rows: 21\n",
+" num_rows: 31\n",
 "})"
 ]
 },
@@ -101,7 +116,7 @@
 "text/plain": [
 "Dataset({\n",
 " features: ['ID', 'text', 'Language', 'Class_text', 'label'],\n",
-" num_rows: 4\n",
+" num_rows: 5\n",
 "})"
 ]
 },
@@ -124,7 +139,7 @@
 "text/plain": [
 "Dataset({\n",
 " features: ['ID', 'text', 'Language', 'Class_text', 'label'],\n",
-" num_rows: 4\n",
+" num_rows: 6\n",
 "})"
 ]
 },
@@ -175,12 +190,12 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "7867fd2a8b1e45d6aad6f0e8ffd6a57a",
+"model_id": "8ffd54936a7249389cbade794d7643eb",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
-"Map: 0%| | 0/21 [00:00<?, ? examples/s]"
+"Map: 0%| | 0/31 [00:00<?, ? examples/s]"
 ]
 },
 "metadata": {},
@@ -191,10 +206,10 @@
 "output_type": "stream",
 "text": [
 "***** Running training *****\n",
-" Num unique pairs = 242\n",
+" Num unique pairs = 512\n",
 " Batch size = 4\n",
 " Num epochs = 4\n",
-" Total optimization steps = 244\n"
+" Total optimization steps = 512\n"
 ]
 },
 {
@@ -203,8 +218,8 @@
 "\n",
 " <div>\n",
 " \n",
-" <progress value='244' max='244' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
-" [244/244 00:10, Epoch 4/0]\n",
+" <progress value='512' max='512' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+" [512/512 00:45, Epoch 4/0]\n",
 " </div>\n",
 " <table border=\"1\" class=\"dataframe\">\n",
 " <thead>\n",
@@ -221,28 +236,28 @@
 " <td>1</td>\n",
 " <td>No log</td>\n",
 " <td>No log</td>\n",
-" <td>0.146300</td>\n",
+" <td>0.232800</td>\n",
 " <td>0.000017</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>2</td>\n",
 " <td>No log</td>\n",
 " <td>No log</td>\n",
-" <td>0.037400</td>\n",
+" <td>0.331500</td>\n",
 " <td>0.000011</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>3</td>\n",
 " <td>No log</td>\n",
 " <td>No log</td>\n",
-" <td>0.050700</td>\n",
+" <td>0.236400</td>\n",
 " <td>0.000006</td>\n",
 " </tr>\n",
 " <tr>\n",
 " <td>4</td>\n",
 " <td>No log</td>\n",
 " <td>No log</td>\n",
-" <td>0.044300</td>\n",
+" <td>0.333300</td>\n",
 " <td>0.000000</td>\n",
 " </tr>\n",
 " </tbody>\n",
@@ -258,12 +273,12 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "53b85ab659214c20b96d120d0679414a",
+"model_id": "65c5c471b2104aeb9ca03674f7cf99aa",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
-" 0%| | 0/3 [00:00<?, ?it/s]"
+" 0%| | 0/5 [00:00<?, ?it/s]"
 ]
 },
 "metadata": {},
@@ -272,12 +287,12 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "d4f792754dfc415cb7d5cb95762d0bde",
+"model_id": "6f48ce725d5d48e5b0afc44ec7400165",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
-" 0%| | 0/3 [00:00<?, ?it/s]"
+" 0%| | 0/5 [00:00<?, ?it/s]"
 ]
 },
 "metadata": {},
@@ -286,12 +301,12 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "df16f3db79d544fda937bf2b8c26ebd2",
+"model_id": "e3cda8e3e09d46c6b75022f10ede405e",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
-" 0%| | 0/3 [00:00<?, ?it/s]"
+" 0%| | 0/5 [00:00<?, ?it/s]"
 ]
 },
 "metadata": {},
@@ -300,12 +315,12 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "2b705cfb1f704a98ab3ac5a965f3cc0d",
+"model_id": "cdb9733a56274d3da03fb3a2f4101c33",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
-" 0%| | 0/3 [00:00<?, ?it/s]"
+" 0%| | 0/5 [00:00<?, ?it/s]"
 ]
 },
 "metadata": {},
@@ -315,7 +330,7 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"Loading best SentenceTransformer model from step 122.\n"
+"Loading best SentenceTransformer model from step 128.\n"
 ]
 }
 ],
@@ -375,12 +390,26 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "0a48868717d745f6aa4800e6eab9d9c2",
+"model_id": "d7a26df039244bc6aa6aac13ba4defd4",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
+"model.safetensors: 0%| | 0.00/539M [00:00<?, ?B/s]"
+]
+},
+"metadata": {},
+"output_type": "display_data"
+},
+{
+"data": {
+"application/vnd.jupyter.widget-view+json": {
+"model_id": "8a26c1eeef614cacbca080bc25638f8f",
+"version_major": 2,
+"version_minor": 0
+},
+"text/plain": [
-"model_head.pkl: 0%| | 0.00/3.94k [00:00<?, ?B/s]"
+"model_head.pkl: 0%| | 0.00/4.96k [00:00<?, ?B/s]"
 ]
 },
 "metadata": {},
@@ -389,12 +418,12 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "392c7ae04d7a48f8981511a4aec78df7",
+"model_id": "15abfb23d94a4d14ab21e6cd9e8ee6d2",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
-"model.safetensors: 0%| | 0.00/90.9M [00:00<?, ?B/s]"
+"Upload 3 LFS files: 0%| | 0/3 [00:00<?, ?it/s]"
 ]
 },
 "metadata": {},
@@ -403,12 +432,12 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "390f3478f86b4dcdbb35030757dc1264",
+"model_id": "8307e8f0389a48568cecd705905a8416",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
-"Upload 2 LFS files: 0%| | 0/2 [00:00<?, ?it/s]"
+"model.safetensors: 0%| | 0.00/1.58M [00:00<?, ?B/s]"
 ]
 },
 "metadata": {},
@@ -422,7 +451,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 12,
+"execution_count": 13,
 "metadata": {},
 "outputs": [
 {
@@ -438,7 +467,7 @@
 " \"The birthday of Kerger\", \n",
 " \"explain in detail the karger min cut and it's complexity\", \n",
 " \"giải thích chi tiết các nguyên lý của kiểm thử phần mềm\", \n",
-" \"Độ phức tạp của thuật toán karger min cut\"])\n",
+" \"Độ phức tạp của thuật toán karger min cut là gì và so sánh với các thuật toán cùng loại khác\"])\n",
 "print(\"Prediction:\", preds)"
 ]
 }
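The uploaded artifacts (a SentenceTransformer body in model.safetensors plus a model_head.pkl classifier head) match the layout SetFit publishes. A minimal sketch of consuming the pushed router, assuming chibao24/model_routing_few_shot is a SetFit-compatible checkpoint:

    from setfit import SetFitModel

    # Download the routing model published by the training run above.
    model = SetFitModel.from_pretrained("chibao24/model_routing_few_shot")

    # One routing label per query, as in the notebook's final cell.
    preds = model.predict([
        "The birthday of Kerger",
        "explain in detail the karger min cut and it's complexity",
    ])
    print("Prediction:", preds)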
12 changes: 8 additions & 4 deletions src/ingest.py
@@ -114,8 +114,12 @@ def read_file_data(self):
 try:
     if file.endswith(".pdf"):
         loader = PyMuPDFLoader(file)
-        ori_text, _ = remove_duplicate_documents(loader.load())
-        processed_docs = self.math_processor.recover_math(documents=ori_text)
+
+        ori_docs, _ = remove_duplicate_documents(loader.load())
+        ori_docs = remove_common_prefix_from_documents(ori_docs)
+        for doc in ori_docs:
+            doc.page_content = remove_repetitive_patterns(text = doc.page_content)
+        processed_docs = self.math_processor.recover_math(documents=ori_docs)
         combined_docs = combine_short_doc(processed_docs, threshold=100)
         documents.extend(combined_docs)
         file_path_list.append(file)
@@ -226,8 +230,8 @@ def load_vector_db(self):
 
 
 def main():
-    vector_db = VectorDatabase(data_index_path = 'data/data_index_3.csv',
-                               db_faiss_path = 'data/vectorstores/db_faiss_3/', )
+    vector_db = VectorDatabase(data_index_path = 'data/data_index_4.csv',
+                               db_faiss_path = 'data/vectorstores/db_faiss_4/', )
     vector_db.load_vector_db()
     documents, file_path_list = vector_db.create_vector_db()
     print('file_path_list', file_path_list)
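The new preprocessing pass cleans per-page boilerplate out of PDFs before math recovery. remove_common_prefix_from_documents and remove_repetitive_patterns are defined elsewhere in the repo; the hypothetical sketch below only illustrates the intent and is not the committed implementation:

    import os
    import re

    def remove_common_prefix_from_documents(docs):
        # Hypothetical: strip the longest prefix shared by every page,
        # e.g. a running header repeated by the PDF export.
        prefix = os.path.commonprefix([d.page_content for d in docs])
        if prefix:
            for d in docs:
                d.page_content = d.page_content[len(prefix):]
        return docs

    def remove_repetitive_patterns(text: str) -> str:
        # Hypothetical: collapse runs of a repeated character (dot leaders,
        # horizontal rules) that PDF extraction often leaves behind.
        return re.sub(r"(.)\1{4,}", r"\1", text)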
21 changes: 21 additions & 0 deletions src/push_dataset.py
@@ -0,0 +1,21 @@
+
+import sys
+sys.path.append("")
+
+import os
+from src.utils import config_parser
+import pandas as pd
+
+config = config_parser(data_config_path = 'config/gpt_routing_train_config.yaml')
+dataset_name = config['dataset_name']
+local_data_path = config['local_data_path']
+
+file_name = os.path.basename(local_data_path).split(".")[0]
+df = pd.read_excel(local_data_path)
+
+print(df.head(10))
+try:
+    df.to_csv(f"hf://datasets/{dataset_name}/train.csv", index=False)
+    print(f"Done pushing data to {dataset_name}")
+except Exception as e:
+    print(f"Error pushing data to {dataset_name}: {e}")
