Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
apikey.txt
/data/*
!/data/ft_data/
/src
!/data/*.ipynb
/src
/exp_model
200 changes: 200 additions & 0 deletions data/eval.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>prompt</th>\n",
" <th>content</th>\n",
" <th>label</th>\n",
" <th>token</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>请根据以下新闻文本,预测三一重工股票的对数收益率属于以下哪一类别 (极度负面/负面/中性/正...</td>\n",
" <td>现在是2018-10-10 09:00:00+08:00\\n时间:2018-10-09 17...</td>\n",
" <td>极度负面</td>\n",
" <td>2291</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>请根据以下新闻文本,预测三一重工股票的对数收益率属于以下哪一类别 (极度负面/负面/中性/正...</td>\n",
" <td>现在是2018-10-24 09:00:00+08:00\\n时间:2018-10-09 17...</td>\n",
" <td>极度负面</td>\n",
" <td>3099</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>请根据以下新闻文本,预测三一重工股票的对数收益率属于以下哪一类别 (极度负面/负面/中性/正...</td>\n",
" <td>现在是2018-12-07 09:00:00+08:00\\n时间:2018-12-06 18...</td>\n",
" <td>负面</td>\n",
" <td>4949</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>请根据以下新闻文本,预测三一重工股票的对数收益率属于以下哪一类别 (极度负面/负面/中性/正...</td>\n",
" <td>现在是2018-12-24 09:00:00+08:00\\n时间:2018-12-06 18...</td>\n",
" <td>负面</td>\n",
" <td>6829</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>请根据以下新闻文本,预测三一重工股票的对数收益率属于以下哪一类别 (极度负面/负面/中性/正...</td>\n",
" <td>现在是2018-12-24 09:00:00+08:00\\n时间:2018-12-06 18...</td>\n",
" <td>负面</td>\n",
" <td>8136</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5082</th>\n",
" <td>请根据以下新闻文本,预测中国建筑股票的对数收益率属于以下哪一类别 (极度负面/负面/中性/正...</td>\n",
" <td>现在是2023-03-31 09:00:00+08:00\\n时间:2023-03-29 19...</td>\n",
" <td>正面</td>\n",
" <td>4210</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5083</th>\n",
" <td>请根据以下新闻文本,预测中国建筑股票的对数收益率属于以下哪一类别 (极度负面/负面/中性/正...</td>\n",
" <td>现在是2023-03-31 09:00:00+08:00\\n时间:2023-03-30 18...</td>\n",
" <td>正面</td>\n",
" <td>6582</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5084</th>\n",
" <td>请根据以下新闻文本,预测中国建筑股票的对数收益率属于以下哪一类别 (极度负面/负面/中性/正...</td>\n",
" <td>现在是2023-03-31 09:00:00+08:00\\n时间:2023-03-30 18...</td>\n",
" <td>正面</td>\n",
" <td>7217</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5085</th>\n",
" <td>请根据以下新闻文本,预测中国建筑股票的对数收益率属于以下哪一类别 (极度负面/负面/中性/正...</td>\n",
" <td>现在是2023-03-31 09:00:00+08:00\\n时间:2023-03-30 18...</td>\n",
" <td>正面</td>\n",
" <td>7687</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5086</th>\n",
" <td>请根据以下新闻文本,预测中国建筑股票的对数收益率属于以下哪一类别 (极度负面/负面/中性/正...</td>\n",
" <td>现在是2023-04-11 09:00:00+08:00\\n时间:2023-03-30 18...</td>\n",
" <td>正面</td>\n",
" <td>5918</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5087 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" prompt \\\n",
"0 请根据以下新闻文本,预测三一重工股票的对数收益率属于以下哪一类别 (极度负面/负面/中性/正... \n",
"1 请根据以下新闻文本,预测三一重工股票的对数收益率属于以下哪一类别 (极度负面/负面/中性/正... \n",
"2 请根据以下新闻文本,预测三一重工股票的对数收益率属于以下哪一类别 (极度负面/负面/中性/正... \n",
"3 请根据以下新闻文本,预测三一重工股票的对数收益率属于以下哪一类别 (极度负面/负面/中性/正... \n",
"4 请根据以下新闻文本,预测三一重工股票的对数收益率属于以下哪一类别 (极度负面/负面/中性/正... \n",
"... ... \n",
"5082 请根据以下新闻文本,预测中国建筑股票的对数收益率属于以下哪一类别 (极度负面/负面/中性/正... \n",
"5083 请根据以下新闻文本,预测中国建筑股票的对数收益率属于以下哪一类别 (极度负面/负面/中性/正... \n",
"5084 请根据以下新闻文本,预测中国建筑股票的对数收益率属于以下哪一类别 (极度负面/负面/中性/正... \n",
"5085 请根据以下新闻文本,预测中国建筑股票的对数收益率属于以下哪一类别 (极度负面/负面/中性/正... \n",
"5086 请根据以下新闻文本,预测中国建筑股票的对数收益率属于以下哪一类别 (极度负面/负面/中性/正... \n",
"\n",
" content label token \n",
"0 现在是2018-10-10 09:00:00+08:00\\n时间:2018-10-09 17... 极度负面 2291 \n",
"1 现在是2018-10-24 09:00:00+08:00\\n时间:2018-10-09 17... 极度负面 3099 \n",
"2 现在是2018-12-07 09:00:00+08:00\\n时间:2018-12-06 18... 负面 4949 \n",
"3 现在是2018-12-24 09:00:00+08:00\\n时间:2018-12-06 18... 负面 6829 \n",
"4 现在是2018-12-24 09:00:00+08:00\\n时间:2018-12-06 18... 负面 8136 \n",
"... ... ... ... \n",
"5082 现在是2023-03-31 09:00:00+08:00\\n时间:2023-03-29 19... 正面 4210 \n",
"5083 现在是2023-03-31 09:00:00+08:00\\n时间:2023-03-30 18... 正面 6582 \n",
"5084 现在是2023-03-31 09:00:00+08:00\\n时间:2023-03-30 18... 正面 7217 \n",
"5085 现在是2023-03-31 09:00:00+08:00\\n时间:2023-03-30 18... 正面 7687 \n",
"5086 现在是2023-04-11 09:00:00+08:00\\n时间:2023-03-30 18... 正面 5918 \n",
"\n",
"[5087 rows x 4 columns]"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"# Read ft_data_3.json into a DataFrame; the bare name on the last line renders it.\n",
"data = pd.read_json(path_or_buf='ft_data_3.json')\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"label\n",
"中性 1031\n",
"极度负面 1027\n",
"极度正面 1018\n",
"负面 1013\n",
"正面 998\n",
"Name: count, dtype: int64"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count how many rows carry each distinct value of the 'label' column.\n",
"data.loc[:, 'label'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
68 changes: 68 additions & 0 deletions data/gpt_translate.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from OpenAI_agent import OpenAIGPT\n",
"import json\n",
"from concurrent.futures import ThreadPoolExecutor\n",
"from tqdm import tqdm\n",
"\n",
"\n",
"def process_row(row):\n",
"    \"\"\"Translate one sample to English and attach its English label name.\n",
"\n",
"    Args:\n",
"        row: Mapping-like record (e.g. a pandas Series) with a 'content'\n",
"            text field and a 'label' field.\n",
"\n",
"    Returns:\n",
"        A dict with keys 'content' (original text), 'translated_content'\n",
"        and 'label' (English label name), or None when the row could not\n",
"        be processed; in that case the error is printed.\n",
"\n",
"    Uses the module-level `igpt` client, which generate_dataset() creates.\n",
"    \"\"\"\n",
"    # Integer ids are the original contract. The Chinese names are the label\n",
"    # strings shown for ft_data_3.json in eval.ipynb; presumably they are the\n",
"    # same five classes in the same order -- TODO confirm.\n",
"    label_mapping = {\n",
"        0: \"Extremely Negative\",\n",
"        1: \"Negative\",\n",
"        2: \"Neutral\",\n",
"        3: \"Positive\",\n",
"        4: \"Extremely Positive\",\n",
"        \"极度负面\": \"Extremely Negative\",\n",
"        \"负面\": \"Negative\",\n",
"        \"中性\": \"Neutral\",\n",
"        \"正面\": \"Positive\",\n",
"        \"极度正面\": \"Extremely Positive\",\n",
"    }\n",
"    try:\n",
"        # Resolve the label before calling the model so that a row with an\n",
"        # unknown label is rejected without issuing a translation request.\n",
"        label = label_mapping[row['label']]\n",
"        prompt = \"Translate the following text to English: \"\n",
"        content = igpt(prompt + row['content'])\n",
"        return {\n",
"            \"content\": row['content'],\n",
"            \"translated_content\": content,\n",
"            \"label\": label\n",
"        }\n",
"    except Exception as e:\n",
"        # Best effort: report the failure and let the caller drop this row.\n",
"        print(f\"Error processing row: {e}\")\n",
"        return None\n",
"\n",
"\n",
"def generate_dataset(df, max_samples=10, max_workers=10,\n",
"                     model_name='gpt-3.5-turbo', keys_path='../apikey.txt'):\n",
"    \"\"\"Translate the first `max_samples` rows of `df` concurrently.\n",
"\n",
"    Args:\n",
"        df: DataFrame whose rows are passed one by one to process_row().\n",
"        max_samples: Number of leading rows to process (via df.head).\n",
"        max_workers: Size of the thread pool used for the requests.\n",
"        model_name: Model name forwarded to OpenAIGPT.\n",
"        keys_path: Path of the API key file forwarded to OpenAIGPT.\n",
"\n",
"    Returns:\n",
"        List of the non-None results of process_row(), in row order.\n",
"    \"\"\"\n",
"    # process_row() reads the client from this module-level name, so it has\n",
"    # to be (re)bound here before any task is submitted.\n",
"    global igpt\n",
"    igpt = OpenAIGPT(model_name=model_name, keys_path=keys_path)\n",
"    df_subset = df.head(max_samples)\n",
"    with ThreadPoolExecutor(max_workers=max_workers) as executor:\n",
"        futures = [executor.submit(process_row, row) for _, row in df_subset.iterrows()]\n",
"        # Wait on the futures in submission order so the output keeps the\n",
"        # row order of `df` regardless of which request finishes first.\n",
"        results = [\n",
"            future.result()\n",
"            for future in tqdm(futures, total=len(futures), desc=\"Processing rows\")\n",
"        ]\n",
"\n",
"    # process_row() returns None for rows it could not handle; drop those.\n",
"    return [result for result in results if result is not None]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset = generate_dataset(data, max_samples=100)\n",
"output_file_path = 'translated.json'\n",
"with open(output_file_path, 'w', encoding='utf-8') as f:\n",
" json.dump(dataset, f, ensure_ascii=False, indent=2)"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading