forked from xinleihe/MGTBench
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_lm_all.py
108 lines (87 loc) · 4.59 KB
/
run_lm_all.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import os
# GPU selection is configured through environment variables. Both assignments
# sit ahead of `import torch`; presumably this is so they are already in place
# when CUDA is initialised -- keep this ordering unless verified otherwise.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" # see issue #152
# Restricts this process to the device listed as "2"; the index is hard-coded,
# so edit this line to run on a different GPU.
os.environ["CUDA_VISIBLE_DEVICES"]="2"
import torch
import argparse
from mgtbench import AutoDetector, AutoExperiment
from mgtbench.loading.dataloader import load
from mgtbench.utils import setup_seed
# Keyword arguments unpacked into every `experiment.launch(**config)` call.
config = dict(
    need_finetune=True,
    need_save=False,
    epochs=1,
)

# Local path passed to the detector factory as `model_name_or_path`.
distilbert = "/data1/models/distilbert-base-uncased"

# Subject categories iterated when --data is 'task2' or 'task3'.
category = [
    "Physics",
    "Medicine",
    "Biology",
    "Electrical_engineering",
    "Computer_science",
    "Literature",
    "History",
    "Education",
    "Art",
    "Law",
    "Management",
    "Philosophy",
    "Economy",
    "Math",
    "Statistics",
    "Chemistry",
]

# Categories and generator names iterated when --data is 'old'.
old_category = ["Essay", "WP", "Reuters"]
old_llms = [
    "ChatGLM",
    "Dolly",
    "ChatGPT-turbo",
    "GPT4All",
    "StableLM",
    "Claude",
]

# Every configuration is repeated once per seed.
seeds = [42, 3407, 114514]

# for new dataset (task 2 or 3)
# cut_sizes: values passed to `load(..., cut_length=...)`.
cut_sizes = [3000, 6000]
# data_sizes: upper bound used to slice the train/test text and label lists.
data_sizes = [3000, 6000]
def _log(output_path, *lines):
    """Print each item of *lines* to stdout, then append them to *output_path*.

    The file is opened in append mode, so re-running into the same folder adds
    to the existing result files instead of overwriting them.
    """
    for line in lines:
        print(line)
    with open(output_path, "a", encoding="utf-8") as file:
        for line in lines:
            print(line, file=file)


def _load_truncated(cat, llmname, cut_length, num_data, task):
    """Load one task2/task3 dataset and keep at most *num_data* items per list.

    The 'text' and 'label' lists of both the 'train' and 'test' splits are
    sliced to their first *num_data* entries.
    """
    data = load(cat, llmname, cut_length=cut_length, disable=True, task=task)
    for split in ('train', 'test'):
        for field in ('text', 'label'):
            data[split][field] = data[split][field][:num_data]
    return data


def _run_single(detector_name, experiment_name, output_path, header, load_data):
    """Run one detector/experiment pair and log its train and test results.

    Args:
        detector_name: name passed to ``AutoDetector.from_detector_name``.
        experiment_name: name passed to ``AutoExperiment.from_experiment_name``.
        output_path: result file; the header and both results are appended.
        header: banner line written before the run starts, so a run that
            crashes still leaves a trace in the result file.
        load_data: zero-argument callable returning the dataset. It is invoked
            only after the detector and experiment have been built, which keeps
            the construct-then-load order of the original script.
    """
    _log(output_path, header)
    metric = AutoDetector.from_detector_name(detector_name, model_name_or_path=distilbert)
    experiment = AutoExperiment.from_experiment_name(experiment_name, detector=[metric])
    experiment.load_data(load_data())
    res = experiment.launch(**config)
    _log(output_path, res[0].train, res[0].test)
    # Drop the model-holding objects before asking torch to release cached
    # GPU memory, so the next configuration starts from a clean slate.
    del experiment, metric
    torch.cuda.empty_cache()


def main(argv=None):
    """Parse the command line and run every configured experiment.

    Args:
        argv: optional list of argument strings; ``None`` makes argparse read
            ``sys.argv`` as usual.

    ``--data old`` sweeps seeds x old_category x old_llms. ``--data task2`` or
    ``--data task3`` sweeps seeds x category x cut_sizes x data_sizes for the
    'Moonshot' generator. Both modes use the detector and experiment chosen on
    the command line; each configuration writes to its own file in ``--folder``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--detector', type=str, required=True, choices=['LM-D', 'll'])
    parser.add_argument('--experiment', type=str, required=True, choices=['supervised', 'threshold'])
    # Restricting the choices turns a mistyped value into an argparse error
    # instead of a run that creates the folder and then silently does nothing.
    parser.add_argument('--data', type=str, default="old", choices=['old', 'task2', 'task3'])
    parser.add_argument('--folder', type=str, required=True)
    args = parser.parse_args(argv)

    which_data = args.data
    folder = args.folder
    detector_name = args.detector
    experiment_name = args.experiment

    os.makedirs(folder, exist_ok=True)

    if which_data == 'old':
        for seed in seeds:
            setup_seed(seed)
            torch.cuda.empty_cache()
            for cat in old_category:
                for llmname in old_llms:
                    # os.path.join keeps the result file inside the directory
                    # created above, including when --folder is absolute.
                    output_path = os.path.join(folder, f'{cat}_{llmname}_{seed}.txt')
                    header = f"------ Running {cat} with seed {seed}, llmname {llmname} ------"
                    # The lambda is called inside _run_single before the loop
                    # variables change, so late binding is not an issue here.
                    _run_single(
                        detector_name,
                        experiment_name,
                        output_path,
                        header,
                        lambda: load(cat, llmname, disable=True),
                    )
    else:  # 'task2' or 'task3' (guaranteed by the argparse choices)
        llmname = 'Moonshot'
        for seed in seeds:
            setup_seed(seed)
            torch.cuda.empty_cache()
            for cat in category:
                for cut_length in cut_sizes:
                    for num_data in data_sizes:
                        output_path = os.path.join(
                            folder, f'{cat}_{llmname}_{seed}_{cut_length}_{num_data}.txt'
                        )
                        header = f"------ Running {cat} and model {llmname} with seed {seed}, cut_length {cut_length}, data_size {num_data} ------"
                        # Uses the CLI-selected detector/experiment rather
                        # than a hard-coded 'LM-D'/'supervised' pair, so the
                        # required arguments are no longer ignored here.
                        _run_single(
                            detector_name,
                            experiment_name,
                            output_path,
                            header,
                            lambda: _load_truncated(cat, llmname, cut_length, num_data, which_data),
                        )


if __name__ == '__main__':
    main()