-
Notifications
You must be signed in to change notification settings - Fork 0
/
nadlan_ML_script.py
193 lines (175 loc) · 5.38 KB
/
nadlan_ML_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 14 13:04:58 2021
@author: ziskin
"""
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import sys
import warnings
from MA_paths import work_david
# 'ML' entry under the project work directory ``work_david`` (from MA_paths).
ml_path = work_david / 'ML'
# Only silence warnings when the interpreter was started without explicit
# warning options (no ``-W`` flag / ``PYTHONWARNINGS`` already in effect).
if not sys.warnoptions:
    warnings.simplefilter(action="ignore")
    # Exported through the environment so that subprocesses are affected too.
    os.environ.update(
        PYTHONWARNINGS='ignore::UserWarning,ignore::RuntimeWarning')
def check_path(path):
    """Validate that *path* exists and return it as a ``pathlib.Path``.

    Used as an ``argparse`` ``type=`` callable for the path-like command
    line options of this script.

    Parameters
    ----------
    path : str or os.PathLike
        Path to check; it is converted with ``str()`` before testing.

    Returns
    -------
    pathlib.Path
        The same path, wrapped in a ``Path`` object.

    Raises
    ------
    argparse.ArgumentTypeError
        If the path does not exist on the file system.
    """
    # Imported locally: at module level ``argparse`` is only imported inside
    # the ``__main__`` guard, so relying on the global name would raise a
    # NameError (instead of ArgumentTypeError) when this module is imported.
    import argparse
    import os
    from pathlib import Path
    path = str(path)
    if not os.path.exists(path):
        raise argparse.ArgumentTypeError(path + ' does not exist...')
    return Path(path)
def main_nadlan_ML(args):
    """Run ``cross_validation`` for one model on the deals of a single year.

    Missing (``None``) options on *args* are replaced by defaults, the
    combined deals table is loaded and restricted to apartment deals, the
    feature matrix and target are built, and everything is handed to
    ``nadlan_ML.cross_validation``.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command line options. Attributes read here (default used
        when the attribute is ``None``):

        - ``model``: model name passed to ``cross_validation`` (no default).
        - ``n_splits``: number of CV splits (5).
        - ``pgrid``: parameter grid name (``'dense'``).
        - ``verbose``: verbosity level (0).
        - ``n_jobs``: number of parallel jobs (-1).
        - ``regressors``: feature column names (``nadlan_ML.features1``);
          must include ``'year'``, which is used for filtering and then
          dropped from the features.
        - ``year``: year of the deals to keep (2000).
        - ``main_path``: where the deals data is loaded from
          (``MA_paths.work_david``).
        - ``savepath``: where results are saved (``work_david / 'ML'``).
        - ``rseed``: accepted but currently not used, see note below.

    Returns
    -------
    object
        Whatever ``cross_validation`` returns (previously discarded).
    """
    import logging
    from nadlan_ML import cross_validation
    from nadlan_ML import features1
    from nadlan_EDA import load_nadlan_combined_deal
    from nadlan_EDA import apts
    from MA_paths import work_david

    # Fetch the logger by name instead of relying on a module global that
    # only exists when the file is executed as a script; this is the same
    # logger object that the ``__main__`` section configures.
    logger = logging.getLogger('Nadlan_ML')
    ml_path = work_david / 'ML'

    def _or_default(value, default):
        # Return *default* only when the option was not supplied (None).
        return default if value is None else value

    n_splits = _or_default(args.n_splits, 5)
    pgrid = _or_default(args.pgrid, 'dense')
    verbose = _or_default(args.verbose, 0)
    n_jobs = _or_default(args.n_jobs, -1)
    regressors = _or_default(args.regressors, features1)
    year = _or_default(args.year, 2000)
    main_path = _or_default(args.main_path, work_david)
    savepath = _or_default(args.savepath, ml_path)
    model_name = args.model

    # NOTE(review): the seed was computed but never forwarded to
    # ``cross_validation``; its signature is not visible here, so the option
    # is reported as unused rather than passed on blindly.
    if args.rseed is not None:
        logger.warning(
            'rseed=%s was given but is not used by this script.', args.rseed)

    # load data:
    df = load_nadlan_combined_deal(path=main_path)
    df = df[df['DEALNATUREDESCRIPTION'].isin(apts)]
    # Features: requested columns without missing values; the target is
    # aligned to the surviving rows after each filtering step.
    X = df[list(regressors)].dropna()
    y = df['DEALAMOUNT'].loc[X.index]
    X = X[X['year'] == year].drop('year', axis=1)
    y = y.loc[X.index]

    logger.info(
        'Running {} model with {} nsplits, year= {}, regressors={}'.format(
            model_name, n_splits, year, regressors))
    model = cross_validation(
        X,
        y,
        model_name=model_name,
        n_splits=n_splits,
        verbose=verbose,
        pgrid=pgrid,
        savepath=savepath, n_jobs=n_jobs, year=year)
    print('')
    logger.info('Done!')
    return model
def configure_logger(name='general', filename=None):
    """Set up logging via ``logging.basicConfig`` and return a named logger.

    Messages of level INFO and above are sent to standard output and, when
    *filename* is given, also appended to that file.

    Parameters
    ----------
    name : str
        Name of the logger to return.
    filename : str or path-like, optional
        Log file opened in append mode; no file handler is created when
        this is ``None``.

    Returns
    -------
    logging.Logger
        The logger registered under *name*.
    """
    import logging
    import sys
    console = logging.StreamHandler(sys.stdout)
    if filename is None:
        sinks = [console]
    else:
        # The file handler is listed first, followed by the console handler.
        sinks = [logging.FileHandler(filename=filename, mode='a'), console]
    logging.basicConfig(
        level=logging.INFO,
        format='[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
        handlers=sinks,
    )
    return logging.getLogger(name=name)
if __name__ == '__main__':
    # Command line entry point: parse the options and run the model tuning.
    import argparse

    logger = configure_logger('Nadlan_ML')
    parser = argparse.ArgumentParser(
        description='a command line tool for running the ML models tuning for Nadlan deals.')
    # Temporarily remove argparse's default optional-arguments group so that
    # the "required arguments" group is listed before it in the help output;
    # the group is appended back once all arguments are registered.
    optional = parser._action_groups.pop()
    required = parser.add_argument_group('required arguments')
    optional.add_argument(
        '--savepath',
        help="a full path to save the gridsearchcv files",
        type=check_path)
    optional.add_argument(
        '--n_splits',
        help='how many splits for the CV',
        type=int)
    optional.add_argument(
        '--main_path',
        help='the path where the nadlan deals are (csv)',
        type=check_path)
    optional.add_argument(
        '--year',
        help='year of the nadlan deals',
        type=int)
    optional.add_argument(
        '--pgrid',
        help='param grids for gridsearchcv object',
        type=str, choices=['light', 'normal', 'dense'])
    optional.add_argument(
        '--n_jobs',
        help='number of CPU threads to do gridsearch and cross-validate',
        type=int)
    optional.add_argument(
        '--rseed',
        help='random seed interger to start psuedo-random number generator',
        type=int)
    optional.add_argument(
        '--verbose',
        help='verbosity 0, 1, 2',
        type=int)
    # ``required=True`` lets argparse itself reject a missing --model with a
    # usage error and a non-zero exit status (the former manual check exited
    # with status 0).
    required.add_argument(
        '--model',
        help='select ML model.',
        choices=[
            'SVM',
            'MLP',
            'RF'],
        required=True)
    optional.add_argument('--regressors', help='select features for ML', nargs='+')
    parser._action_groups.append(optional)
    args = parser.parse_args()
    logger.info('Running ML, CV with {} model'.format(args.model))
    main_nadlan_ML(args)