This repository has been archived by the owner on Feb 11, 2022. It is now read-only.

Add batch algorithms of machine learning for custom pipeline training #35

Open · wants to merge 1 commit into base: master
67 changes: 67 additions & 0 deletions server/ml/Readme.md
@@ -0,0 +1,67 @@
# Intro
A `Mini`-style ml/dl project; it assumes some programming ability on the user's part. The directory layout is:

```
├── clf
│   └── nn
├── data
├── pipe
├── saved
├── train.py
└── predict.py
```

* The `data` directory normally holds the datasets.
* The `clf` folder is for custom machine-learning algorithms, e.g. a grid-searched SVC; its `nn` subfolder holds deep-learning algorithms such as neural networks.
* The `pipe` folder holds predefined processing for the datasets, meaning you can load and process your data from anywhere; for example, `pipe/iload_aliatec.py` handles the data for the ATEC payment-risk competition.
* `saved` stores trained models and prediction output.

Basic training algorithms already live in `train.py`. For custom training, create the algorithm under the `clf` folder and import it in `train.py` (a sketch follows the snippets below); otherwise you only need to swap out the data `load` function each run, together with the matching `load` and `save` functions in `predict.py`.

`train.py`
```python
if __name__ == '__main__':

    print('Loading Data....', end='', flush=True)
    from pipe import iload_iris_pipe
    x_train, y_train, x_test, y_test = iload_iris_pipe()
    print('\tDone')
    train(x_train, y_train, x_test, y_test)
```

`predict.py`
```python
from pipe import iload_iris_pipe, isave_iris_data
x_train, y_train, x_test, y_test = iload_iris_pipe()

for model in models:

    clf = joblib.load(model)
    modelname = clf.__class__.__name__
    if hasattr(clf, "predict") and hasattr(clf, 'predict_proba'):
        predicts = clf.predict(x_test)
        predicts_proba = clf.predict_proba(x_test)

        isave_iris_data(predicts, predicts_proba, 'saved/{}.predict'.format(modelname))
```
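
As noted above, a custom trainer added under `clf` only needs the same duck-typed interface that `train.py` and `predict.py` rely on (`fit`, `score`, `predict`, `predict_proba`). A minimal sketch, assuming a hypothetical `clf/irf.py` with a grid-searched random forest (the file name and parameter grid are illustrative, not part of this PR):

```python
# clf/irf.py -- hypothetical example, not part of this PR
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


class IGridRF():
    param_grid = {'n_estimators': [50, 100, 200]}

    def __init__(self):
        self.model = None

    def fit(self, x_train, y_train):
        # Same duck-typed surface as the shipped IGridSVC.
        self.model = GridSearchCV(RandomForestClassifier(), cv=3, n_jobs=-1,
                                  param_grid=IGridRF.param_grid)
        self.model.fit(x_train, y_train)

    def score(self, x_test, y_test):
        return self.model.score(x_test, y_test)

    def predict(self, x_test):
        return self.model.predict(x_test)

    def predict_proba(self, x_test):
        return self.model.predict_proba(x_test)
```

Register the class in `clf/__init__.py` and import it in `train.py` the same way the shipped classifiers are.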

# Note

* Loading data through a `Pipeline` in `pipe` and then handing it to a custom algorithm that runs its own `Pipeline` can fail in unexpected ways (a quirk of scikit-learn itself). Do the `Pipeline` work in only one place: either when loading data under the `pipe` folder, or inside the custom algorithm, but not both. A sketch of the first option follows.
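
A minimal sketch of the first option, assuming the preprocessing lives entirely in the loader so downstream `clf` code receives plain arrays (the loader name and steps are illustrative):

```python
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


def iload_pipelined_iris():
    # All Pipeline work happens here, exactly once; the custom
    # algorithm then runs on already-transformed arrays.
    iris = load_iris()
    pipe = Pipeline([
        ('scale', StandardScaler()),
        ('reduce_dim', PCA(n_components=2)),
    ])
    # Fitting before the split mirrors pipe/iload_digits.py; fit on the
    # training split alone if leakage matters for your setup.
    data = pipe.fit_transform(iris.data)
    x_train, x_test, y_train, y_test = train_test_split(
        data, iris.target, test_size=0.2, random_state=42)
    return x_train, y_train, x_test, y_test
```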

# Todo

- [ ] Add a requirements.txt file
- [ ] Unit tests
- [x] Pseudo-ETL project directory layout
- [ ] Performance-evaluation module
- [x] Function for dynamically creating classes
- [x] Custom nn functions
- [x] Custom clf functions
- [x] Catch Ctrl+C to interrupt the current trainer
7 changes: 7 additions & 0 deletions server/ml/clf/__init__.py
@@ -0,0 +1,7 @@
from .isvc import IGridSVC
from .nn.icnn import ICNN

__all__ = [
'IGridSVC',
'ICNN'
]
44 changes: 44 additions & 0 deletions server/ml/clf/isvc.py
@@ -0,0 +1,44 @@
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2

from sklearn.decomposition import PCA, NMF
from sklearn.svm import SVC


class IGridSVC():
    # Candidate hyperparameters for the grid search.
    N_FEATURES_OPTIONS = [2, 4]
    C_OPTIONS = [1, 10, 100, 1000]
    param_grid = [
        {
            'reduce_dim': [PCA(iterated_power=7), NMF()],
            'reduce_dim__n_components': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        },
        {
            'reduce_dim': [SelectKBest(chi2)],
            'reduce_dim__k': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        },
    ]

    pipe = Pipeline([
        ('reduce_dim', PCA()),
        ('classify', SVC(kernel="linear", probability=True))
    ])

    def __init__(self):
        self.model = None

    def fit(self, x_train, y_train):
        # Grid-search over both reduction strategies and the SVC's C.
        self.model = GridSearchCV(IGridSVC.pipe, cv=3, n_jobs=-1,
                                  param_grid=IGridSVC.param_grid)
        self.model = self.model.fit(x_train, y_train)

    def score(self, x_test, y_test):
        return self.model.score(x_test, y_test)

    def predict(self, x_test):
        return self.model.predict(x_test)

    def predict_proba(self, x_test):
        return self.model.predict_proba(x_test)
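
A usage sketch for `IGridSVC`, assuming the iris loader from `pipe` and the `saved/` layout that `predict.py` globs (the dump path is illustrative, not part of the diff):

```python
from sklearn.externals import joblib

from clf import IGridSVC
from pipe import iload_iris_pipe

x_train, y_train, x_test, y_test = iload_iris_pipe()

clf = IGridSVC()
clf.fit(x_train, y_train)
print('accuracy:', clf.score(x_test, y_test))

# Persist where predict.py's glob('saved/*.pkl') will find it.
joblib.dump(clf, 'saved/IGridSVC.pkl')
```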
Empty file added server/ml/clf/nn/__init__.py
72 changes: 72 additions & 0 deletions server/ml/clf/nn/icnn.py
@@ -0,0 +1,72 @@
import os

from sklearn.model_selection import GridSearchCV

from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import to_categorical
# Keras Model/Layers

from keras.models import Model, Sequential
from keras.layers import Input, Dense, LSTM, Activation, Conv2D, \
    MaxPool2D, Dropout, Flatten, Embedding, Reshape, Concatenate, \
    TimeDistributed, AveragePooling1D
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam, Adadelta

from keras.losses import categorical_crossentropy


class ICNN():

    # filter_sizes = [3, 4, 5]
    # num_filters = 128
    # epochs = 200
    # batch_size = 64
    num_classes = 4

    param_grid = {
        'clf__optimizer': ['rmsprop', 'adam', 'adagrad'],
        'clf__epochs': [200, 300, 400, 700, 1000],
        'clf__batch_size': [32, 64, 128],
        'clf__dropout': [0.1, 0.2, 0.3, 0.4, 0.5],
        'clf__kernel_initializer': ['he_normal', 'glorot_uniform', 'normal', 'uniform']
    }

    # pipeline = Pipeline([
    #     # ('preprocess_step1', None),
    #     # ('preprocess_step2', None),
    #     # ('preprocess_step3', None)
    #     # ('clf', keras_clf)
    # ])

    def __init__(self, kernel_initializer='he_normal', optimizer='adam',
                 activation='relu', dropout=0.5,
                 loss='categorical_crossentropy'):  # default matches the multi-class softmax head
        self.kernel_initializer = kernel_initializer
        self.optimizer = optimizer
        self.activation = activation
        self.dropout = dropout
        self.loss = loss
        self.model = None

    def create_model(self):
        # Placeholder network: a small dense net whose input shape is
        # inferred on the first fit() call; swap in a real CNN as needed.
        model = Sequential([
            Dense(64, activation=self.activation,
                  kernel_initializer=self.kernel_initializer),
            Dropout(self.dropout),
            Dense(ICNN.num_classes, activation='softmax'),
        ])
        model.compile(optimizer=self.optimizer, loss=self.loss,
                      metrics=['accuracy'])
        return model

    def search_model(self):
        # TODO: hyperparameter search over param_grid.
        pass

    def fit(self, x_train, y_train):
        # self.model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, callbacks=[checkpoint], validation_data=(x_test, y_test))
        # self.model.save("{}/model.h5".format(self.name))
        # checkpoint = ModelCheckpoint('{}/weights.{epoch:03d}-{val_acc:.4f}.hdf5'.format(self.name), monitor='val_acc', verbose=1, save_best_only=True, mode='auto')

        self.model = KerasClassifier(build_fn=self.create_model)

        # self.model = GridSearchCV(KerasClassifier(build_fn=self.create_model),
        #                           cv=3, param_grid=ICNN.param_grid)

        self.model.fit(x_train, y_train)

    def score(self, x_test, y_test):
        return self.model.score(x_test, y_test)
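
A usage sketch for `ICNN`, assuming integer labels from the iris loader; `num_classes` is overridden to match the three iris classes (illustrative only, and behavior may vary across Keras versions):

```python
from clf import ICNN
from pipe import iload_iris_pipe

x_train, y_train, x_test, y_test = iload_iris_pipe()

ICNN.num_classes = 3  # iris has three classes; the class default is 4
clf = ICNN(dropout=0.3, optimizer='adam')
clf.fit(x_train, y_train)  # KerasClassifier one-hot encodes the integer labels
print('accuracy:', clf.score(x_test, y_test))
```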
8 changes: 8 additions & 0 deletions server/ml/pipe/__init__.py
@@ -0,0 +1,8 @@
from .iload_iris import iload_iris_pipe, isave_iris_data
from .iload_digits import iload_digits_pipe, isave_digits_data
from .iload_aliatec import iload_aliatec_pipe, iload_predict_data, isave_predict_data

__all__ = [
    'iload_iris_pipe', 'isave_iris_data',
    'iload_digits_pipe', 'isave_digits_data',
    'iload_aliatec_pipe', 'iload_predict_data', 'isave_predict_data',
]
53 changes: 53 additions & 0 deletions server/ml/pipe/iload_aliatec.py
@@ -0,0 +1,53 @@
import os
import sys

import pandas as pd

from sklearn.model_selection import train_test_split

train_data_path = 'data/atec_anti_fraud_train.csv'
predict_data_path = 'data/atec_anti_fraud_test_a.csv'

DROP_COLUMNS = ["id", "label", "date"]
# label: 0 is safe, 1 is not safe, -1 is unlabeled


def iload_aliatec_pipe():

    if os.path.isfile(train_data_path) and os.path.isfile(predict_data_path):
        print("[√] Path checked, files exist")
    else:
        print("[X] Please make sure the dataset files exist")
        sys.exit(1)

    data = pd.read_csv(train_data_path)
    data = data.fillna(0)
    # Rows labeled -1 are unlabeled and excluded from supervised training.
    unlabeled = data[data['label'] == -1]
    labeled = data[data['label'] != -1]

    train, test = train_test_split(labeled, test_size=0.2, random_state=42)

    cols = [c for c in DROP_COLUMNS if c in train.columns]
    x_train = train.drop(cols, axis=1)

    cols = [c for c in DROP_COLUMNS if c in test.columns]
    x_test = test.drop(cols, axis=1)

    y_train = train['label']
    y_test = test['label']
    return x_train, y_train, x_test, y_test


def iload_predict_data():
    upload_test = pd.read_csv(predict_data_path)
    upload_test = upload_test.fillna(0)
    upload_id = upload_test['id']

    cols = [c for c in DROP_COLUMNS if c in upload_test.columns]
    upload_test = upload_test.drop(cols, axis=1)

    return upload_id, upload_test


def isave_predict_data(data_id, predict, filename):
    p = pd.DataFrame(predict, columns=["score"])
    res = pd.concat([data_id, p], axis=1)
    res.to_csv(filename, index=False)
    print("[+] Saved predict result to {} successfully".format(filename))
25 changes: 25 additions & 0 deletions server/ml/pipe/iload_digits.py
@@ -0,0 +1,25 @@
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import Binarizer, StandardScaler
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline, make_pipeline


def iload_digits_pipe():
    digits = load_digits()
    data = digits.data
    labels = digits.target

    pipe = Pipeline([
        ('scale', StandardScaler()),
        ('reduce_dim', PCA())
    ])

    data = pipe.fit_transform(data)
    x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

    return x_train, y_train, x_test, y_test


def isave_digits_data(predict, filename, predict_proba=None):
    pass
25 changes: 25 additions & 0 deletions server/ml/pipe/iload_iris.py
@@ -0,0 +1,25 @@
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

import pandas as pd


def iload_iris_pipe():
    iris = load_iris()
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

    return x_train, y_train, x_test, y_test


def isave_iris_data(predict, predict_proba, filename):
    p1 = pd.DataFrame(predict, columns=["type"])
    # Keep only the probability of the predicted class for each sample.
    proba = [v[k] for k, v in zip(predict, predict_proba)]

    p2 = pd.DataFrame(proba, columns=["proba"])
    res = pd.concat([p1, p2], axis=1)

    res.to_csv(filename, index=False)
    print("[+] Saved predict result to {} successfully".format(filename))
45 changes: 45 additions & 0 deletions server/ml/predict.py
@@ -0,0 +1,45 @@
import glob

from sklearn.externals import joblib

models = glob.glob('saved/*.pkl')

TESTFLAG = True

if TESTFLAG:
    from pipe import iload_iris_pipe, isave_iris_data
    x_train, y_train, x_test, y_test = iload_iris_pipe()

    for model in models:

        clf = joblib.load(model)
        modelname = clf.__class__.__name__
        if hasattr(clf, "predict") and hasattr(clf, 'predict_proba'):
            predicts = clf.predict(x_test)
            predicts_proba = clf.predict_proba(x_test)

            isave_iris_data(predicts, predicts_proba, 'saved/{}.predict'.format(modelname))


def main():
    from pipe import iload_predict_data, isave_predict_data
    data_id, data_features = iload_predict_data()

    for model in models:

        clf = joblib.load(model)
        modelname = clf.__class__.__name__

        if hasattr(clf, "predict"):
            predicts = clf.predict(data_features)
            save_predict = "saved/{}_predict.csv".format(modelname)
            isave_predict_data(data_id, predicts, save_predict)

        if hasattr(clf, 'predict_proba'):
            # Keep the probability of the risky class (label 1).
            probas = [1 - p[0] for p in clf.predict_proba(data_features)]
            save_predict_proba = "saved/{}_predict_proba.csv".format(modelname)
            isave_predict_data(data_id, probas, save_predict_proba)


# if __name__ == '__main__':
#     main()
31 changes: 31 additions & 0 deletions server/ml/saved/AdaBoostClassifier.predict
@@ -0,0 +1,31 @@
type,proba
1,0.3516016029939402
0,0.3726623945162534
2,0.3629553100355079
1,0.3525078600618822
1,0.35120905208303793
0,0.3726623945162534
1,0.35391537907224435
2,0.36824496826420017
2,0.3539965952556733
1,0.36876142420600455
1,0.3491303347329515
0,0.37178495461951044
0,0.3726623945162534
0,0.3725266190710486
0,0.3726623945162534
1,0.35521419037617075
2,0.37429130177704345
1,0.3675845279771045
1,0.3516016029939402
2,0.37663649708734154
0,0.37281822350933047
2,0.36887142448742044
0,0.3726623945162534
2,0.37663649708734154
2,0.3706663558504806
2,0.37429130177704345
2,0.3633829969575935
2,0.3706663558504806
0,0.37178495461951044
0,0.3725266190710486