forked from khuyentran1401/Data-science
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
4777378
commit e3cd189
Showing
14 changed files
with
199 additions
and
209 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,7 @@ scraping/ghibli.ipynb | |
|
||
# VSCode workspace | ||
*-workspace | ||
.vscode | ||
|
||
# C extensions | ||
*.so | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,13 @@ | ||
# See https://pre-commit.com for more information | ||
# See https://pre-commit.com/hooks.html for more hooks | ||
repos: | ||
- repo: https://github.com/ambv/black | ||
rev: stable | ||
hooks: | ||
- id: black | ||
language_version: python3.7 | ||
- repo: https://github.com/pre-commit/pre-commit-hooks | ||
rev: v2.0.0 | ||
hooks: | ||
- id: flake8 | ||
- repo: https://github.com/timothycrosley/isort | ||
rev: 4.3.21 | ||
hooks: | ||
- id: isort | ||
repos: | ||
- repo: https://github.com/ambv/black | ||
rev: 20.8b1 | ||
hooks: | ||
- id: black | ||
- repo: https://gitlab.com/pycqa/flake8 | ||
rev: 3.8.4 | ||
hooks: | ||
- id: flake8 | ||
- repo: https://github.com/timothycrosley/isort | ||
rev: 5.7.0 | ||
hooks: | ||
- id: isort |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
mlf | ||
output | ||
servicefoundry |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,114 +1,146 @@ | ||
from prefect import task, Flow, Parameter | ||
from prefect.engine.results import LocalResult | ||
|
||
import mlfoundry as mlf | ||
import numpy as np | ||
import pandas as pd | ||
|
||
import mlfoundry as mlf | ||
|
||
@task
def setup_mlf():
    """Task that opens an mlfoundry client and starts a run for "Iris-project"."""
    client = mlf.set_tracking_uri()
    run = client.create_run(project_name="Iris-project")
    return run
import shap | ||
from mlfoundry.mlfoundry_run import MlFoundryRun | ||
from sklearn.metrics import accuracy_score, f1_score | ||
from sklearn.neighbors import KNeighborsClassifier | ||
|
||
|
||
# ---------------------------------------------------------------------------- # | ||
# Create tasks # | ||
# ---------------------------------------------------------------------------- # | ||
@task(log_stdout=True) | ||
def train_model( | ||
train_x: pd.DataFrame, train_y: pd.DataFrame, num_train_iter: int, learning_rate: float) -> np.ndarray: | ||
"""Task for training a simple multi-class logistic regression model. The | ||
number of training iterations as well as the learning rate are taken from | ||
conf/project/parameters.yml. All of the data as well as the parameters | ||
will be provided to this function at the time of execution. | ||
""" | ||
num_iter = num_train_iter | ||
lr = learning_rate | ||
train_x: pd.DataFrame, | ||
train_y: pd.Series, | ||
n_neighbors: int, | ||
mlf_run: MlFoundryRun, | ||
) -> np.ndarray: | ||
|
||
X = train_x.to_numpy() | ||
Y = train_y.to_numpy() | ||
|
||
# Add bias to the features | ||
bias = np.ones((X.shape[0], 1)) | ||
X = np.concatenate((bias, X), axis=1) | ||
|
||
weights = [] | ||
# Train one model for each class in Y | ||
for k in range(Y.shape[1]): | ||
# Initialise weights | ||
theta = np.zeros(X.shape[1]) | ||
y = Y[:, k] | ||
for _ in range(num_iter): | ||
z = np.dot(X, theta) | ||
h = _sigmoid(z) | ||
gradient = np.dot(X.T, (h - y)) / y.size | ||
theta -= lr * gradient | ||
# Save the weights for each model | ||
weights.append(theta) | ||
# Create a new model instance | ||
knn = KNeighborsClassifier(n_neighbors=n_neighbors) | ||
|
||
# Train the model | ||
knn.fit(X, Y) | ||
|
||
# Print finishing training message | ||
print("Finish training the model.") | ||
|
||
# Return a joint multi-class model with weights for all classes | ||
return np.vstack(weights).transpose() | ||
|
||
|
||
def _sigmoid(z):
    """A helper sigmoid function used by the training and the scoring tasks.

    Computes ``1 / (1 + exp(-z))`` element-wise for a scalar or array ``z``.

    The value is evaluated as ``exp(-log(1 + exp(-z)))`` through
    ``np.logaddexp`` so that large negative inputs do not overflow inside
    ``np.exp`` (the naive form emits a RuntimeWarning there).
    """
    # logaddexp(0, -z) == log(exp(0) + exp(-z)) == log(1 + exp(-z)), computed stably.
    return np.exp(-np.logaddexp(0, -z))
|
||
@task
def predict(model: np.ndarray, test_x: pd.DataFrame) -> np.ndarray:
    """Task that scores a test set with a pre-trained model.

    A column of ones is prepended to the features as the bias term, the
    result is multiplied by ``model`` and passed through ``_sigmoid``, and
    the column index of the largest score is returned for every row.
    """
    features = test_x.to_numpy()

    # Prepend the bias column of ones to the feature matrix.
    ones_column = np.ones((features.shape[0], 1))
    design = np.concatenate((ones_column, features), axis=1)

    # One "probability" per column of ``model`` for each sample.
    scores = _sigmoid(np.dot(design, model))

    # Index of the highest-scoring column for every sample.
    return np.argmax(scores, axis=1)
|
||
|
||
@task(log_stdout=True)
def report_accuracy(predictions: np.ndarray, test_y: pd.DataFrame) -> None:
    """Task that prints the accuracy of ``predictions`` against ``test_y``.

    The true class of each row is taken as the arg-max across the columns of
    ``test_y``. Nothing is returned; the result is only printed.
    """
    # True class index per row.
    labels = test_y.to_numpy()
    target = np.argmax(labels, axis=1)
    # Fraction of rows where the predicted index matches the true index.
    n_correct = np.sum(predictions == target)
    accuracy = n_correct / target.shape[0]
    # Report as a percentage rounded to two decimals.
    print(f"Model accuracy on test set: {round(accuracy * 100, 2)}")
# Log model | ||
mlf_run.log_model(knn, mlf.ModelFramework.SKLEARN) | ||
|
||
return knn | ||
|
||
|
||
def predict(model, X: pd.DataFrame) -> dict:
    """Make predictions given a pre-trained model and a data set.

    Args:
        model: A fitted estimator exposing ``predict``; it is called with the
            NumPy representation of ``X``.
        X: Feature rows to score.

    Returns:
        A one-entry dict ``{"predictions": <output of model.predict>}``, so
        the result can be passed straight to ``pd.DataFrame`` to obtain a
        "predictions" column.
    """
    # Keep the caller's DataFrame name untouched instead of rebinding ``X``.
    features = X.to_numpy()

    return {"predictions": model.predict(features)}
|
||
|
||
def get_shap_values(model, X_train: pd.DataFrame, X_test: pd.DataFrame):
    """Return SHAP values for ``X_test`` from a kernel explainer.

    The explainer is built from ``model.predict_proba`` with ``X_train`` as
    its data argument and is then evaluated on ``X_test``.
    """
    kernel_explainer = shap.KernelExplainer(model.predict_proba, X_train)
    values = kernel_explainer.shap_values(X_test)
    return values
|
||
|
||
def log_data_stats(
    train_x: pd.DataFrame,
    test_x: pd.DataFrame,
    train_y: pd.Series,
    test_y: pd.Series,
    model,
    mlf_run: MlFoundryRun,
):
    """Log datasets, dataset statistics and test metrics to an mlfoundry run.

    Args:
        train_x: Training features.
        test_x: Test features.
        train_y: Training labels (logged under the "species" column).
        test_y: Test labels.
        model: Fitted estimator passed to ``predict``.
        mlf_run: Run that receives the datasets, statistics and metrics.
    """
    prediction_train = pd.DataFrame(predict(model, train_x))
    prediction_test = pd.DataFrame(predict(model, test_x))

    # Reset the index so the rows line up with the freshly built prediction
    # frames (which carry a default RangeIndex) when concatenated below.
    train_data = pd.concat([train_x, train_y], axis=1).reset_index(drop=True)
    test_data = pd.concat([test_x, test_y], axis=1).reset_index(drop=True)

    # Log data
    mlf_run.log_dataset(train_data, data_slice=mlf.DataSlice.TRAIN)
    mlf_run.log_dataset(test_data, data_slice=mlf.DataSlice.TEST)

    # Only the model inputs are features; the label and prediction columns are
    # declared separately and must not be listed as features too.
    data_schema = mlf.Schema(
        feature_column_names=list(train_x.columns),
        actual_column_name="species",
        prediction_column_name="predictions",
    )

    # NOTE: SHAP values are deliberately not computed here. Passing
    # ``shap_values=`` to ``log_dataset_stats`` fails with
    # "[Errno 2] No such file or directory: './resources/failure.png'",
    # so computing them would be expensive work whose result is never used.
    slices = (
        (mlf.DataSlice.TRAIN, train_data, prediction_train),
        (mlf.DataSlice.TEST, test_data, prediction_test),
    )
    for data_slice, data, predictions in slices:
        # Log dataset stats on the data joined with its predictions.
        mlf_run.log_dataset_stats(
            pd.concat([data, predictions], axis=1),
            data_slice=data_slice,
            data_schema=data_schema,
            model_type=mlf.ModelType.MULTICLASS_CLASSIFICATION,
        )

    log_metrics(prediction_test["predictions"], test_y, mlf_run)
|
||
|
||
def log_metrics(
    predictions: np.ndarray, test_y: pd.DataFrame, mlf_run: MlFoundryRun
) -> None:
    """Score ``predictions`` against ``test_y`` and log the metrics to ``mlf_run``.

    Two metrics are logged: "accuracy" and the weighted-average "f1_score".
    """
    actual = test_y.to_numpy()

    # Build the metrics mapping in one go and hand it to the run.
    mlf_run.log_metrics(
        {
            "accuracy": accuracy_score(actual, predictions),
            "f1_score": f1_score(actual, predictions, average="weighted"),
        }
    )
|
||
|
||
# ---------------------------------------------------------------------------- # | ||
# Create a flow # | ||
# ---------------------------------------------------------------------------- # | ||
|
||
with Flow("data-science") as flow: | ||
|
||
|
||
train_test_dict = LocalResult(dir='data/processed/Mon_Dec_20_2021_20:55:20').read(location='split_data_output').value | ||
def data_science_flow(train_test_dict: dict, mlf_run: MlFoundryRun): | ||
|
||
# Load data | ||
train_x = train_test_dict['train_x'] | ||
train_y = train_test_dict['train_y'] | ||
test_x = train_test_dict['test_x'] | ||
test_y = train_test_dict['test_y'] | ||
train_x = train_test_dict["train_x"] | ||
train_y = train_test_dict["train_y"] | ||
test_x = train_test_dict["test_x"] | ||
test_y = train_test_dict["test_y"] | ||
|
||
# Define parameters | ||
num_train_iter = Parameter('num_train_iter', default=10000) | ||
learning_rate = Parameter('learning_rate', default = 0.01) | ||
|
||
# Define tasks | ||
model = train_model(train_x, train_y, num_train_iter, learning_rate) | ||
predictions = predict(model, test_x) | ||
report_accuracy(predictions, test_y) | ||
params = {"n_neighbors": 12} | ||
|
||
# Log parameters | ||
mlf_run.log_params(params) | ||
|
||
# Define tasks | ||
model = train_model(train_x, train_y, params["n_neighbors"], mlf_run) | ||
|
||
flow.run() | ||
log_data_stats(train_x, test_x, train_y, test_y, model, mlf_run) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,11 @@ | ||
from prefect import Flow | ||
from prefect.tasks.prefect import StartFlowRun | ||
import mlfoundry as mlf | ||
from data_engineering import data_engineer_flow | ||
from data_science import data_science_flow | ||
|
||
data_engineering_flow = StartFlowRun( | ||
flow_name="data-engineer", project_name='Iris Project', wait=True, parameters={'test_data_ratio': 0.3}) | ||
data_science_flow = StartFlowRun( | ||
flow_name="data-science", project_name='Iris Project', wait=True) | ||
# Initialize a new MLFoundryRun | ||
mlf_api = mlf.get_client() | ||
mlf_run = mlf_api.create_run(project_name="Iris-project") | ||
|
||
with Flow("main-flow") as flow: | ||
result = data_science_flow(upstream_tasks=[data_engineering_flow]) | ||
|
||
flow.run() | ||
# Run flows | ||
train_test_dict = data_engineer_flow(mlf_run) | ||
data_science_flow(train_test_dict, mlf_run) |
Oops, something went wrong.