-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodel_training.py
More file actions
84 lines (68 loc) · 2.01 KB
/
model_training.py
File metadata and controls
84 lines (68 loc) · 2.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import joblib
# Master feature table used for training; one row per labelled sample.
csv_path = "feature_sets/horn_training_features_master.csv"
df = pd.read_csv(csv_path)

# Input columns for the classifier. Names must match the CSV header exactly,
# and the order here is the column order the model is trained on.
# NOTE(review): the upper-case names look like recurrence-quantification
# measures -- confirm against the feature extractor.
FEATURES = [
    "peak_match", "peak_energy", "total_band_energy", "concentration",
    "RR", "DET", "L", "Lmax", "DIV", "ENTR",
    "LAM", "TT", "Vmax", "VENTR",
    "MRT", "RTE", "NMPRT", "TREND",
]

# Design matrix and target vector.
X, y = df[FEATURES], df["label"]
# Hold out 20% of the rows for evaluation with a fixed seed for repeatability.
# Stratifying on the label keeps the class ratio the same in both splits,
# because there are far more successes than fails.
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

# Standardise the features, then fit a logistic regression whose class
# weights compensate for the smaller class.
scaler = StandardScaler()
classifier = LogisticRegression(class_weight="balanced", max_iter=1000)
model = make_pipeline(scaler, classifier)
model.fit(X_train, y_train)
# Probability cut-off above which a test row is predicted as class 1.
# It sits below the conventional 0.5, so positives are flagged more readily.
DECISION_THRESHOLD = 0.3

# Column 1 of predict_proba is the probability of model.classes_[1]
# (presumably label 1 -- confirm the labels are encoded as 0/1).
probs = model.predict_proba(X_test)[:, 1]
pred = (probs > DECISION_THRESHOLD).astype(int)

# Reconstruct the test rows from the full frame so the non-feature
# (metadata) columns are available alongside the predictions.
df_test = df.loc[X_test.index].copy()
df_test["true_label"] = y_test  # Series: aligned on the shared index
df_test["pred"] = pred  # array: positional, rows follow X_test order
# Reuse the probabilities computed above instead of scoring the test set
# a second time; the values are identical.
df_test["prob"] = probs
# Headline metrics on the held-out split.
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

# Pick out the misclassified rows: label 1 predicted as 0, and label 0
# predicted as 1.
missed_mask = (df_test["true_label"] == 1) & (df_test["pred"] == 0)
false_alarm_mask = (df_test["true_label"] == 0) & (df_test["pred"] == 1)
fn = df_test.loc[missed_mask]
fp = df_test.loc[false_alarm_mask]

# Show the identifying columns followed by the model inputs for each error set.
report_columns = ["video_file", "window_start", "window_end", *FEATURES]
for heading, errors in (("\nFalse Negatives:", fn), ("\nFalse Positives:", fp)):
    print(heading)
    print(errors[report_columns])

# Write both error sets to disk (all columns, no index) for later inspection.
fn.to_csv("feature_sets/false_negatives.csv", index=False)
fp.to_csv("feature_sets/false_positives.csv", index=False)
# Persist the fitted pipeline (scaler + classifier) so it can be reloaded
# later without retraining.
model_path = "feature_sets/horn_logistic_model_v2.joblib"
joblib.dump(model, model_path)