# onedrive_batch_processor/vid_processing_modules/model_training.py (branch: main, repository: zeper-eng/onedrive_batch_processor)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import joblib

# Training script: fit a scaled logistic-regression classifier on the feature
# CSV, print hold-out metrics, write the misclassified rows to CSV for
# inspection, and persist the fitted pipeline with joblib.

csv_path = "feature_sets/horn_training_features_master.csv"

# Probability cut-off applied to the second predict_proba column. It is lower
# than the usual 0.5, so more rows are flagged as class 1.
# NOTE(review): presumably chosen to favour recall on class 1 -- confirm.
DECISION_THRESHOLD = 0.3

df = pd.read_csv(csv_path)

# Columns of the CSV used as model inputs.
FEATURES = [
    "peak_match",
    "peak_energy",
    "total_band_energy",
    "concentration",
    "RR",
    "DET",
    "L",
    "Lmax",
    "DIV",
    "ENTR",
    "LAM",
    "TT",
    "Vmax",
    "VENTR",
    "MRT",
    "RTE",
    "NMPRT",
    "TREND",
]

# Columns shown when listing misclassified rows: window metadata, then features.
REPORT_COLUMNS = ["video_file", "window_start", "window_end"] + FEATURES

X = df[FEATURES]
y = df["label"]

# 80/20 split with a fixed seed. Stratifying on the label keeps the class
# ratio the same in both parts (there are far more successes than fails).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardise the features, then fit a logistic regression whose class weights
# are balanced so the smaller class is not drowned out.
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(class_weight="balanced", max_iter=1000),
)

model.fit(X_train, y_train)

# Score the hold-out rows once and reuse the result everywhere below.
# Column 1 of predict_proba belongs to model.classes_[1].
# NOTE(review): assumes labels are encoded 0/1 so that this is label 1 -- confirm.
probs = model.predict_proba(X_test)[:, 1]
pred = (probs > DECISION_THRESHOLD).astype(int)

# Rebuild the test rows with all original CSV columns (metadata included).
# X_test keeps the index of df, so .loc selects exactly the held-out rows and
# the positional arrays assigned below line up with them.
df_test = df.loc[X_test.index].copy()

df_test["true_label"] = y_test
df_test["pred"] = pred
df_test["prob"] = probs

# Print metrics
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

# Extract errors
fn = df_test[(df_test["true_label"] == 1) & (df_test["pred"] == 0)]
print("\nFalse Negatives:")
print(fn[REPORT_COLUMNS])

fp = df_test[(df_test["true_label"] == 0) & (df_test["pred"] == 1)]
print("\nFalse Positives:")
print(fp[REPORT_COLUMNS])

# Save for inspection
fn.to_csv("feature_sets/false_negatives.csv", index=False)
fp.to_csv("feature_sets/false_positives.csv", index=False)

# Only the fitted pipeline is saved; DECISION_THRESHOLD is not stored with it,
# so code that loads the model must apply the cut-off to predict_proba itself.
joblib.dump(
    model,
    "feature_sets/horn_logistic_model_v2.joblib"
)