Skip to content

Commit 96841e6

Browse files
author
IdrissaIyamuremye
committed
Added XGBoost for classification and for regression
1 parent 4ef5c41 commit 96841e6

File tree

12 files changed

+403
-0
lines changed

12 files changed

+403
-0
lines changed
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
import numpy as np
2+
import copy
3+
4+
5+
# Classification Tree (single tree)
6+
7+
class ClassificationTree:
8+
def __init__(self, max_depth=3, reg_lambda=1.0, prune_gamma=0.0):
9+
self.max_depth = max_depth # Maximum depth
10+
self.reg_lambda = reg_lambda # Regularization constant
11+
self.prune_gamma = prune_gamma # Pruning threshold
12+
self.estimator1 = None # Tree structure before assigning leaf values
13+
self.estimator2 = None # Tree structure with leaf values
14+
self.feature = None # Feature matrix
15+
self.residual = None # Residuals (negative gradient)
16+
self.base_score = None # Initial log-odds
17+
18+
# Find best split for current node
19+
def node_split(self, did):
20+
r = self.reg_lambda
21+
max_gain = -np.inf
22+
d = self.feature.shape[1]
23+
G = -self.residual[did].sum()
24+
H = did.shape[0]
25+
p_score = (G**2)/(H + r)
26+
best_split = None
27+
28+
for k in range(d):
29+
X_feat = self.feature[did, k]
30+
x_uniq = np.unique(X_feat)
31+
s_point = [(x_uniq[i-1]+x_uniq[i])/2 for i in range(1,len(x_uniq))]
32+
l_bound = -np.inf
33+
for j in s_point:
34+
left = did[(X_feat>l_bound)&(X_feat<=j)]
35+
right = did[X_feat>j]
36+
if len(left)==0 or len(right)==0:
37+
continue
38+
GL = -self.residual[left].sum()
39+
HL = left.shape[0]
40+
GR = G - GL
41+
HR = H - HL
42+
gain = (GL**2)/(HL+r) + (GR**2)/(HR+r) - p_score
43+
if gain > max_gain:
44+
max_gain = gain
45+
best_split = {"fid": k, "split_point": j, "left": left, "right": right}
46+
l_bound = j
47+
if max_gain >= self.prune_gamma:
48+
return best_split
49+
return np.nan
50+
51+
# Recursively split nodes
52+
def recursive_split(self, node, curr_depth):
53+
if curr_depth >= self.max_depth or not isinstance(node, dict):
54+
return
55+
self.recursive_split(node.get("left"), curr_depth+1)
56+
self.recursive_split(node.get("right"), curr_depth+1)
57+
58+
# Leaf value for log-loss
59+
def output_value(self, did):
60+
return np.sum(self.residual[did]) / (did.shape[0] + self.reg_lambda)
61+
62+
# Assign leaf values to all leaves
63+
def output_leaf(self, d):
64+
if isinstance(d, dict):
65+
for key in ["left","right"]:
66+
val = d[key]
67+
if isinstance(val, dict):
68+
self.output_leaf(val)
69+
else:
70+
d[key] = self.output_value(val)
71+
72+
# Fit tree to residuals
73+
def fit(self, X, residuals):
74+
self.feature = X
75+
self.residual = residuals
76+
root = self.node_split(np.arange(X.shape[0]))
77+
if isinstance(root, dict):
78+
self.recursive_split(root, curr_depth=1)
79+
self.estimator2 = copy.deepcopy(root)
80+
self.output_leaf(self.estimator2)
81+
return self.estimator2
82+
83+
# Predict single sample
84+
def x_predict(self, p, x):
85+
if x[p["fid"]] <= p["split_point"]:
86+
if isinstance(p["left"], dict):
87+
return self.x_predict(p["left"], x)
88+
else:
89+
return p["left"]
90+
else:
91+
if isinstance(p["right"], dict):
92+
return self.x_predict(p["right"], x)
93+
else:
94+
return p["right"]
95+
96+
# Predict multiple samples
97+
def predict(self, X):
98+
if self.estimator2 is None:
99+
return np.zeros(X.shape[0])
100+
return np.array([self.x_predict(self.estimator2, x) for x in X])
101+
102+
# Built XGBoost Classifier
103+
class MyXGBoostClassifier:
104+
def __init__(self, n_estimator=10, max_depth=3, reg_lambda=1.0, prune_gamma=0.0, learning_rate=0.1):
105+
self.n_estimator = n_estimator # Number of trees
106+
self.max_depth = max_depth # Maximum depth of each tree
107+
self.reg_lambda = reg_lambda # Regularization constant
108+
self.prune_gamma = prune_gamma # Pruning threshold
109+
self.learning_rate = learning_rate # Learning rate
110+
self.trees = [] # List to store trees
111+
self.base_score = None # Initial log-odds
112+
113+
# Sigmoid to convert log-odds to probability
114+
def sigmoid(self, x):
115+
return 1/(1+np.exp(-x))
116+
117+
# Fit ensemble
118+
def fit(self, X, y):
119+
n_samples = X.shape[0]
120+
# Initial log-odds
121+
p = np.clip(np.mean(y),1e-6,1-1e-6)
122+
self.base_score = np.log(p/(1-p))
123+
y_pred = np.full(n_samples, self.base_score)
124+
125+
for m in range(self.n_estimator):
126+
# Compute residuals: negative gradient of log-loss
127+
p_pred = self.sigmoid(y_pred)
128+
residuals = y - p_pred
129+
tree = ClassificationTree(max_depth=self.max_depth,
130+
reg_lambda=self.reg_lambda,
131+
prune_gamma=self.prune_gamma)
132+
tree.fit(X, residuals)
133+
update = tree.predict(X)
134+
y_pred += self.learning_rate * update
135+
self.trees.append(tree)
136+
137+
# Predict probability
138+
def predict_proba(self, X):
139+
y_pred = np.full(X.shape[0], self.base_score)
140+
for tree in self.trees:
141+
y_pred += self.learning_rate * tree.predict(X)
142+
return self.sigmoid(y_pred)
143+
144+
# Predict class label
145+
def predict(self, X, threshold=0.5):
146+
return (self.predict_proba(X) >= threshold).astype(int)
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
import numpy as np
2+
import copy
3+
4+
# Implement Greedy Algorithm for split finding in a regression tree
5+
class RegressionTree:
6+
def __init__(self, n_estimator=1, max_depth=3, reg_lambda=1.0, prune_gamma=0.0):
7+
self.n_estimator = n_estimator # Number of estimators
8+
self.max_depth = max_depth # Maximum depth of the tree
9+
self.reg_lambda = reg_lambda # Regularization constant
10+
self.prune_gamma = prune_gamma # Pruning threshold
11+
self.estimator1 = None # Tree structure before assigning leaf values
12+
self.estimator2 = None # Tree structure with leaf values
13+
self.feature = None # Feature matrix (X)
14+
self.residual = None # Residuals (y - prediction)
15+
self.base_score = None # Initial prediction (mean of residuals)
16+
17+
# Regularized learning objective:
18+
# Split a node into left and right to maximize gain
19+
def node_split(self, did):
20+
r = self.reg_lambda
21+
max_gain = -np.inf
22+
d = self.feature.shape[1] # Number of features
23+
24+
# Calculate gradient before split
25+
G = -self.residual[did].sum() # Sum of residuals
26+
H = did.shape[0] # Number of samples in node
27+
p_score = (G**2) / (H + r) # Score before the split
28+
29+
best_split = None
30+
31+
# Iterate over all features to find best split
32+
for k in range(d):
33+
X_feat = self.feature[did, k]
34+
x_uniq = np.unique(X_feat)
35+
# Candidate split points: midpoints between unique sorted feature values
36+
s_point = [(x_uniq[i-1] + x_uniq[i])/2 for i in range(1, len(x_uniq))]
37+
38+
l_bound = -np.inf
39+
for j in s_point:
40+
# Split samples into left and right nodes
41+
left = did[(X_feat > l_bound) & (X_feat <= j)]
42+
right = did[X_feat > j]
43+
44+
if len(left) == 0 or len(right) == 0:
45+
continue
46+
47+
# Calculate gradients and hessians for left and right
48+
GL = -self.residual[left].sum()
49+
HL = left.shape[0]
50+
GR = G - GL
51+
HR = H - HL
52+
53+
# Calculate gain for this split
54+
gain = (GL**2)/(HL + r) + (GR**2)/(HR + r) - p_score
55+
56+
if gain > max_gain:
57+
max_gain = gain
58+
best_split = {"fid": k, "split_point": j, "left": left, "right": right}
59+
60+
l_bound = j
61+
62+
# Only split if gain exceeds pruning threshold
63+
if max_gain >= self.prune_gamma:
64+
return best_split
65+
return np.nan # No valid split found
66+
67+
# Recursively split tree nodes until maximum depth is reached
68+
def recursive_split(self, node, curr_depth):
69+
if curr_depth >= self.max_depth or not isinstance(node, dict):
70+
return
71+
72+
self.recursive_split(node.get("left"), curr_depth + 1)
73+
self.recursive_split(node.get("right"), curr_depth + 1)
74+
75+
# Calculate output value for a leaf node (regularized)
76+
def output_value(self, did):
77+
return np.sum(self.residual[did]) / (did.shape[0] + self.reg_lambda)
78+
79+
# Assign output values to all leaf nodes in the tree
80+
def output_leaf(self, d):
81+
if isinstance(d, dict):
82+
for key in ["left", "right"]:
83+
val = d[key]
84+
if isinstance(val, dict):
85+
self.output_leaf(val)
86+
else:
87+
# Replace node indices with actual leaf value
88+
d[key] = self.output_value(val)
89+
90+
# Fit the regression tree to feature matrix X and residuals y
91+
def fit(self, x, y):
92+
self.feature = x
93+
self.residual = y
94+
self.base_score = y.mean() # Initial prediction (mean of residuals)
95+
96+
# Build the tree from root
97+
root = self.node_split(np.arange(x.shape[0]))
98+
if isinstance(root, dict):
99+
self.recursive_split(root, curr_depth=1)
100+
self.estimator1 = root
101+
self.estimator2 = copy.deepcopy(root)
102+
self.output_leaf(self.estimator2) # Assign leaf values
103+
104+
return self.estimator2
105+
106+
# Predict output for a single sample
107+
def x_predict(self, p, x):
108+
if x[p["fid"]] <= p["split_point"]:
109+
if isinstance(p["left"], dict):
110+
return self.x_predict(p["left"], x)
111+
else:
112+
return p["left"]
113+
else:
114+
if isinstance(p["right"], dict):
115+
return self.x_predict(p["right"], x)
116+
else:
117+
return p["right"]
118+
119+
# Predict outputs for multiple samples
120+
def predict(self, x_test):
121+
if self.estimator2 is None:
122+
# If tree is empty, return base score
123+
return np.array([self.base_score] * x_test.shape[0])
124+
# Traverse tree for each sample
125+
return np.array([self.x_predict(self.estimator2, x) for x in x_test])
126+
# XGBoost regressor built from scratch: an additive ensemble of
# RegressionTree base learners, each fit to the current residuals.
class MyXGBoostRegressor:
    def __init__(self, n_estimators=10, max_depth=3, reg_lambda=1.0, prune_gamma=0.0, learning_rate=0.1):
        """Configure the boosting ensemble.

        Args:
            n_estimators: Number of boosting rounds (trees).
            max_depth: Maximum depth of each tree.
            reg_lambda: L2 regularization constant passed to each tree.
            prune_gamma: Pruning threshold passed to each tree.
            learning_rate: Shrinkage applied to each tree's contribution.
        """
        self.n_estimators = n_estimators    # Number of trees to fit
        self.max_depth = max_depth          # Depth limit per tree
        self.reg_lambda = reg_lambda        # L2 regularization
        self.prune_gamma = prune_gamma      # Minimum split gain
        self.learning_rate = learning_rate  # Shrinkage factor
        self.trees = []                     # Fitted base learners
        self.base_score = None              # Initial constant prediction

    def fit(self, X, y):
        """Fit the ensemble with gradient boosting on squared error."""
        # Start every sample at the mean of the targets.
        self.base_score = y.mean()
        current = np.full(X.shape[0], self.base_score)

        for _ in range(self.n_estimators):
            # For squared error, the negative gradient is simply the residual.
            residuals = y - current
            stage = RegressionTree(max_depth=self.max_depth,
                                   reg_lambda=self.reg_lambda,
                                   prune_gamma=self.prune_gamma)
            stage.fit(X, residuals)
            # Shrink the stage's correction and fold it into the running score.
            current += self.learning_rate * stage.predict(X)
            self.trees.append(stage)

    def predict(self, X):
        """Return the ensemble prediction for each row of X."""
        total = np.full(X.shape[0], self.base_score)
        for stage in self.trees:
            total = total + self.learning_rate * stage.predict(X)
        return total

Classification/Model/XGBoost/__init__.py

Whitespace-only changes.
Binary file not shown.
191 Bytes
Binary file not shown.
183 Bytes
Binary file not shown.
177 Bytes
Binary file not shown.
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import unittest
2+
import numpy as np
3+
from Classification.Model.XGBoost.XGBoostClassification import MyXGBoostClassifier
4+
from sklearn.datasets import load_breast_cancer
5+
from sklearn.model_selection import train_test_split
6+
7+
8+
class TestXGBoostClassification(unittest.TestCase):
    """Unit tests for the from-scratch XGBoost binary classifier."""

    @classmethod
    def setUpClass(cls):
        # Fit one small, fast model that every test method shares.
        dataset = load_breast_cancer()
        features, labels = dataset.data, dataset.target

        # Hold out 20% of the data with a fixed seed for reproducibility.
        cls.X_train, cls.X_test, cls.y_train, cls.y_test = train_test_split(
            features, labels, test_size=0.2, random_state=42
        )

        # Keep the model small so the suite stays quick.
        cls.clf = MyXGBoostClassifier(
            n_estimator=10,
            max_depth=3,
            learning_rate=0.1,
        )
        cls.clf.fit(cls.X_train, cls.y_train)

    def test_model_created(self):
        # The fitted classifier must exist.
        self.assertIsNotNone(self.clf)

    def test_predict_runs(self):
        # predict() must yield exactly one label per test sample.
        predicted = self.clf.predict(self.X_test)
        self.assertEqual(len(predicted), len(self.y_test))

    def test_predict_proba_runs(self):
        probabilities = self.clf.predict_proba(self.X_test)

        # One probability per sample...
        self.assertEqual(len(probabilities), len(self.y_test))

        # ...and each must lie in [0, 1].
        self.assertTrue((probabilities >= 0).all())
        self.assertTrue((probabilities <= 1).all())

    def test_prediction_values_valid(self):
        # Class labels are restricted to {0, 1}.
        predicted = self.clf.predict(self.X_test)
        self.assertTrue(np.all(np.isin(predicted, [0, 1])))


if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)