coreset.py
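
"""Coreset-only continual-learning baseline.

For each task in sequence, a coreset is selected from the task's training
data (randomly, via greedy k-center, via PCA + k-center, or class-balanced
across tasks), a TyXe variational BNN head is finetuned on that coreset
alone, and per-task test accuracies are reported after every task.
"""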
import torch
import tyxe
import random
import copy
import heapq
import fire
import numpy as np
from sklearn.decomposition import PCA
import pyro.distributions as dist
from models.mlp import MLP
from data.data_generator import fetch_datasets
from utils.util import DEVICE, save_results, get_model_name
from utils.task_config import load_task_config
from trainer.finetune import finetune_over_coreset
from typing import Optional


def update_coreset(prev_coreset, train_loader, coreset_size, selection_method='random', curr_idx=0):
    if isinstance(train_loader, list) and selection_method == 'class_balanced':
        # Class-balanced selection: draw an equal share of the coreset from
        # every task seen so far (train_loader is the full list of loaders).
        assert curr_idx > 0
        tasks_so_far_data = [train_loader[i] for i in range(curr_idx)]
        combined_data = []
        num_tasks = len(tasks_so_far_data)
        samples_per_task = coreset_size // num_tasks
        for curr_task_loader in tasks_so_far_data:
            curr_task_data = list(curr_task_loader.dataset)
            combined_data.extend(random.sample(curr_task_data, samples_per_task))
        # Top up from the latest task if coreset_size is not evenly divisible
        # by the number of tasks.
        remaining_samples = coreset_size - len(combined_data)
        if remaining_samples > 0:
            combined_data.extend(random.sample(list(tasks_so_far_data[-1].dataset), remaining_samples))
        return combined_data
    elif isinstance(train_loader, torch.utils.data.DataLoader):
        curr_task_data = list(train_loader.dataset)
        # Truncate the current task's data to coreset_size * 2 candidates to
        # speed up selection.
        curr_task_data = random.sample(curr_task_data, min(coreset_size * 2, len(curr_task_data)))
        combined_data = curr_task_data + prev_coreset if prev_coreset else curr_task_data
        if selection_method == 'random':
            curr_coreset = random.sample(combined_data, min(coreset_size, len(combined_data)))
        elif selection_method == 'k-center':
            curr_coreset = k_center_coreset(combined_data, coreset_size)
        elif selection_method == 'pca-k-center':
            curr_coreset = pca_k_center_coreset(combined_data, coreset_size)
        else:
            raise ValueError(f"Invalid selection method: {selection_method}")
        return curr_coreset
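
# Illustrative call patterns for update_coreset (a sketch, not executed;
# 'loader' and 'train_loaders' are hypothetical stand-ins for this repo's
# DataLoaders):
#
#   # Per-task selection from a single DataLoader:
#   coreset = update_coreset(prev_coreset=[], train_loader=loader,
#                            coreset_size=200, selection_method='k-center')
#
#   # Class-balanced selection pools all tasks seen so far; it expects the
#   # full list of loaders plus the 1-based index of the current task:
#   coreset = update_coreset([], train_loaders, 200, 'class_balanced', curr_idx=3)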


def k_center_coreset(data, coreset_size, via_pca=False):
    if not via_pca:
        data_array = np.array([x.cpu().numpy() for x, _ in data])
    else:
        data_array = data
    num_points = len(data_array)
    # Initialize the coreset with the first data point (deterministic start).
    initial_index = 0
    coreset_indices = [initial_index]
    # Distance from every point to its nearest coreset member so far.
    distances = np.full(num_points, np.inf)
    distances[initial_index] = 0
    for i in range(num_points):
        if i != initial_index:
            distances[i] = np.linalg.norm(data_array[i] - data_array[initial_index])
    # Max-heap (via negated distances) for finding the farthest point.
    # Note: the loop variable is named d to avoid shadowing the dist module.
    heap = [(-d, i) for i, d in enumerate(distances)]
    heapq.heapify(heap)
    # Greedily add the point farthest from the current coreset; the min()
    # guards against coreset_size exceeding the candidate pool.
    while len(coreset_indices) < min(coreset_size, num_points):
        _, farthest_point_index = heapq.heappop(heap)
        if farthest_point_index not in coreset_indices:
            coreset_indices.append(farthest_point_index)
            # Update nearest-coreset distances for the remaining points.
            for i in range(num_points):
                if i not in coreset_indices:
                    new_distance = np.linalg.norm(data_array[i] - data_array[farthest_point_index])
                    if new_distance < distances[i]:
                        distances[i] = new_distance
            # Rebuild the heap with the updated distances.
            heap = [(-distances[j], j) for j in range(num_points) if j not in coreset_indices]
            heapq.heapify(heap)
    if via_pca:
        return coreset_indices
    return [data[i] for i in coreset_indices]
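
# k_center_coreset above is greedy farthest-point traversal: each step adds
# the point farthest from the current coreset, the classic 2-approximation
# to the k-center objective. A minimal numeric sketch (assumed shapes:
# tensors paired with integer labels, as produced by the DataLoaders):
#
#   pts = [(torch.tensor([0.0, 0.0]), 0),
#          (torch.tensor([1.0, 0.0]), 0),
#          (torch.tensor([10.0, 0.0]), 1)]
#   k_center_coreset(pts, 2)  # keeps index 0 (the start) and index 2,
#                             # the farthest point from it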


def pca_k_center_coreset(data, coreset_size, n_components=20):
    data_array = np.array([x.cpu().numpy() for x, _ in data])
    # Apply PCA to reduce dimensionality before the k-center selection.
    pca = PCA(n_components=n_components)
    reduced_data = pca.fit_transform(data_array)
    coreset_indices = k_center_coreset(reduced_data, coreset_size, via_pca=True)
    return [data[i] for i in coreset_indices]
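
# Note: pca_k_center_coreset runs the same greedy selection as above, but
# measures distances in an n_components-dimensional PCA projection, which is
# cheaper for high-dimensional inputs (e.g. flattened 784-dim MNIST images).
# Only the distance geometry is approximated; the returned elements are the
# original (x, y) pairs, recovered through the selected indices.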


def run_coreset_only(
    num_tasks: int = 5,
    num_epochs: int = 10,
    experiment_name: str = 'test',
    task_config: str = '',
    batch_size: int = 256,
    coreset_size: int = 200,
    coreset_method: str = 'random',
    model_suffix: Optional[str] = None,
    finetune_method: Optional[str] = None,
):
    input_dim, output_dim, hidden_sizes, single_head, data_name = load_task_config(task_config)
    train_loaders, test_loaders = fetch_datasets(batch_size, num_tasks, data_name)
    net = MLP(input_dim, hidden_sizes, output_dim, num_tasks, single_head)
    net.to(DEVICE)
    num_heads = 1 if single_head else num_tasks
    obs = tyxe.likelihoods.Categorical(-1)  # dataset_size is set per task below; Bernoulli(-1, event_dim=1) for binary
    prior = tyxe.priors.IIDPrior(dist.Normal(0, 1), expose_all=False, hide_all=True)
    guide = None
    mlp = tyxe.VariationalBNN(net, prior, obs, guide)
    heads_list = [getattr(mlp.net, f"Head_{i+1}") for i in range(num_heads)]
    print(f"heads_list: {heads_list}")
    # Snapshot the initial state of every head (PyroLinear) so each task can
    # start from it and later restore its own trained weights for evaluation.
    head_state_dicts = [copy.deepcopy(head.state_dict()) for head in heads_list]
    prev_coreset = []
    for i, train_loader in enumerate(train_loaders, 1):
        # Point forward passes at the current task's head for training.
        head_idx = i if not single_head else 1
        mlp.net.set_task(head_idx)
        print(f"Current head being used for training mlp.net: {mlp.net.get_task()}")
        heads_list[head_idx - 1].load_state_dict(head_state_dicts[head_idx - 1])
        # Update the coreset with data from the current task.
        if coreset_size == 0:
            curr_coreset = []
        elif coreset_method == 'class_balanced':
            curr_coreset = update_coreset(prev_coreset, train_loaders, coreset_size, coreset_method, curr_idx=i)
        else:
            curr_coreset = update_coreset(prev_coreset, train_loader, coreset_size, coreset_method)
        obs.dataset_size = len(train_loader.sampler)
        # Finetune the model on the coreset data only.
        finetune_over_coreset(mlp, curr_coreset, num_epochs, callback=None, batch_size=batch_size)
        # Save the trained head's weights for later prediction.
        head_state_dicts[head_idx - 1] = copy.deepcopy(heads_list[head_idx - 1].state_dict())
        print(f"Train over task {i} Accuracies:")
        prev_task_acc = []
        for j, test_loader in enumerate(test_loaders[:i], 1):
            # Evaluate each task seen so far with its own head.
            eval_head_idx = j if not single_head else 1
            mlp.net.set_task(eval_head_idx)
            print(f"Current head being used for evaluating mlp.net: {mlp.net.get_task()}")
            heads_list[eval_head_idx - 1].load_state_dict(head_state_dicts[eval_head_idx - 1])
            correct = 0
            total = 0
            for x, y in test_loader:
                x, y = x.to(DEVICE), y.to(DEVICE)
                preds = mlp.predict(x, num_predictions=8)
                correct += (preds.argmax(-1) == y).sum().item()
                total += len(y)
            accuracy = correct / total
            print(f"Task {j} Accuracy: {accuracy:.4f}")
            prev_task_acc.append(accuracy)
        avg_acc = sum(prev_task_acc) / len(prev_task_acc)
        save_results(get_model_name('coreset_only', coreset_size, coreset_method, model_suffix), i, prev_task_acc, avg_acc, data_name, experiment_name)
        print(f"Train over task {i} avg: {avg_acc}")
        # Carry the coreset forward to the next task.
        prev_coreset = curr_coreset


if __name__ == '__main__':
    fire.Fire(run_coreset_only)
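
# Example invocation via the fire CLI (a sketch; valid task_config values
# depend on the config files shipped with this repo):
#
#   python coreset.py --num_tasks=5 --num_epochs=10 --batch_size=256 \
#       --coreset_size=200 --coreset_method=k-center \
#       --task_config=<config> --experiment_name=test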