Skip to content

Commit 6bd626f

Browse files
committed
Implemented DistMixin
1 parent 0955c4d commit 6bd626f

File tree

2 files changed

+194
-217
lines changed

2 files changed

+194
-217
lines changed

_Dist/NeuralNetworks/Base.py

+191-7
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,16 @@
2121

2222

2323
class Generator:
24-
def __init__(self, x, y, weights=None, name="Generator", shuffle=True):
24+
def __init__(self, x, y, name="Generator", weights=None, n_class=None, shuffle=True):
2525
self._cache = {}
2626
self._x, self._y = np.asarray(x, np.float32), np.asarray(y, np.float32)
2727
if weights is None:
2828
self._sample_weights = None
2929
else:
3030
self._sample_weights = np.asarray(weights, np.float32)
31-
if len(self._y.shape) == 1:
31+
if n_class is not None:
32+
self.n_class = n_class
33+
else:
3234
y_int = self._y.astype(np.int32)
3335
if np.allclose(self._y, y_int):
3436
assert y_int.min() == 0, "Labels should start from 0"
@@ -253,9 +255,9 @@ def init_from_data(self, x, y, x_test, y_test, sample_weights, names):
253255
else:
254256
self._tf_sample_weights = tf.placeholder(tf.float32, name="sample_weights")
255257

256-
self._train_generator = self._generator_base(x, y, self._sample_weights, name="TrainGenerator")
258+
self._train_generator = self._generator_base(x, y, "TrainGenerator", self._sample_weights, self.n_class)
257259
if x_test is not None and y_test is not None:
258-
self._test_generator = self._generator_base(x_test, y_test, name="TestGenerator")
260+
self._test_generator = self._generator_base(x_test, y_test, "TestGenerator")
259261
else:
260262
self._test_generator = None
261263
self.n_random_train_subset = int(len(self._train_generator) * 0.1)
@@ -1112,7 +1114,7 @@ def _gen_categorical_columns(self):
11121114
def _transform_data(self, data, name, train_name="train",
11131115
include_label=False, refresh_redundant_info=False, stage=3):
11141116
print("Transforming {0}data{2} at stage {1}".format(
1115-
"{} ".format(name) if stage >= 2 else "", stage,
1117+
"{} ".format(name), stage,
11161118
"" if name == train_name or not self.reuse_mean_and_std else
11171119
" with {} data".format(train_name),
11181120
))
@@ -1227,8 +1229,6 @@ def _get_data_from_file(self, file_type, test_rate):
12271229

12281230
def _load_data(self, data=None, numerical_idx=None, file_type="txt", names=("train", "test"),
12291231
shuffle=True, test_rate=0.1, stage=3):
1230-
if stage == 1:
1231-
names = (None, None)
12321232
use_cached_data = False
12331233
train_data = test_data = None
12341234
data_cache_folder = os.path.join(self._data_folder, "_Cache", self._name)
@@ -1417,3 +1417,187 @@ def evaluate(self, x, y, x_cv=None, y_cv=None, x_test=None, y_test=None):
14171417
attr[key] = value
14181418

14191419
return type(name_, bases, attr)
1420+
1421+
1422+
class DistMixin:
    """Mixin that adds higher-level training schemes to a network class.

    Provides rolling-window fitting, incremental fitting and k-fold /
    k-random cross validation on top of a host class that supplies
    ``fit``, ``_evaluate``, ``_load_data``, ``_gen_batch``,
    ``_generator_base`` and the ``_train_generator`` / ``_test_generator``
    attributes.  All public methods return ``self`` so calls can be chained.
    """

    def reset_all_variables(self):
        """Re-initialize every TensorFlow variable in the current session."""
        self._sess.run(tf.global_variables_initializer())

    def rolling_fit(self, train_rate=0.8, cv_rate=0.1, sample_weights=None, **kwargs):
        """Fit repeatedly on a window rolling forward through the training data.

        Each roll trains on ``train_rate`` of the data starting at the current
        cursor and evaluates on the following ``cv_rate`` slice (or on the
        pre-existing test generator, if one is attached).  Extra ``kwargs``
        are forwarded to ``fit``.

        :param train_rate: fraction of the data used for each training window.
        :param cv_rate: fraction used as the rolling evaluation slice when no
            test generator exists.
        :param sample_weights: optional per-sample weights; must match the
            training data length.
        :return: self
        """
        n_data = len(self._train_generator)
        if sample_weights is not None:
            n_weights = len(sample_weights)
            assert_msg = (
                "Sample weights should match training data, "
                "but n_weights={} & n_data={} found".format(n_weights, n_data)
            )
            assert n_weights == n_data, assert_msg
        n_train = int(train_rate * n_data)
        # When a dedicated test generator exists, its size defines the roll step
        n_test = int(cv_rate * n_data) if self._test_generator is None else len(self._test_generator)
        j, cursor, print_settings = 0, 0, kwargs.pop("print_settings", True)
        flag = test_flag = False
        if self._test_generator is not None:
            test_flag = True
            test_data, _ = self._test_generator.get_all_data()
            # Convention: last column of generator data is the label
            x_test, y_test = test_data[..., :-1], test_data[..., -1]
        else:
            x_test = y_test = None
        print("Rolling fit with train_rate={} and test_rate={}".format(train_rate, cv_rate))
        while True:
            j += 1
            train_cursor = cursor + n_train
            test_cursor = train_cursor + n_test
            if n_data - test_cursor < n_test:
                # Not enough data left for another full roll -> last iteration
                flag = True
                test_cursor = n_data
            with self._train_generator:
                if self._test_generator is None:
                    # Carve the next slice out of the training data as a
                    # temporary test set for this roll
                    test_data, _ = self._train_generator.get_range(train_cursor, test_cursor)
                    x_test, y_test = test_data[..., :-1], test_data[..., -1]
                    self._test_generator = self._generator_base(x_test, y_test, name="TestGenerator")
                self._train_generator.set_range(cursor, train_cursor)
                kwargs["print_settings"] = print_settings
                self.fit(**kwargs)
                x, y, _ = self._gen_batch(self._train_generator, self.n_random_train_subset, True)
                print(" - Performance of roll {}".format(j), end=" | ")
                self._evaluate(x, y, x_test, y_test)
            cursor += n_test
            print_settings = False
            if not test_flag:
                # The temporary test generator only lives for one roll
                self._test_generator = None
            if flag:
                break
        # Final pass: train on everything from the last cursor to the end
        with self._train_generator:
            self._train_generator.set_range(cursor)
            kwargs["print_settings"] = print_settings
            self.fit(**kwargs)
            if self._test_generator is not None:
                print(" - Performance of roll {}".format(j + 1), end=" | ")
                self._evaluate(x_test=x_test, y_test=y_test)
        return self

    def increment_fit(self, x=None, y=None, x_test=None, y_test=None, sample_weights=None, **kwargs):
        """Continue training on new data without resetting the model.

        When ``x``/``y`` are given they (optionally with ``x_test``/``y_test``)
        replace the current generators; otherwise training resumes on the
        existing generators.  Extra ``kwargs`` are forwarded to ``fit``.

        :return: self
        """
        if x is not None and y is not None:
            data = np.hstack([np.asarray(x, np.float32), np.asarray(y, np.float32).reshape([-1, 1])])
            if x_test is not None and y_test is not None:
                data = (data, np.hstack([
                    np.asarray(x_test, np.float32), np.asarray(y_test, np.float32).reshape([-1, 1])
                ]))
            x, y, x_test, y_test = self._load_data(data)
        else:
            data = None
            if self._test_generator is not None:
                test_data, _ = self._test_generator.get_all_data()
                x_test, y_test = test_data[..., :-1], test_data[..., -1]
        if sample_weights is not None:
            self._sample_weights = np.asarray(sample_weights, np.float32)
        self._handle_unbalance(y)
        self._handle_sparsity()
        if data is not None:
            # NOTE: Generator's signature is (x, y, name, weights, n_class, ...);
            # passing weights positionally in the third slot together with
            # name= as keyword raised TypeError. Use keywords, consistent
            # with init_from_data, and forward n_class so an incremental
            # batch that misses some labels keeps the full class count.
            self._train_generator = self._generator_base(
                x, y, name="Generator",
                weights=self._sample_weights, n_class=self.n_class
            )
            if x_test is not None and y_test is not None:
                self._test_generator = self._generator_base(x_test, y_test, name="TestGenerator")
        self.fit(**kwargs)
        x, y, _ = self._gen_batch(self._train_generator, self.n_random_train_subset, True)
        print(" - Performance of increment fit", end=" | ")
        self._evaluate(x, y, x_test, y_test)
        return self

    def _k_series_initialization(self, k, data):
        """Shared setup for k_fold / k_random: load data and build run names.

        :return: (x, y, x_test, y_test, names) where names is a list of
            ``("train{i}", "cv{i}")`` tuples, one per run.
        """
        self.init_data_info()
        x, y, x_test, y_test = self._load_data(data, stage=1)
        # Re-run the stage-2 transform on the held-out test set alone
        x_test, y_test, *_ = self._load_data(
            np.hstack([x_test, y_test.reshape([-1, 1])]),
            names=("test", None), test_rate=0, stage=2
        )
        names = [("train{}".format(i), "cv{}".format(i)) for i in range(k)]
        return x, y, x_test, y_test, names

    def _k_series_evaluation(self, i, x_test, y_test):
        """Print train / cv / test performance for run ``i`` (0-based)."""
        train, sw_train = self._train_generator.get_all_data()
        cv, sw_cv = self._test_generator.get_all_data()
        x, y = train[..., :-1], train[..., -1]
        x_cv, y_cv = cv[..., :-1], cv[..., -1]
        print(" - Performance of run {}".format(i + 1), end=" | ")
        self._evaluate(x, y, x_cv, y_cv, x_test, y_test)

    def _merge_preprocessors_from_k_series(self, names):
        """Collapse the per-run pre-processors into 'train' and 'cv' entries."""
        train_names, cv_names = [name[0] for name in names], [name[1] for name in names]
        self._merge_preprocessors_by_names("train", train_names)
        self._merge_preprocessors_by_names("cv", cv_names)

    def _merge_preprocessors_by_names(self, target, names):
        """Merge the pre-processors listed in ``names`` into key ``target``.

        With a single name the processor is simply re-keyed; otherwise a new
        processor is built whose mean / std are the element-wise averages of
        the individual runs' statistics.
        """
        if len(names) == 1:
            self._pre_processors[target] = self._pre_processors.pop(names[0])
            # Without this return the popped key would be popped again below,
            # raising KeyError
            return
        pre_processors = [self._pre_processors.pop(name) for name in names]
        methods = [pre_processor.method for pre_processor in pre_processors]
        scale_methods = [pre_processor.scale_method for pre_processor in pre_processors]
        assert Toolbox.all_same(methods), "Pre_process method should be all_same"
        assert Toolbox.all_same(scale_methods), "Scale method should be all_same"
        new_processor = PreProcessor(methods[0], scale_methods[0])
        new_processor.mean = np.mean([pre_processor.mean for pre_processor in pre_processors], axis=0)
        new_processor.std = np.mean([pre_processor.std for pre_processor in pre_processors], axis=0)
        self._pre_processors[target] = new_processor

    def k_fold(self, k=10, data=None, test_rate=0., sample_weights=None, **kwargs):
        """Run k-fold cross validation, resetting variables before each fold.

        :param k: number of folds.
        :param data: optional raw data passed through ``_load_data``.
        :param test_rate: held-out test fraction (handled by ``_load_data``).
        :param sample_weights: optional per-sample weights; sliced per fold.
        :return: self
        """
        x, y, x_test, y_test, names = self._k_series_initialization(k, data)
        n_batch = int(len(x) / k)
        all_idx = list(range(len(x)))
        print_settings = True
        if sample_weights is not None:
            self._sample_weights = np.asarray(sample_weights, np.float32)
        sample_weights_store = self._sample_weights
        print("Training k-fold with k={} and test_rate={}".format(k, test_rate))
        for i in range(k):
            self.reset_all_variables()
            cv_idx = list(range(i * n_batch, (i + 1) * n_batch))
            train_idx = [j for j in all_idx if j < i * n_batch or j >= (i + 1) * n_batch]
            x_cv, y_cv = x[cv_idx], y[cv_idx]
            x_train, y_train = x[train_idx], y[train_idx]
            if sample_weights is not None:
                self._sample_weights = sample_weights_store[train_idx]
            else:
                self._sample_weights = None
            kwargs["print_settings"] = print_settings
            kwargs["names"] = names[i]
            self.data_info["stage"] = 2
            self.fit(x_train, y_train, x_cv, y_cv, **kwargs)
            self._k_series_evaluation(i, x_test, y_test)
            print_settings = False
        self.data_info["stage"] = 3
        self._merge_preprocessors_from_k_series(names)
        self._sample_weights = sample_weights_store
        if x_test is not None and y_test is not None:
            self._test_generator = self._generator_base(x_test, y_test, name="TestGenerator")
        return self

    def k_random(self, k=3, data=None, cv_rate=0.1, test_rate=0., sample_weights=None, **kwargs):
        """Run k repeated random train/cv splits (Monte-Carlo cross validation).

        :param k: number of random splits.
        :param data: optional raw data passed through ``_load_data``.
        :param cv_rate: fraction of the data used as cv set in each split.
        :param test_rate: held-out test fraction (handled by ``_load_data``).
        :param sample_weights: optional per-sample weights; sliced per split.
        :return: self
        """
        x, y, x_test, y_test, names = self._k_series_initialization(k, data)
        n_cv = int(cv_rate * len(x))
        print_settings = True
        if sample_weights is not None:
            self._sample_weights = np.asarray(sample_weights, np.float32)
        sample_weights_store = self._sample_weights
        print("Training k-random with k={}, cv_rate={} and test_rate={}".format(k, cv_rate, test_rate))
        for i in range(k):
            self.reset_all_variables()
            all_idx = np.random.permutation(len(x))
            cv_idx, train_idx = all_idx[:n_cv], all_idx[n_cv:]
            x_cv, y_cv = x[cv_idx], y[cv_idx]
            x_train, y_train = x[train_idx], y[train_idx]
            if sample_weights is not None:
                self._sample_weights = sample_weights_store[train_idx]
            else:
                self._sample_weights = None
            kwargs["print_settings"] = print_settings
            kwargs["names"] = names[i]
            self.data_info["stage"] = 2
            self.fit(x_train, y_train, x_cv, y_cv, **kwargs)
            self._k_series_evaluation(i, x_test, y_test)
            print_settings = False
        self.data_info["stage"] = 3
        self._merge_preprocessors_from_k_series(names)
        self._sample_weights = sample_weights_store
        if x_test is not None and y_test is not None:
            self._test_generator = self._generator_base(x_test, y_test, name="TestGenerator")
        return self

0 commit comments

Comments
 (0)