|
21 | 21 |
|
22 | 22 |
|
23 | 23 | class Generator:
|
24 |
| - def __init__(self, x, y, weights=None, name="Generator", shuffle=True): |
| 24 | + def __init__(self, x, y, name="Generator", weights=None, n_class=None, shuffle=True): |
25 | 25 | self._cache = {}
|
26 | 26 | self._x, self._y = np.asarray(x, np.float32), np.asarray(y, np.float32)
|
27 | 27 | if weights is None:
|
28 | 28 | self._sample_weights = None
|
29 | 29 | else:
|
30 | 30 | self._sample_weights = np.asarray(weights, np.float32)
|
31 |
| - if len(self._y.shape) == 1: |
| 31 | + if n_class is not None: |
| 32 | + self.n_class = n_class |
| 33 | + else: |
32 | 34 | y_int = self._y.astype(np.int32)
|
33 | 35 | if np.allclose(self._y, y_int):
|
34 | 36 | assert y_int.min() == 0, "Labels should start from 0"
|
@@ -253,9 +255,9 @@ def init_from_data(self, x, y, x_test, y_test, sample_weights, names):
|
253 | 255 | else:
|
254 | 256 | self._tf_sample_weights = tf.placeholder(tf.float32, name="sample_weights")
|
255 | 257 |
|
256 |
| - self._train_generator = self._generator_base(x, y, self._sample_weights, name="TrainGenerator") |
| 258 | + self._train_generator = self._generator_base(x, y, "TrainGenerator", self._sample_weights, self.n_class) |
257 | 259 | if x_test is not None and y_test is not None:
|
258 |
| - self._test_generator = self._generator_base(x_test, y_test, name="TestGenerator") |
| 260 | + self._test_generator = self._generator_base(x_test, y_test, "TestGenerator") |
259 | 261 | else:
|
260 | 262 | self._test_generator = None
|
261 | 263 | self.n_random_train_subset = int(len(self._train_generator) * 0.1)
|
@@ -1112,7 +1114,7 @@ def _gen_categorical_columns(self):
|
1112 | 1114 | def _transform_data(self, data, name, train_name="train",
|
1113 | 1115 | include_label=False, refresh_redundant_info=False, stage=3):
|
1114 | 1116 | print("Transforming {0}data{2} at stage {1}".format(
|
1115 |
| - "{} ".format(name) if stage >= 2 else "", stage, |
| 1117 | + "{} ".format(name), stage, |
1116 | 1118 | "" if name == train_name or not self.reuse_mean_and_std else
|
1117 | 1119 | " with {} data".format(train_name),
|
1118 | 1120 | ))
|
@@ -1227,8 +1229,6 @@ def _get_data_from_file(self, file_type, test_rate):
|
1227 | 1229 |
|
1228 | 1230 | def _load_data(self, data=None, numerical_idx=None, file_type="txt", names=("train", "test"),
|
1229 | 1231 | shuffle=True, test_rate=0.1, stage=3):
|
1230 |
| - if stage == 1: |
1231 |
| - names = (None, None) |
1232 | 1232 | use_cached_data = False
|
1233 | 1233 | train_data = test_data = None
|
1234 | 1234 | data_cache_folder = os.path.join(self._data_folder, "_Cache", self._name)
|
@@ -1417,3 +1417,187 @@ def evaluate(self, x, y, x_cv=None, y_cv=None, x_test=None, y_test=None):
|
1417 | 1417 | attr[key] = value
|
1418 | 1418 |
|
1419 | 1419 | return type(name_, bases, attr)
|
| 1420 | + |
| 1421 | + |
class DistMixin:
    """Mixin adding alternative training schemes: rolling-window fit,
    incremental fit, k-fold and k-random cross validation.

    NOTE(review): this mixin assumes the host class provides the TF session
    (``self._sess``), the data generators (``self._train_generator``,
    ``self._test_generator``, ``self._generator_base``), pre-processors
    (``self._pre_processors``), ``self.fit``, ``self._evaluate``,
    ``self._gen_batch``, ``self._load_data``, ``self._handle_unbalance``,
    ``self._handle_sparsity``, ``self.n_class``, ``self.data_info`` and
    ``self.n_random_train_subset`` — confirm against the concrete model class.
    """

    def reset_all_variables(self):
        """Re-initialize every TF variable in the session (fresh weights)."""
        self._sess.run(tf.global_variables_initializer())

    def rolling_fit(self, train_rate=0.8, cv_rate=0.1, sample_weights=None, **kwargs):
        """Fit on successive forward-rolling windows of the training data.

        Parameters
        ----------
        train_rate : float
            Fraction of the training set used as each roll's train window.
        cv_rate : float
            Fraction used as the rolling test window when no dedicated test
            generator exists (ignored otherwise).
        sample_weights : array-like or None
            Per-sample weights; must match the training data length.
            NOTE(review): they are only length-checked here, never applied —
            confirm whether they should be forwarded to the generators.
        **kwargs
            Forwarded to ``self.fit``.

        Returns
        -------
        self
        """
        n_data = len(self._train_generator)
        if sample_weights is not None:
            n_weights = len(sample_weights)
            assert_msg = (
                "Sample weights should match training data, "
                "but n_weights={} & n_data={} found".format(n_weights, n_data)
            )
            assert n_weights == n_data, assert_msg
        n_train = int(train_rate * n_data)
        n_test = int(cv_rate * n_data) if self._test_generator is None else len(self._test_generator)
        j, cursor, print_settings = 0, 0, kwargs.pop("print_settings", True)
        flag = test_flag = False
        if self._test_generator is not None:
            test_flag = True
            test_data, _ = self._test_generator.get_all_data()
            # Project layout: features in all-but-last column, label in last
            x_test, y_test = test_data[..., :-1], test_data[..., -1]
        else:
            x_test = y_test = None
        print("Rolling fit with train_rate={} and test_rate={}".format(train_rate, cv_rate))
        while True:
            j += 1
            train_cursor = cursor + n_train
            test_cursor = train_cursor + n_test
            if n_data - test_cursor < n_test:
                # Not enough data left for another full roll: absorb the tail
                flag = True
                test_cursor = n_data
            with self._train_generator:
                if self._test_generator is None:
                    # Carve the slice after the train window out of the
                    # training stream as a temporary test set for this roll
                    test_data, _ = self._train_generator.get_range(train_cursor, test_cursor)
                    x_test, y_test = test_data[..., :-1], test_data[..., -1]
                    self._test_generator = self._generator_base(x_test, y_test, name="TestGenerator")
                self._train_generator.set_range(cursor, train_cursor)
                kwargs["print_settings"] = print_settings
                self.fit(**kwargs)
                x, y, _ = self._gen_batch(self._train_generator, self.n_random_train_subset, True)
                print(" - Performance of roll {}".format(j), end=" | ")
                self._evaluate(x, y, x_test, y_test)
            cursor += n_test
            print_settings = False
            if not test_flag:
                # The temporary test generator belongs to this roll only
                self._test_generator = None
            if flag:
                break
        # Final pass on everything from the last cursor to the end
        with self._train_generator:
            self._train_generator.set_range(cursor)
            kwargs["print_settings"] = print_settings
            self.fit(**kwargs)
            if self._test_generator is not None:
                print(" - Performance of roll {}".format(j + 1), end=" | ")
                self._evaluate(x_test=x_test, y_test=y_test)
        return self

    def increment_fit(self, x=None, y=None, x_test=None, y_test=None, sample_weights=None, **kwargs):
        """Continue training, optionally on newly supplied data.

        When ``x``/``y`` are given they are packed into the project's
        feature-plus-label layout, pushed through ``self._load_data`` and the
        train generator is rebuilt; otherwise training resumes on the
        existing generators. Returns ``self``.
        """
        if x is not None and y is not None:
            data = np.hstack([np.asarray(x, np.float32), np.asarray(y, np.float32).reshape([-1, 1])])
            if x_test is not None and y_test is not None:
                data = (data, np.hstack([
                    np.asarray(x_test, np.float32), np.asarray(y_test, np.float32).reshape([-1, 1])
                ]))
            x, y, x_test, y_test = self._load_data(data)
        else:
            data = None
            if self._test_generator is not None:
                test_data, _ = self._test_generator.get_all_data()
                x_test, y_test = test_data[..., :-1], test_data[..., -1]
        if sample_weights is not None:
            self._sample_weights = np.asarray(sample_weights, np.float32)
        self._handle_unbalance(y)
        self._handle_sparsity()
        if data is not None:
            # BUG FIX: the generator signature is (x, y, name, weights, n_class, ...);
            # the old call passed the weights array positionally into `name` and then
            # supplied `name` again by keyword -> TypeError at runtime.
            self._train_generator = self._generator_base(x, y, "Generator", self._sample_weights, self.n_class)
            if x_test is not None and y_test is not None:
                self._test_generator = self._generator_base(x_test, y_test, name="TestGenerator")
        self.fit(**kwargs)
        x, y, _ = self._gen_batch(self._train_generator, self.n_random_train_subset, True)
        print(" - Performance of increment fit", end=" | ")
        self._evaluate(x, y, x_test, y_test)
        return self

    def _k_series_initialization(self, k, data):
        """Shared setup for k_fold / k_random: load data, pre-transform the
        test split, and build per-run preprocessor names."""
        self.init_data_info()
        x, y, x_test, y_test = self._load_data(data, stage=1)
        x_test, y_test, *_ = self._load_data(
            np.hstack([x_test, y_test.reshape([-1, 1])]),
            names=("test", None), test_rate=0, stage=2
        )
        names = [("train{}".format(i), "cv{}".format(i)) for i in range(k)]
        return x, y, x_test, y_test, names

    def _k_series_evaluation(self, i, x_test, y_test):
        """Evaluate run ``i`` on its train, cv and (optional) test sets."""
        train, sw_train = self._train_generator.get_all_data()
        cv, sw_cv = self._test_generator.get_all_data()
        x, y = train[..., :-1], train[..., -1]
        x_cv, y_cv = cv[..., :-1], cv[..., -1]
        print(" - Performance of run {}".format(i + 1), end=" | ")
        self._evaluate(x, y, x_cv, y_cv, x_test, y_test)

    def _merge_preprocessors_from_k_series(self, names):
        """Collapse per-run train/cv pre-processors into single ones."""
        train_names, cv_names = [name[0] for name in names], [name[1] for name in names]
        self._merge_preprocessors_by_names("train", train_names)
        self._merge_preprocessors_by_names("cv", cv_names)

    def _merge_preprocessors_by_names(self, target, names):
        """Merge the pre-processors listed in ``names`` into one stored under
        ``target``, averaging their fitted means and stds."""
        if len(names) == 1:
            self._pre_processors[target] = self._pre_processors.pop(names[0])
            # BUG FIX: without this return the single processor was popped
            # again below, raising KeyError
            return
        pre_processors = [self._pre_processors.pop(name) for name in names]
        methods = [pre_processor.method for pre_processor in pre_processors]
        scale_methods = [pre_processor.scale_method for pre_processor in pre_processors]
        assert Toolbox.all_same(methods), "Pre_process method should be all_same"
        assert Toolbox.all_same(scale_methods), "Scale method should be all_same"
        new_processor = PreProcessor(methods[0], scale_methods[0])
        new_processor.mean = np.mean([pre_processor.mean for pre_processor in pre_processors], axis=0)
        new_processor.std = np.mean([pre_processor.std for pre_processor in pre_processors], axis=0)
        self._pre_processors[target] = new_processor

    def k_fold(self, k=10, data=None, test_rate=0., sample_weights=None, **kwargs):
        """Run k-fold cross validation with contiguous folds.

        NOTE(review): ``test_rate`` only appears in the progress message here;
        confirm whether ``self._load_data`` picks it up elsewhere.
        Returns ``self``.
        """
        x, y, x_test, y_test, names = self._k_series_initialization(k, data)
        n_batch = int(len(x) / k)
        all_idx = list(range(len(x)))
        print_settings = True
        if sample_weights is not None:
            self._sample_weights = np.asarray(sample_weights, np.float32)
        sample_weights_store = self._sample_weights
        print("Training k-fold with k={} and test_rate={}".format(k, test_rate))
        for i in range(k):
            self.reset_all_variables()
            cv_idx = list(range(i * n_batch, (i + 1) * n_batch))
            train_idx = [j for j in all_idx if j < i * n_batch or j >= (i + 1) * n_batch]
            x_cv, y_cv = x[cv_idx], y[cv_idx]
            x_train, y_train = x[train_idx], y[train_idx]
            if sample_weights is not None:
                self._sample_weights = sample_weights_store[train_idx]
            else:
                self._sample_weights = None
            kwargs["print_settings"] = print_settings
            kwargs["names"] = names[i]
            self.data_info["stage"] = 2
            self.fit(x_train, y_train, x_cv, y_cv, **kwargs)
            self._k_series_evaluation(i, x_test, y_test)
            print_settings = False
        self.data_info["stage"] = 3
        self._merge_preprocessors_from_k_series(names)
        self._sample_weights = sample_weights_store
        if x_test is not None and y_test is not None:
            self._test_generator = self._generator_base(x_test, y_test, name="TestGenerator")
        return self

    def k_random(self, k=3, data=None, cv_rate=0.1, test_rate=0., sample_weights=None, **kwargs):
        """Run k repeated random train/cv splits (Monte-Carlo CV).

        Returns ``self``.
        """
        x, y, x_test, y_test, names = self._k_series_initialization(k, data)
        n_cv = int(cv_rate * len(x))
        print_settings = True
        if sample_weights is not None:
            self._sample_weights = np.asarray(sample_weights, np.float32)
        sample_weights_store = self._sample_weights
        print("Training k-random with k={}, cv_rate={} and test_rate={}".format(k, cv_rate, test_rate))
        for i in range(k):
            self.reset_all_variables()
            all_idx = np.random.permutation(len(x))
            cv_idx, train_idx = all_idx[:n_cv], all_idx[n_cv:]
            x_cv, y_cv = x[cv_idx], y[cv_idx]
            x_train, y_train = x[train_idx], y[train_idx]
            if sample_weights is not None:
                self._sample_weights = sample_weights_store[train_idx]
            else:
                self._sample_weights = None
            kwargs["print_settings"] = print_settings
            kwargs["names"] = names[i]
            self.data_info["stage"] = 2
            self.fit(x_train, y_train, x_cv, y_cv, **kwargs)
            self._k_series_evaluation(i, x_test, y_test)
            print_settings = False
        self.data_info["stage"] = 3
        self._merge_preprocessors_from_k_series(names)
        self._sample_weights = sample_weights_store
        if x_test is not None and y_test is not None:
            self._test_generator = self._generator_base(x_test, y_test, name="TestGenerator")
        return self
0 commit comments