|
| 1 | +from checkpoint import persist |
| 2 | +import sys |
| 3 | +import numpy as np |
| 4 | +import argparse |
| 5 | + |
| 6 | + |
def append_int(a: np.ndarray, n: int) -> np.ndarray:
    """Return a new flat array holding the elements of ``a`` followed by ``n``."""
    extended = np.append(a, n)
    return extended
| 9 | + |
| 10 | + |
def get_float(array: np.ndarray, idx: int) -> float:
    """Return the entry of ``array`` at position ``idx`` along its first axis."""
    # No conversion is applied: the indexed value is handed back as NumPy produced it.
    return array[idx]
| 14 | + |
| 15 | + |
def log(idx: int, k: int) -> None:
    """Write an ``idx / k`` progress marker to stderr, ending with a carriage return."""
    progress = f"{idx} / {k}"
    # The carriage return (instead of a newline) lets the next marker overwrite this one.
    print(progress, file=sys.stderr, end="\r", flush=True)
| 18 | + |
| 19 | + |
def _fit_predict(X: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Fit ``y`` on an intercept plus standardised ``X``; return the fitted values.

    ``X`` is standardised with a single mean and standard deviation taken over
    all of its entries, and the coefficients come from 10000 steps of batch
    gradient descent starting at zero.
    """
    centred = X - X.mean()
    spread = X.std()
    # A zero spread would turn the division into 0/0 and poison every later
    # score with NaN; in that case the centred values are already
    # (numerically) zero, so they are left unscaled.
    if spread > 0:
        centred = centred / spread
    design = np.c_[np.ones(len(centred)), centred]
    theta = np.zeros(design.shape[1])
    rate = 0.1 * (1 / y.size)
    for _ in range(10000):
        theta -= rate * (design.T @ (design @ theta - y))
    return design @ theta


def run(features: np.ndarray, target: np.ndarray, k: int) -> np.ndarray:
    """Greedily select up to ``k`` feature columns for a linear fit of ``target``.

    Each round fits a linear model on the columns selected so far, then adds
    the not-yet-selected column whose inner product with the residual
    ``target - prediction`` is largest. Selection stops early when every
    column has been selected, or when the best score is negative or NaN.

    Args:
        features: 2-D array with one row per sample and one column per
            candidate feature.
        target: one value per sample, shaped ``(n,)`` or ``(n, 1)``.
        k: maximum number of selection rounds.

    Returns:
        Sorted integer array holding the indices of the selected columns.
    """
    # Both are loop-invariant, so they are computed once instead of every round.
    y = np.asarray(target).ravel()
    all_columns = np.arange(features.shape[1])

    selected = np.array([], dtype=int)
    for idx in range(k):
        persist.self_coredump()
        log(idx, k)

        # Checked before fitting so the last round does not pay for a fit
        # whose result could never be used.
        candidates = np.setdiff1d(all_columns, selected)
        if candidates.size == 0:
            break

        X = features[:, selected]
        if X.size == 0:
            # Nothing selected yet (or no samples): predict zero everywhere.
            prediction = np.zeros(features.shape[0])
        else:
            prediction = _fit_predict(X, y)

        scores = features.T @ (y - prediction)
        # argmax returns the first maximum, so ties go to the lowest index.
        best = candidates[np.argmax(scores[candidates])]
        # Written as a negated ">=" so that a NaN score also ends selection.
        if not (scores[best] >= 0):
            break
        selected = np.unique(np.append(selected, best))
    return selected
| 67 | + |
| 68 | + |
def main(dataset: str, k: int) -> None:
    """Load the arrays stored for ``dataset``, run the selection, and print the result."""
    feature_matrix = np.load(f"experiment/omp/{dataset}_features.npy")
    target_values = np.load(f"experiment/omp/{dataset}_target.npy")
    print(run(feature_matrix, target_values, k))
| 74 | + |
| 75 | + |
if __name__ == "__main__":
    # Command-line entry point: a required dataset name and an optional --k budget.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "dataset",
        help="dataset to use",
        choices=["dataset_20KB", "dataset_large", "healthstudy"],
    )
    cli.add_argument(
        "--k",
        default=100000,
        type=int,
        help="number of features to select",
    )
    options = cli.parse_args()
    main(options.dataset, options.k)
0 commit comments