|
| 1 | +import json |
| 2 | +import hashlib |
| 3 | +import warnings |
| 4 | + |
| 5 | +import numpy as np |
| 6 | + |
| 7 | +try: |
| 8 | + from scipy.sparse import csr_matrix |
| 9 | + |
| 10 | + _SCIPY = True |
| 11 | +except ImportError: |
| 12 | + warnings.warn("Scipy not installed. FeatureHasher can only create dense matrices") |
| 13 | + _SCIPY = False |
| 14 | + |
| 15 | + |
class FeatureHasher:
    def __init__(self, n_dim=256, sparse=True):
        """
        Convert a collection of features to a fixed-dimensional matrix using
        the hashing trick. Uses the md5 hash.

        Parameters
        ----------
        n_dim : int (default: 256)
            The dimensionality of each example in the output feature matrix.
            Small numbers of features are likely to cause hash collisions, but
            large numbers will cause larger overall parameter dimensions for
            any (linear) learning agent.
        sparse : bool (default: True)
            Whether the resulting feature matrix should be a sparse
            `scipy.sparse.csr_matrix` or dense `np.ndarray`. Silently falls
            back to dense output when scipy is not installed.
        """
        self.n_dim = n_dim
        self.hash = hashlib.md5
        # Only honor `sparse=True` if scipy was importable at module load.
        self.sparse = sparse and _SCIPY

    def encode(self, examples):
        """
        Encode a collection of multi-featured examples into a
        `n_dim`-dimensional feature matrix via feature hashing.

        Feature hashing works by applying a hash function to the features of an
        example and using the hash values as column indices in the resulting
        feature matrix. The entries at each hashed feature column correspond to
        the values for that example and feature. For example, given the
        following two input examples:

        >>> examples = [
            {"furry": 1, "quadruped": 1, "domesticated": 1},
            {"nocturnal": 1, "quadruped": 1},
        ]

        and a hypothetical hash function H mapping strings to [0, 127], we have:

        >>> feature_mat = zeros(2, 128)
        >>> ex1_cols = [H("furry"), H("quadruped"), H("domesticated")]
        >>> ex2_cols = [H("nocturnal"), H("quadruped")]
        >>> feat_mat[0, ex1_cols] = 1
        >>> feat_mat[1, ex2_cols] = 1

        To better handle hash collisions, it is common to multiply the feature
        value by the sign of the digest for the corresponding feature name.

        Parameters
        ----------
        examples : dict or list of dicts
            A collection of N examples, each represented as a dict where keys
            correspond to the feature name and values correspond to the feature
            value.

        Returns
        -------
        table : `np.ndarray` or `scipy.sparse.csr_matrix` of shape (N, n_dim)
            The encoded feature matrix
        """
        if isinstance(examples, dict):
            examples = [examples]

        if self.sparse:
            return self._encode_sparse(examples)
        return self._encode_dense(examples)

    def _hash_feature(self, f_id):
        """
        Map a single feature identifier to a (column, sign) pair.

        Shared by the dense and sparse encoders so both paths hash features
        identically.

        Parameters
        ----------
        f_id : str, bytes, tuple, list, or dict
            The feature identifier (a key from an example dict).

        Returns
        -------
        col : int
            The hashed column index in [0, n_dim).
        sign : int
            Sign of the integer digest. NOTE: md5 digests are non-negative and
            a zero digest is vanishingly unlikely, so this is effectively
            always +1; kept for parity with the documented scheme.
        """
        if isinstance(f_id, str):
            f_id = f_id.encode("utf-8")

        # use json module to convert the feature id into a unique
        # string compatible with the buffer API (required by hashlib)
        if isinstance(f_id, (tuple, dict, list)):
            f_id = json.dumps(f_id, sort_keys=True).encode("utf-8")

        h = int(self.hash(f_id).hexdigest(), base=16)
        return h % self.n_dim, np.sign(h)

    def _encode_dense(self, examples):
        """Encode `examples` as a dense `np.ndarray` of shape (N, n_dim)."""
        N = len(examples)
        # BUGFIX: shape must be passed as a tuple; `np.zeros(N, self.n_dim)`
        # interprets the second argument as a dtype and raises TypeError.
        table = np.zeros((N, self.n_dim))

        for row, feat_dict in enumerate(examples):
            for f_id, val in feat_dict.items():
                col, sign = self._hash_feature(f_id)
                # accumulate so colliding features sum into the same column
                table[row, col] += sign * val

        return table

    def _encode_sparse(self, examples):
        """Encode `examples` as a `scipy.sparse.csr_matrix` of shape (N, n_dim)."""
        N = len(examples)
        # Build row/col index lists directly; unlike `zip(*idxs)` this also
        # handles the empty-input case (no examples / no features) cleanly.
        rows, cols, data = [], [], []

        for row, feat_dict in enumerate(examples):
            for f_id, val in feat_dict.items():
                col, sign = self._hash_feature(f_id)
                rows.append(row)
                cols.append(col)
                data.append(sign * val)

        # duplicate (row, col) entries are summed by the CSR constructor
        return csr_matrix((data, (rows, cols)), shape=(N, self.n_dim))
0 commit comments