Commit ea4df8f

split preprocessing script by application
1 parent e6cf942 commit ea4df8f

3 files changed: +772 −1 lines changed


preprocessing/preprocess.py renamed to preprocessing/dsp.py (+1 −1)

@@ -187,7 +187,7 @@ def DFT(frame, fs=44000):
 
 
 #######################################################################
-#                              DSP Utils                              #
+#                         Preprocessing Utils                         #
 #######################################################################
 
 
preprocessing/general.py (new file, +122)

import json
import hashlib
import warnings

import numpy as np

try:
    from scipy.sparse import csr_matrix

    _SCIPY = True
except ImportError:
    warnings.warn("Scipy not installed. FeatureHasher can only create dense matrices")
    _SCIPY = False


class FeatureHasher:
    def __init__(self, n_dim=256, sparse=True):
        """
        Convert a collection of features to a fixed-dimensional matrix using
        the hashing trick. Uses the md5 hash by default.

        Parameters
        ----------
        n_dim : int (default: 256)
            The dimensionality of each example in the output feature matrix.
            Small values of `n_dim` are likely to cause hash collisions, but
            large values will result in larger overall parameter dimensions
            for any (linear) learning agent.
        sparse : bool (default: True)
            Whether the resulting feature matrix should be a sparse
            `scipy.sparse.csr_matrix` or a dense `np.ndarray`.
        """
        self.n_dim = n_dim
        self.hash = hashlib.md5
        self.sparse = sparse and _SCIPY

    def encode(self, examples):
        """
        Encode a collection of multi-featured examples into an
        `n_dim`-dimensional feature matrix via feature hashing.

        Feature hashing works by applying a hash function to the features of an
        example and using the hash values as column indices in the resulting
        feature matrix. The entries at each hashed feature column correspond to
        the values for that example and feature. For example, given the
        following two input examples:

        >>> examples = [
            {"furry": 1, "quadruped": 1, "domesticated": 1},
            {"nocturnal": 1, "quadruped": 1},
        ]

        and a hypothetical hash function H mapping strings to [0, 127], we have:

        >>> feat_mat = np.zeros((2, 128))
        >>> ex1_cols = [H("furry"), H("quadruped"), H("domesticated")]
        >>> ex2_cols = [H("nocturnal"), H("quadruped")]
        >>> feat_mat[0, ex1_cols] = 1
        >>> feat_mat[1, ex2_cols] = 1

        To better handle hash collisions, it is common to multiply the feature
        value by the sign of the digest for the corresponding feature name.

        Parameters
        ----------
        examples : dict or list of dicts
            A collection of N examples, each represented as a dict where keys
            correspond to the feature name and values correspond to the feature
            value.

        Returns
        -------
        table : `np.ndarray` or `scipy.sparse.csr_matrix` of shape (N, n_dim)
            The encoded feature matrix.
        """
        if isinstance(examples, dict):
            examples = [examples]

        sparse = self.sparse
        return self._encode_sparse(examples) if sparse else self._encode_dense(examples)

    def _encode_dense(self, examples):
        N = len(examples)
        table = np.zeros((N, self.n_dim))  # dense

        for row, feat_dict in enumerate(examples):
            for f_id, val in feat_dict.items():
                if isinstance(f_id, str):
                    f_id = f_id.encode("utf-8")

                # use json module to convert the feature id into a unique
                # string compatible with the buffer API (required by hashlib)
                if isinstance(f_id, (tuple, dict, list)):
                    f_id = json.dumps(f_id, sort_keys=True).encode("utf-8")

                h = int(self.hash(f_id).hexdigest(), base=16)
                col = h % self.n_dim
                sign = 1 if h >= 0 else -1  # md5 digests are non-negative, so this is always +1
                table[row, col] += sign * val

        return table

    def _encode_sparse(self, examples):
        N = len(examples)
        idxs, data = [], []

        for row, feat_dict in enumerate(examples):
            for f_id, val in feat_dict.items():
                if isinstance(f_id, str):
                    f_id = f_id.encode("utf-8")

                # use json module to convert the feature id into a unique
                # string compatible with the buffer API (required by hashlib)
                if isinstance(f_id, (tuple, dict, list)):
                    f_id = json.dumps(f_id, sort_keys=True).encode("utf-8")

                h = int(self.hash(f_id).hexdigest(), base=16)
                col = h % self.n_dim
                sign = 1 if h >= 0 else -1  # md5 digests are non-negative, so this is always +1
                idxs.append((row, col))
                data.append(sign * val)

        rows, cols = zip(*idxs)
        table = csr_matrix((data, (rows, cols)), shape=(N, self.n_dim))
        return table
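
As a quick sanity check of the new module, here is a minimal usage sketch (not part of the diff itself). It assumes the repository root is on PYTHONPATH so that `preprocessing.general` is importable, and `n_dim=32` is an arbitrary illustrative choice.

import hashlib

from preprocessing.general import FeatureHasher

examples = [
    {"furry": 1, "quadruped": 1, "domesticated": 1},
    {"nocturnal": 1, "quadruped": 1},
]

# Dense encoding: a (2, 32) np.ndarray. Each example's feature values land in
# the columns selected by hashing the feature names.
hasher = FeatureHasher(n_dim=32, sparse=False)
dense = hasher.encode(examples)
print(dense.shape)     # (2, 32)
print(dense[0].sum())  # 3.0 -- the feature values of example 0 summed per row

# Sparse encoding (requires scipy): the same values stored as a csr_matrix.
sparse = FeatureHasher(n_dim=32, sparse=True).encode(examples)
print(sparse.shape, sparse.nnz)  # (2, 32) with at most 5 stored entries

# Column selection is deterministic: md5 of the utf-8 encoded feature name,
# interpreted as an integer, taken modulo n_dim.
col = int(hashlib.md5("quadruped".encode("utf-8")).hexdigest(), 16) % 32
print(dense[0, col], dense[1, col])  # both examples contain "quadruped"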
