-
Notifications
You must be signed in to change notification settings - Fork 7
/
FFMFormat_hash.py
41 lines (37 loc) · 1.67 KB
/
FFMFormat_hash.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import pandas
import hashlib
def hashstr(str, nr_bins):
    """Map a string to an integer bucket derived from its MD5 digest.

    The text is encoded as UTF-8, hashed with MD5, and the 128-bit digest is
    read as one unsigned big-endian integer.  That integer is reduced modulo
    ``nr_bins - 1`` and shifted up by one, so for ``nr_bins >= 2`` the result
    lies in ``1 .. nr_bins - 1``.

    Args:
        str: text to hash (must provide ``.encode``).
        nr_bins: bucket-count parameter; ``nr_bins == 1`` divides by zero.

    Returns:
        The bucket number as an int.
    """
    raw_digest = hashlib.md5(str.encode('utf8')).digest()
    # Big-endian interpretation of the raw bytes equals int(hexdigest, 16).
    remainder = int.from_bytes(raw_digest, 'big') % (nr_bins - 1)
    return remainder + 1
def gen_hashed_fmm_feats(feats, nr_bins = int(1e+6)):
    """Render (field, feature, value) triples as 'field:bucket:value' strings.

    Args:
        feats: iterable of 3-tuples ``(field, feat, value)``; ``feat`` is the
            text handed to ``hashstr``.
        nr_bins: bucket-count parameter forwarded to ``hashstr``.

    Returns:
        A new list with one string per input triple, in input order.
    """
    rendered = []
    for field, feat, value in feats:
        bucket = hashstr(feat, nr_bins)
        # str() on each part gives the same text as '%s' formatting.
        rendered.append(':'.join((str(field), str(bucket), str(value))))
    return rendered
def FFMFormat_hash(df, label, path, train_len, category_feature=None,
                   continuous_feature=None, vector_feature=None):
    """Write the rows of ``df`` as hashed 'field:bucket:value' text lines.

    Rows whose position is below ``train_len`` are written to
    ``path + 'train.ffm'`` prefixed with their label; every remaining row is
    written to ``path + 'test.ffm'`` without a label.  Each row is also
    echoed to stdout together with its label.

    Every listed column gets its own field index.  Indices restart at 0 for
    each row and are assigned in the order categorical, continuous, vector.

    Args:
        df: table holding the label column and every listed feature column.
            NOTE(review): cells are read as ``column[i]`` for ``i`` in
            ``range(df.shape[0])``, so a default 0..n-1 index is presumably
            expected -- confirm against callers.
        label: name of the label column.
        path: prefix prepended verbatim to ``'train.ffm'`` / ``'test.ffm'``
            (no separator is inserted).
        train_len: number of leading rows that go to the training file.
        category_feature: column names; each adds one entry per row keyed by
            ``"<column>_<cell>"`` with value 1.
        continuous_feature: column names; each adds one entry per row keyed
            by ``"<column>_<cell>"`` whose value is the cell itself.
        vector_feature: column names whose cells are space-separated strings;
            each token adds an entry keyed by ``"<column>_<token>"`` with
            value 1, all tokens of a column sharing that column's field.

    Returns:
        None.  Output goes to the two files and to stdout.
    """
    # None stands in for "no columns" so no mutable default is shared.
    category_feature = () if category_feature is None else category_feature
    continuous_feature = () if continuous_feature is None else continuous_feature
    vector_feature = () if vector_feature is None else vector_feature

    # Fetch each column once instead of once per cell.
    label_column = df[label]
    columns = {}
    for group in (category_feature, continuous_feature, vector_feature):
        for feat in group:
            columns[feat] = df[feat]

    n_rows = df.shape[0]
    # The context manager closes both files even if a row raises midway.
    with open(path + 'train.ffm', 'w') as train, \
            open(path + 'test.ffm', 'w') as test:
        for i in range(n_rows):
            feats = []
            field_index = 0
            for feat in category_feature:
                feats.append((field_index, feat + '_' + str(columns[feat][i]), 1))
                field_index += 1
            for feat in continuous_feature:
                cell = columns[feat][i]
                # NOTE(review): the hashed key includes the cell value, so
                # each distinct value lands in its own bucket -- confirm
                # this is intended for continuous columns.
                feats.append((field_index, feat + '_' + str(cell), cell))
                field_index += 1
            for feat in vector_feature:
                for word in columns[feat][i].split(' '):
                    feats.append((field_index, feat + '_' + word, 1))
                # One field per vector column, mirroring the loops above.
                field_index += 1

            row_text = ' '.join(gen_hashed_fmm_feats(feats))
            labelled_row = '%s %s' % (label_column[i], row_text)
            print(labelled_row)
            if i < train_len:
                train.write(labelled_row + '\n')
            else:
                test.write(row_text + '\n')
# NOTE(review): df, train_len and the three feature lists are not defined
# anywhere in this file as shown; they must already exist in the namespace
# this script runs in.
FFMFormat_hash(
    df,
    'label',
    '../data/ffm/',
    train_len,
    category_feature=category_feature,
    continuous_feature=continuous_feature,
    vector_feature=vector_feature,
)