-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlabel_nearest.go
237 lines (225 loc) · 9.17 KB
/
label_nearest.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
package sticker
import (
"encoding/gob"
"fmt"
"io"
"log"
)
// SimCountPair is the data structure for float32 similarity and uint32 count.
type SimCountPair struct {
Sim float32
Count uint32
}
// LabelNearest is the sparse weighted nearest neighborhood.
// Sparse means in the following 3 reasons:
// (i) The similarity used in constructing the nearest neighborhood defined by the inner-product on the features activated in the given entry.
// (ii) The positive labels assigned to the entries in the training dataset are only used for averaging.
// (iii) The entries in the training dataset whose inner-product is not positive are not used.
//
// LabelNearest is only the optimized data structure of the training dataset for searching nearest neighborhood.
type LabelNearest struct {
// NfeaturesList is the slice of the number of features contained in each training data entry.
NfeaturesList []float32
// FeatureIndexList is the map from the feature to the feature index.
// The feature index contains the list of the pair of the entry index and the corresponding feature value.
// The feature vector for the each entry is normalized for computing cos similarity effectively in the inference.
FeatureIndexList map[uint32]KeyValues32
// LabelVectors is the label vectors in the training dataset.
LabelVectors LabelVectors
}
// TrainLabelNearest returns an trained LabelNearest on the given training dataset ds.
//
// Currently, this function returns no error.
func TrainLabelNearest(ds *Dataset, debug *log.Logger) (*LabelNearest, error) {
nfeaturesList := make([]float32, ds.Size())
lenX := make([]float32, ds.Size())
for i, xi := range ds.X {
nfeaturesList[i] = float32(len(xi))
l := float32(0.0)
for _, xipair := range xi {
l += xipair.Value * xipair.Value
}
lenX[i] = Sqrt32(l)
}
featureIndexList := make(map[uint32]KeyValues32)
featureActs := make([]float32, 0, len(ds.X))
for i, xi := range ds.X {
lenxi := lenX[i]
for _, xipair := range xi {
featureIndexList[xipair.Key] = append(featureIndexList[xipair.Key], KeyValue32{uint32(i), xipair.Value / lenxi})
}
featureActs = append(featureActs, float32(len(xi)))
}
featureActsMin, featureActsQ25, featureActsMed, featureActsQ75, featureActsMax, featureActsAvg := SummarizeFloat32Slice(featureActs)
if debug != nil {
debug.Printf("TrainLabelNearest: feature occurrence summary: min=%.0f, 1st-qu=%.0f, median=%.0f, 3rd-qu=%.0f, max=%.0f, avg=%.2f", featureActsMin, featureActsQ25, featureActsMed, featureActsQ75, featureActsMax, featureActsAvg)
}
labelVectors := make(LabelVectors, 0, len(ds.Y))
for _, yi := range ds.Y {
yihat := make(LabelVector, len(yi))
copy(yihat, yi)
labelVectors = append(labelVectors, yihat)
}
return &LabelNearest{
NfeaturesList: nfeaturesList,
FeatureIndexList: featureIndexList,
LabelVectors: labelVectors,
}, nil
}
// DecodeLabelNearestWithGobDecoder decodes LabelNearest using decoder.
//
// This function returns an error in decoding.
func DecodeLabelNearestWithGobDecoder(model *LabelNearest, decoder *gob.Decoder) error {
model.NfeaturesList = []float32{}
if err := decoder.Decode(&model.NfeaturesList); err != nil {
return fmt.Errorf("DecodeLabelNearest: NfeaturesList: %s", err)
}
var nfeatures uint32
if err := decoder.Decode(&nfeatures); err != nil {
return fmt.Errorf("DecodeLabelNearest: nfeatures: %s", err)
}
model.FeatureIndexList = make(map[uint32]KeyValues32, nfeatures)
for j := uint32(0); j < nfeatures; j++ {
var feature uint32
if err := decoder.Decode(&feature); err != nil {
return fmt.Errorf("DecodeLabelNearest: #%d feature: %s", j, err)
}
featureIndex := KeyValues32{}
if err := decoder.Decode(&featureIndex); err != nil {
return fmt.Errorf("DecodeLabelNearest: FeatureIndexList[%d]: %s", feature, err)
}
model.FeatureIndexList[feature] = featureIndex
}
model.LabelVectors = make(LabelVectors, 0, len(model.NfeaturesList))
for i := 0; i < len(model.NfeaturesList); i++ {
labelVector := LabelVector{}
if err := decoder.Decode(&labelVector); err != nil {
return fmt.Errorf("DecodeLabelNearest: LabelVectors[%d]: %s", i, err)
}
model.LabelVectors = append(model.LabelVectors, labelVector)
}
return nil
}
// DecodeLabelNearest decodes LabelNearest from r.
// Directly passing *os.File used by a gob.Decoder to this function causes mysterious errors.
// Thus, if users use gob.Decoder, then they should call DecodeLabelNearestWithGobDecoder.
//
// This function returns an error in decoding.
func DecodeLabelNearest(model *LabelNearest, r io.Reader) error {
return DecodeLabelNearestWithGobDecoder(model, gob.NewDecoder(r))
}
// EncodeLabelNearestWithGobEncoder decodes LabelNearest using encoder.
//
// This function returns an error in decoding.
func EncodeLabelNearestWithGobEncoder(model *LabelNearest, encoder *gob.Encoder) error {
if err := encoder.Encode(model.NfeaturesList); err != nil {
return fmt.Errorf("EncodeLabelNearest: NfeatureList: %s", err)
}
nfeatures := uint32(len(model.FeatureIndexList))
if err := encoder.Encode(nfeatures); err != nil {
return fmt.Errorf("EncodeLabelNearest: nfeatures: %s", err)
}
j := 0
for feature, featureIndex := range model.FeatureIndexList {
if err := encoder.Encode(feature); err != nil {
return fmt.Errorf("EncodeLabelNearest: #%d feature: %s", j, err)
}
if err := encoder.Encode(featureIndex); err != nil {
return fmt.Errorf("EncodeLabelNearest: FeatureIndexList[%d]: %s", feature, err)
}
j++
}
for i, labelVector := range model.LabelVectors {
if err := encoder.Encode(labelVector); err != nil {
return fmt.Errorf("EncodeLabelNearest: LabelVectors[%d]: %s", i, err)
}
}
return nil
}
// EncodeLabelNearest encodes LabelNearest to w.
// Directly passing *os.File used by a gob.Encoder to this function causes mysterious errors.
// Thus, if users use gob.Encoder, then they should call EncodeLabelNearestWithGobEncoder.
//
// This function returns an error in encoding.
func EncodeLabelNearest(model *LabelNearest, w io.Writer) error {
return EncodeLabelNearestWithGobEncoder(model, gob.NewEncoder(w))
}
// GobEncode returns the error always, because users should encode large LabelNearest objects with EncodeLabelNearest.
func (model *LabelNearest) GobEncode() ([]byte, error) {
return nil, fmt.Errorf("LabelNearest should be encoded with EncodeLabelNearest")
}
// FindNearests returns the S nearest entries with each similarity for the given entry.
// See Predict for hyper-parameter details.
func (model *LabelNearest) FindNearests(x FeatureVector, S uint, beta float32) KeyValues32 {
simCounts := make([]SimCountPair, len(model.LabelVectors))
for _, xpair := range x {
featureIndex := model.FeatureIndexList[xpair.Key]
for _, indexValue := range featureIndex {
simCounts[indexValue.Key].Sim += indexValue.Value * xpair.Value
simCounts[indexValue.Key].Count++
}
}
indexSimsTopS := make(KeyValues32, 0, S)
for i, simCount := range simCounts {
if sim, count := simCount.Sim, simCount.Count; sim > 0.0 {
// For efficiency, call Pow32 as short-cut style.
jaccard := float32(count) / (float32(len(x)) + model.NfeaturesList[i] - float32(count))
if beta == 0 {
} else if beta == 1 {
sim *= jaccard
} else {
sim *= Pow32(jaccard, beta)
}
if len(indexSimsTopS) == 0 {
indexSimsTopS = append(indexSimsTopS, KeyValue32{uint32(i), sim})
} else if indexSimsTopS[len(indexSimsTopS)-1].Value > sim {
if len(indexSimsTopS) < cap(indexSimsTopS) {
indexSimsTopS = append(indexSimsTopS, KeyValue32{uint32(i), sim})
}
} else {
for rank := 0; rank < len(indexSimsTopS); rank++ {
if sim >= indexSimsTopS[rank].Value {
if len(indexSimsTopS) < cap(indexSimsTopS) {
indexSimsTopS = append(indexSimsTopS, KeyValue32{0, 0})
}
copy(indexSimsTopS[rank+1:], indexSimsTopS[rank:])
indexSimsTopS[rank] = KeyValue32{uint32(i), sim}
break
}
}
}
}
}
return indexSimsTopS
}
// Predict returns the results for the given data entry x with the sparse S-nearest neighborhood.
// The returned results are the top-K labels, the label histogram, and the slice of the data entry index and its similarity.
//
// alpha is the smoothing parameter for weighting the votes by each neighbor.
// beta is the smoothing parameter for balancing the Jaccard similarity and the cosine similarity.
func (model *LabelNearest) Predict(x FeatureVector, K, S uint, alpha, beta float32) (LabelVector, map[uint32]float32, KeyValues32) {
indexSimsTopS := model.FindNearests(x, S, beta)
labelHist := make(map[uint32]float32)
xlen := float32(0.0)
for _, xpair := range x {
xlen += xpair.Value * xpair.Value
}
xlen = Sqrt32(xlen)
for _, indexSim := range indexSimsTopS {
value := Pow32(indexSim.Value/xlen, alpha)
for _, label := range model.LabelVectors[indexSim.Key] {
labelHist[label] += value
}
}
return RankTopK(labelHist, K), labelHist, indexSimsTopS
}
// PredictAll returns the top-K labels for each data entry in X with the sparse S-nearest neighborhood.
// See Predict for hyper-parameter details.
func (model *LabelNearest) PredictAll(X FeatureVectors, K, S uint, alpha, beta float32) LabelVectors {
Yhat := make(LabelVectors, 0, len(X))
for _, xi := range X {
yihat, _, _ := model.Predict(xi, K, S, alpha, beta)
Yhat = append(Yhat, yihat)
}
return Yhat
}