forked from huichen/mlf
-
Notifications
You must be signed in to change notification settings - Fork 0
/
libsvm_dataset_loader.go
104 lines (86 loc) · 1.98 KB
/
libsvm_dataset_loader.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
package contrib
import (
"github.com/huichen/mlf/data"
"github.com/huichen/mlf/util"
"io/ioutil"
"log"
"strconv"
"strings"
)
// LoadLibSVMDataset reads the libsvm-format text file at path and returns its
// contents as an in-memory dataset.
//
// Every non-blank line is expected to have the form
//
//	<label> <id>:<value> <id>:<value> ...
//
// with tokens separated by any whitespace (spaces, tabs, a trailing "\r" from
// CRLF line endings). Labels are treated as opaque strings and are numbered
// 0, 1, 2, ... in order of first appearance. Feature IDs must be integers; the
// smallest ID in the file must be exactly 1 and the largest at least 2.
//
// When usingSparseRepresentation is true, each instance stores its values in
// NamedFeatures, keyed by the ID text exactly as written in the file.
// Otherwise each instance gets a vector created with
// util.NewVector(maxFeature + 1), where index 0 is set to 1 (the constant
// term) and index id receives the value of feature id.
//
// The function does not return an error: if the file cannot be read, or any
// line is malformed, it terminates the process via log.Fatal/log.Fatalf.
func LoadLibSVMDataset(path string, usingSparseRepresentation bool) data.Dataset {
	log.Print("载入libsvm格式文件", path)

	content, err := ioutil.ReadFile(path)
	if err != nil {
		log.Fatalf("无法打开文件\"%v\",错误提示:%v\n", path, err)
	}
	lines := strings.Split(string(content), "\n")

	// First pass: collect the label set and the feature ID range, validating
	// every feature token so the second pass can rely on well-formed input.
	//
	// minFeature starts at a sentinel; any start value above 1 works because
	// the check below demands that the smallest ID seen is exactly 1.
	minFeature := 10000
	maxFeature := 0
	labels := make(map[string]int)
	for lineNo, l := range lines {
		// strings.Fields drops empty tokens and strips "\r" and tabs, which a
		// plain split on " " would leave attached to labels and values.
		fields := strings.Fields(l)
		if len(fields) == 0 {
			continue
		}
		if _, seen := labels[fields[0]]; !seen {
			// len(labels) is evaluated before the insertion takes place, so it
			// is the next unused label index.
			labels[fields[0]] = len(labels)
		}
		for _, token := range fields[1:] {
			_, fid, _, ok := parseLibSVMFeature(token)
			if !ok {
				log.Fatalf("文件输入格式不合法:第%d行的特征\"%v\"无法解析\n", lineNo+1, token)
			}
			if fid > maxFeature {
				maxFeature = fid
			}
			if fid < minFeature {
				minFeature = fid
			}
		}
	}
	if minFeature != 1 || maxFeature < 2 {
		log.Fatal("文件输入格式不合法")
	}

	// Second pass: build one instance per non-blank line.
	set := data.NewInmemDataset()
	for _, l := range lines {
		fields := strings.Fields(l)
		if len(fields) == 0 {
			continue
		}

		instance := new(data.Instance)
		instance.Output = &data.InstanceOutput{
			Label:       labels[fields[0]],
			LabelString: fields[0],
		}
		if usingSparseRepresentation {
			instance.NamedFeatures = make(map[string]float64)
		} else {
			instance.Features = util.NewVector(maxFeature + 1)
			// Constant (bias) term lives at index 0.
			instance.Features.Set(0, 1)
		}

		for _, token := range fields[1:] {
			// Every token was validated in the first pass, so ok is always
			// true here and fid lies in [1, maxFeature].
			name, fid, value, _ := parseLibSVMFeature(token)
			if usingSparseRepresentation {
				instance.NamedFeatures[name] = value
			} else {
				instance.Features.Set(fid, value)
			}
		}
		set.AddInstance(instance)
	}
	set.Finalize()

	log.Print("载入数据样本数目 ", set.NumInstances())
	return set
}

// parseLibSVMFeature parses a single "<id>:<value>" token. It returns the ID
// text as written, the ID as an integer, the value as a float64, and true on
// success. It returns false when the token does not consist of exactly two
// colon-separated parts, when the ID is not an integer, or when the value is
// not a valid floating-point number.
func parseLibSVMFeature(token string) (string, int, float64, bool) {
	parts := strings.Split(token, ":")
	if len(parts) != 2 {
		return "", 0, 0, false
	}
	id, err := strconv.Atoi(parts[0])
	if err != nil {
		return "", 0, 0, false
	}
	value, err := strconv.ParseFloat(parts[1], 64)
	if err != nil {
		return "", 0, 0, false
	}
	return parts[0], id, value, true
}