-
Notifications
You must be signed in to change notification settings - Fork 9
/
bucketMul.swift
91 lines (62 loc) · 3.33 KB
/
bucketMul.swift
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
/*
 Needs a bit of refactoring, probably.
 `bucketMul` is the main wrapper function, kept similar in calling style to `basicMul`, so it can
 quickly replace it at other call sites in the code.
 */
/// Drop-in replacement for `basicMul`: multiplies `v` by the expert weights in `by`
/// (expert selected via `expNo`), writing the result into `out`.
///
/// - Parameter effort: in `0...1`; drives the probe-cutoff quantile inside
///   `BucketMul.calcDispatch`, trading accuracy for speed. Defaults to 0.25.
func bucketMul(v: VectorFloat, by: ExpertWeights, expNo: ScalarFloat, out: VectorFloat, effort: Double = 0.25) {
    // Delegate straight to the shared singleton — this wrapper only exists to
    // mirror basicMul's free-function calling convention.
    BucketMul.shared.fullMul(v: v, ew: by, expNo: expNo, out: out, effort: effort)
}
// should be private, but this way is useful for testing
// should be private, but this way is useful for testing
class BucketMul {
    /// Number of probe values used to pick the cutoff. Hard-coded; `calcDispatch`
    /// asserts the weights' probe table matches this exact size.
    let probesCount = 4096
    /// Upper bound on dispatch entries. NOTE(review): the `//176128` tail is a
    /// leftover alternative value — presumably from earlier tuning; confirm.
    let maxDispatchSize = 229376 * 2//176128
    /// GPU-side list of (bucket, value) work items produced by `calcDispatch`
    /// and consumed by `mul`. Sized 2x `maxDispatchSize` (see `init`).
    let dispatch : DynaVectorFloat
    /// Probe buffer; allocated here but not referenced by the methods below —
    /// presumably consumed by a kernel elsewhere, or dead. TODO confirm.
    let probes : Vector
    /// Cutoff threshold written by the `findCutoff32` kernel, read by `prepareDispatch`.
    let cutoff : ScalarFloat
    /// Scratch scalar used by `roundUp`/`zeroRange32` to remember the pre-rounding
    /// dispatch size, so only the padded tail region gets zeroed.
    private let prevSize = ScalarFloat(value: 0)
    /// Singleton instance; `init` is private so all callers share these buffers.
    static let shared = BucketMul()

    private init() {
        self.dispatch = DynaVectorFloat(shape: [maxDispatchSize*2])
        self.probes = Vector(shape: [probesCount])
        self.cutoff = ScalarFloat(value: 0)
    }

    /// Builds the dispatch list: finds the effort-dependent cutoff from the probes,
    /// then emits one work item per weight chunk that clears the cutoff.
    /// Must run before `mul`; `fullMul` sequences both.
    func calcDispatch(v: VectorFloat, eWeights ew: ExpertWeights, expNo: ScalarFloat, effort: Double) {
        assert(dispatch.rows >= ew.buckets.rows*2)
        assert(ew.probes.cols == 4096, "probes implemented for 4096 only. needs review of sort as well as probeShort")
        dispatch.size.zero()
        // Quantile index into the sorted probes: effort=1 -> q=0 (keep everything),
        // effort=0 -> q=probesCount-1 (keep almost nothing).
        let q = Int(Double(probesCount-1)*(1-effort))
        gpu.deploy("findCutoff32", buffers: [v, ew.probes, expNo, cutoff], ints:[q], threadCount: 1024, threadGroupSize: [1024, 1, 1])
        // Rows processed per thread by prepareDispatch. The trailing comment is a
        // leftover alternative (`w.stats.rows` / 16) — presumably from tuning.
        let chunkSize = 4//w.stats.rows//16
        gpu.deploy("prepareDispatch", buffers:[v, ew.stats, expNo, cutoff, dispatch, dispatch.size],
                   ints:[chunkSize, ew.inSize, ew.buckets.cols, ew.expertSize], threadCount: ew.stats.rows/chunkSize)
        // Debug readback of the produced dispatch size; left here intentionally.
        // gpu.eval()
        // print("dsize", dispatch.size.getLong(index: 0))
    }

    /// Number of partial-result groups produced by the "bucketMul" kernel and
    /// reduced by "bucketIntegrate". Must match tmpMulVec's row count.
    private let mulGroups = 32
    /// Per-group partial outputs: one row per mul group, 16384 columns —
    /// presumably the maximum supported output width; TODO confirm.
    private let tmpMulVec = MatrixFloat(shape:[32, 16384])

    /// Full pipeline: build the dispatch list, pad+zero it, then multiply.
    func fullMul(v: VectorFloat, ew: ExpertWeights, expNo: ScalarFloat, out: VectorFloat, effort: Double) {
        calcDispatch(v: v, eWeights: ew, expNo: expNo, effort: effort)
        // Round dispatch.size up to a multiple of 2048 (prevSize keeps the old
        // value), then zero the padding so stale entries can't be consumed.
        gpu.deploy("roundUp", buffers:[dispatch.size, prevSize], ints:[2048], threadCount: 1)
        gpu.deploy("zeroRange32", buffers: [dispatch, prevSize, dispatch.size], threadCount: 2048 )
        // ^ quick patch here.
        //   bucketMulFast goes through dispatch in bucketSize * STEP chunks, and if dispatchSize is not evened
        //   out, the ranges may start to overlap and cause subtle errors at various Effort levels.
        //   not sure if 2048 rounding is right at this iteration, needs testing and fixing probably
        mul(by: ew, out: out)
    }

    /// Executes the bucketed multiply over the prepared dispatch list, then
    /// integrates the 32 per-group partials in tmpMulVec into `out`.
    /// Precondition: `calcDispatch` (or `fullMul`) has populated `dispatch`.
    func mul(by: ExpertWeights, out: VectorFloat) {
        let weightBuckets = by.buckets
        // This is the fp16/fp32 path; the Q8-quantized variant lives elsewhere.
        assert(!goQ8, "call BucketMulQ8, not this")
        let bucketSize = 16
        let numBuckets = out.rows / bucketSize
        // NOTE(review): numBuckets is only used in this divisibility check —
        // presumably a kernel-side assumption; confirm against bucketMul.metal.
        assert(numBuckets % 4 == 0)
        gpu.deploy("bucketMul", buffers: [weightBuckets, dispatch, tmpMulVec, dispatch.size],
                   ints: [weightBuckets.cols, mulGroups],
                   threadCount: [weightBuckets.cols, mulGroups])
        // One SIMD group per 4 output rows reduces the partials.
        let simdSize = 32
        gpu.deploy("bucketIntegrate", buffers: [tmpMulVec, out],
                   threadCount: [simdSize, out.rows/4, 1],
                   threadGroupSize: [simdSize, 1, 1])
    }
}