-
-
Notifications
You must be signed in to change notification settings - Fork 86
/
mixerblock.go
83 lines (70 loc) · 2.5 KB
/
mixerblock.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
// Copyright 2022 spaGO Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package mlpmixer implements the MLP-Mixer (Tolstikhin et al., 2021).
package mlpmixer
import (
"encoding/gob"
"fmt"
"github.com/nlpodyssey/spago/ag"
"github.com/nlpodyssey/spago/mat/float"
"github.com/nlpodyssey/spago/nn"
"github.com/nlpodyssey/spago/nn/activation"
"github.com/nlpodyssey/spago/nn/normalization/layernorm"
)
// Compile-time assertion that *MixerBlock satisfies nn.Model.
var _ nn.Model = (*MixerBlock)(nil)
// MixerBlock contains the serializable parameters.
//
// It bundles the configuration with the four sub-layers used by Forward:
// a layer normalization and a feed-forward for the token-mixing step, and
// a layer normalization and a feed-forward for the channel-mixing step.
type MixerBlock struct {
// NOTE(review): embedded base from the nn package; presumably supplies the
// shared behaviour required by nn.Model — confirm against nn.Module.
nn.Module
// Config holds the settings the block was built with; Forward reads
// Config.Channels to bound the number of inputs it accepts.
Config
// TokenLayerNorm is applied to the inputs at the start of tokenMix.
TokenLayerNorm *layernorm.Model
// TokenMixerFF is applied by tokenMix to the column views of the stacked,
// normalized inputs.
TokenMixerFF *FeedForward
// ChannelLayerNorm is applied to the inputs at the start of channelMix.
ChannelLayerNorm *layernorm.Model
// ChannelMixerFF is applied by channelMix to the normalized inputs.
ChannelMixerFF *FeedForward
}
// Config provides configuration settings for a MixerBlock.
type Config struct {
// InputSize is passed by New to both layer normalizations and as the first
// argument of the channel-mixing feed-forward.
InputSize int
// HiddenSizeTokenMixer is passed by New as the second argument of the
// token-mixing feed-forward.
HiddenSizeTokenMixer int
// HiddenSizeChannelMixer is passed by New as the second argument of the
// channel-mixing feed-forward.
HiddenSizeChannelMixer int
// Channels is passed by New as the first argument of the token-mixing
// feed-forward; Forward panics when it receives more inputs than this.
Channels int
// ActFunctionTokenMixer names the activation handed to the token-mixing
// feed-forward.
ActFunctionTokenMixer activation.Name
// ActFunctionChannelMixer names the activation handed to the
// channel-mixing feed-forward.
ActFunctionChannelMixer activation.Name
// Eps is passed by New to both layer normalizations.
// NOTE(review): presumably the numerical-stability epsilon — confirm
// against layernorm.New.
Eps float64
}
// init registers the *MixerBlock type with encoding/gob so that values of
// this type can be encoded and decoded when they travel as interface values.
func init() {
	gob.Register(new(MixerBlock))
}
// New builds a MixerBlock from config, instantiating its sub-layers for the
// float type T.
//
// The token-mixing feed-forward receives config.Channels as its first
// argument, while the channel-mixing feed-forward and both layer
// normalizations receive config.InputSize.
//
// NOTE(review): the trailing 0 passed to newFeedForward is defined by that
// helper, which is not visible in this file — confirm its meaning there.
func New[T float.DType](config Config) *MixerBlock {
	// Construction order is kept explicit: token feed-forward, token norm,
	// channel feed-forward, channel norm.
	tokenFF := newFeedForward[T](config.Channels, config.HiddenSizeTokenMixer, config.ActFunctionTokenMixer, 0)
	tokenNorm := layernorm.New[T](config.InputSize, config.Eps)
	channelFF := newFeedForward[T](config.InputSize, config.HiddenSizeChannelMixer, config.ActFunctionChannelMixer, 0)
	channelNorm := layernorm.New[T](config.InputSize, config.Eps)

	return &MixerBlock{
		Config:           config,
		TokenLayerNorm:   tokenNorm,
		TokenMixerFF:     tokenFF,
		ChannelLayerNorm: channelNorm,
		ChannelMixerFF:   channelFF,
	}
}
// Forward runs the block on the given input nodes and returns the result.
//
// The inputs first go through the token-mixing step and are summed with the
// original inputs; that sum then goes through the channel-mixing step and is
// summed with itself again (two residual connections in sequence).
//
// Forward panics if more than Config.Channels nodes are supplied.
func (m *MixerBlock) Forward(xs ...ag.Node) []ag.Node {
	if limit, got := m.Config.Channels, len(xs); got > limit {
		panic(fmt.Sprintf("mlpmixer: maximum sequence length is %d, got %d",
			limit, got))
	}

	tokenMixed := m.residual(m.tokenMix(xs), xs)
	return m.residual(m.channelMix(tokenMixed), tokenMixed)
}
// tokenMix performs the token-mixing step.
//
// The inputs are normalized with TokenLayerNorm and stacked; the column
// views of that stack are fed through TokenMixerFF. The feed-forward outputs
// are stacked again, the stack is transposed, and each of its row views is
// transposed once more before being returned.
//
// NOTE(review): presumably this yields one node per input position, shaped
// like the inputs — confirm against ag.Stack / ag.RowViews.
func (m *MixerBlock) tokenMix(xs []ag.Node) []ag.Node {
	stacked := ag.Stack(m.TokenLayerNorm.Forward(xs...)...)
	mixed := m.TokenMixerFF.Forward(ag.ColViews(stacked)...)
	rows := ag.RowViews(ag.T(ag.Stack(mixed...)))
	return ag.Map(ag.T, rows)
}
// channelMix performs the channel-mixing step: the inputs are normalized
// with ChannelLayerNorm and the normalized nodes are passed through
// ChannelMixerFF.
func (m *MixerBlock) channelMix(xs []ag.Node) []ag.Node {
	return m.ChannelMixerFF.Forward(m.ChannelLayerNorm.Forward(xs...)...)
}
// residual combines xs with residual by applying ag.Add to the two slices
// through ag.Map2 and returns the resulting nodes.
// NOTE(review): presumably the addition is pairwise by index and both slices
// must have the same length — confirm against ag.Map2.
func (m *MixerBlock) residual(xs []ag.Node, residual []ag.Node) []ag.Node {
return ag.Map2(ag.Add, xs, residual)
}