-
Notifications
You must be signed in to change notification settings - Fork 2
/
automata.go
367 lines (327 loc) · 12.1 KB
/
automata.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
package uax
import (
"context"
"fmt"
pool "github.com/jolestar/go-commons-pool"
)
// UnicodeBreaker represents a logic to split up
// Unicode sequences into smaller parts. They are used by Segmenters
// to supply breaking logic.
type UnicodeBreaker interface {
CodePointClassFor(rune) int
StartRulesFor(rune, int)
ProceedWithRune(rune, int)
LongestActiveMatch() int
Penalties() []int
}
// NfaStateFn represents a state in a non-deterministic finite automata.
// Functions of type NfaStateFn try to match a rune (Unicode code-point).
// The caller may provide a third argument, which should be a rune class.
// Rune (code-point) classes are described in various Unicode standards
// and annexes. One such annex, UAX#29, describes classes to help
// splitting up text into graphemes or words. An example class may be
// a class of western language alphabetic characters “AL”, of which runes
// ‘X’ and ‘é’ would be part of.
//
// The first argument is a Recognizer (see the definition of
// type Recognizer in this package), which carries this state function.
//
// NfaStateFn – after matching a rune – must return another NfaStateFn,
// which will then in turn be called to process the next rune. The process
// of matching a string will stop as soon as a NfaStateFn returns nil.
type NfaStateFn func(*Recognizer, rune, int) NfaStateFn
// A Recognizer represents an automata to recognize sequences of runes
// (i.e. Unicode code-points). Its main functionality is performed by
// an embedded NfaStateFn. The first NfaStateFn to use is provided with
// the constructor.
//
// Recognizer's state functions must be careful to increment MatchLen
// with each matched rune. Failing to do so may result in incorrect splits
// of text.
//
// Semantics of Expect and UserData are up to the client and not used by
// the default mechanism.
//
// It is not mandatory to use Recognizers for segmenting text. The type is
// provided for easier implementation of types implementing UnicodeBreaker.
// Recognizers implement interface RuneSubscriber and UnicodeBreakers will
// use a UnicodePublisher to interact with them.
type Recognizer struct {
Expect int // next code-point to expect; semantics are up to the client
MatchLen int // length of active match
UserData interface{} // clients may need to store additional information
penalties []int // penalties to return, used internally in DoAccept()
nextStep NfaStateFn // next step of a DFA
}
// NewRecognizer creates a new Recognizer.
// This is rarely used, as clients rather should call NewPooledRecognizer().
//
// see NewPooledRecognizer.
func NewRecognizer(codePointClass int, next NfaStateFn) *Recognizer {
rec := &Recognizer{}
rec.Expect = codePointClass
rec.nextStep = next
return rec
}
// Recognizers are short-lived objects. To avoid multiple allocation of
// small objects we will pool them.
type recognizerPool struct {
opool *pool.ObjectPool
ctx context.Context
}
var globalRecognizerPool *recognizerPool
func init() {
globalRecognizerPool = &recognizerPool{}
factory := pool.NewPooledObjectFactorySimple(
func(context.Context) (interface{}, error) {
rec := &Recognizer{}
return rec, nil
})
globalRecognizerPool.ctx = context.Background()
config := pool.NewDefaultPoolConfig()
//config.LIFO = false
config.MaxTotal = -1 // infinity
config.BlockWhenExhausted = false
globalRecognizerPool.opool = pool.NewObjectPool(globalRecognizerPool.ctx, factory, config)
}
// NewPooledRecognizer returns a new Recognizer, pre-filled with an expected code-point class
// and a state function. The Recognizer is pooled for efficiency.
func NewPooledRecognizer(cpClass int, stateFn NfaStateFn) *Recognizer {
o, _ := globalRecognizerPool.opool.BorrowObject(globalRecognizerPool.ctx)
rec := o.(*Recognizer)
rec.Expect = cpClass
rec.nextStep = stateFn
return rec
}
// Clears the Recognizer and puts it back into the pool.
func (rec *Recognizer) releaseIntoPool() {
rec.penalties = nil
rec.Expect = 0
rec.MatchLen = 0
rec.nextStep = nil
_ = globalRecognizerPool.opool.ReturnObject(globalRecognizerPool.ctx, rec)
}
// Simple stringer for debugging purposes.
func (rec *Recognizer) String() string {
if rec == nil {
return "[nil rule]"
}
return fmt.Sprintf("[%d -> done=%v]", rec.Expect, rec.Done())
}
// Unsubscribed signals to
// a Recognizer that it has been unsubscribed from a RunePublisher;
// usually after the Recognizer's NfaStateFn has returned nil.
//
// Interface RuneSubscriber
func (rec *Recognizer) Unsubscribed() {
rec.releaseIntoPool()
}
// Done is used by a Recognizer that it is done matching runes.
// If MatchLength() > 0 is has been accepting a sequence of runes,
// otherwise it has aborted to further try a match.
//
// Interface RuneSubscriber
func (rec *Recognizer) Done() bool {
return rec.nextStep == nil
}
// MatchLength is part of interface RuneSubscriber.
func (rec *Recognizer) MatchLength() int {
return rec.MatchLen
}
// RuneEvent is part of interface RuneSubscriber.
func (rec *Recognizer) RuneEvent(r rune, codePointClass int) []int {
//fmt.Printf("received rune event: %+q / %d\n", r, codePointClass)
var penalties []int
if rec.nextStep != nil {
//CT.Infof(" calling func = %v", rec.nextStep)
rec.nextStep = rec.nextStep(rec, r, codePointClass)
} else {
//CT.Info(" not calling func = nil")
}
if rec.Done() && rec.MatchLen > 0 { // accepted a match
penalties = rec.penalties
}
//CT.Infof(" subscriber: penalites = %v, done = %v, match len = %d", penalties, rec.Done(), rec.MatchLength())
return penalties
}
// --- Standard Recognizer Rules ----------------------------------------
// DoAbort returns a state function which signals abort.
func DoAbort(rec *Recognizer) NfaStateFn {
rec.MatchLen = 0
return nil
}
// DoAccept returns a state function which signals accept, together with break
// penalties for matched runes (in reverse sequence).
func DoAccept(rec *Recognizer, penalties ...int) NfaStateFn {
rec.MatchLen++
rec.penalties = penalties
tracer().Debugf("ACCEPT with %v", rec.penalties)
return nil
}
// --- Rune Publishing and Subscription ---------------------------------
// A RuneSubscriber is a receiver of rune events, i.e. messages to
// process a new code-point (rune). If they can match the rune, they
// will expect further runes, otherwise they abort. To they are finished,
// either by accepting or rejecting input, they set Done() to true.
// A successful acceptance of input is signalled by Done()==true and
// MatchLength()>0.
type RuneSubscriber interface {
RuneEvent(r rune, codePointClass int) []int // receive a new code-point
MatchLength() int // length (in # of code-point) of the match up to now
Done() bool // is this subscriber done?
Unsubscribed() // this subscriber has been unsubscribed
}
// A RunePublisher notifies subscribers with rune events: a new rune has been read
// and the subscriber – usually a recognizer rule – has to react to it.
//
// UnicodeBreakers are not required to use the RunePublisher/RuneSubscriber
// pattern, but it is convenient to stick to it. UnicodeBreakers often
// rely on sets of rules, which are tested interleavingly. To releave
// UnicodeBreakers from managing rune-distribution to all the rules, it
// may be advantageous hold a RunePublisher within a UnicodeBreaker and
// let all rules implement the RuneSubscriber interface.
type RunePublisher interface {
SubscribeMe(RuneSubscriber) RunePublisher // subscribe an additional rune subscriber
PublishRuneEvent(r rune, codePointClass int) (longestDistance int, penalties []int)
//SetPenaltyAggregator(pa PenaltyAggregator) // function to aggregate break penalties
}
// NewRunePublisher creates a new default RunePublisher.
func NewRunePublisher() *DefaultRunePublisher {
rpub := &DefaultRunePublisher{}
//rpub.aggregate = AddPenalties
return rpub
}
// PublishRuneEvent triggers a rune event notification to all subscribers. Rune events
// include the rune (code-point) and an optional code-point class for
// the rune.
//
// Return values are: the longest active match and a slice of penalties.
// These values usually are collected from the RuneSubscribers.
// Penalties will be overwritten by the next call to PublishRuneEvent().
// Clients will have to make a copy if they want to preserve penalty
// values.
//
// Interface RunePublisher
func (rpub *DefaultRunePublisher) PublishRuneEvent(r rune, codePointClass int) (int, []int) {
longest := 0
if rpub.penaltiesTotal == nil {
rpub.penaltiesTotal = make([]int, 1024)
}
//CT.Infof("pre-publish(): total penalites = %v", rpub.penaltiesTotal)
rpub.penaltiesTotal = rpub.penaltiesTotal[:0]
//CT.Infof("pre-publish(): total penalites = %v", rpub.penaltiesTotal)
// pre-condition: no subscriber is Done()
for i := rpub.Len() - 1; i >= 0; i-- {
subscr := rpub.at(i)
penalties := subscr.RuneEvent(r, codePointClass)
//CT().Infof(" publish(): penalites = %v", penalties)
for j, p := range penalties { // aggregate all penalties
if j >= len(rpub.penaltiesTotal) {
rpub.penaltiesTotal = append(rpub.penaltiesTotal, p)
} else {
//rpub.aggregate.Aggregate(p)
//rpub.penaltiesTotal[j] = rpub.aggregate(rpub.penaltiesTotal[j], p)
rpub.penaltiesTotal[j] += p
}
}
//CT().Infof(" publish(): total penalites = %v", rpub.penaltiesTotal)
if !subscr.Done() { // compare against longest active match
if d := subscr.MatchLength(); d > longest {
longest = d
}
}
rpub.Fix(i) // re-order heap if subscr.Done()
}
//CT.Infof("pre-publish(): total penalites = %v", rpub.penaltiesTotal)
// now unsubscribe all done subscribers
for subscr := rpub.PopDone(); subscr != nil; subscr = rpub.PopDone() {
subscr.Unsubscribed()
}
return longest, rpub.penaltiesTotal
}
// --- Penalty aggregators ---------------------------------------------------
// PenaltyAggregator is a type for methods of penalty-aggregation. Aggregates all the
// break penalties at a break-point to a single penalty value at that point.
// type PenaltyAggregator interface {
// Aggregate(int)
// HasValue() bool
// Value() int
// }
// SetPenaltyAggregator sets a PenaltyAggregator for a rune publisher.
// A PenaltyAggregator aggregates all the
// break penalties at a break-point to a single penalty value at that point.
//
// Part of interface RunePublisher.
// func (rpub *DefaultRunePublisher) SetPenaltyAggregator(pa PenaltyAggregator) {
// if pa == nil {
// rpub.aggregate = &AddPenalties{}
// } else {
// rpub.aggregate = pa
// }
// }
// AddPenalties is the default aggregator for break-penalties.
// Simply adds up all penalties at each break position, respectively.
//
// The zero value is 0, thus AddPenalties calculates a monoid fold
// of 0+n1=n, n+n2=n, …; i.e., n1+n2+…
//
// type AddPenalties struct {
// sum int
// modified bool
// }
// func (a *AddPenalties) Aggregate(n int) {
// a.modified = true
// a.sum += n
// }
// func (a *AddPenalties) HasValue() bool {
// return a.modified
// }
// func (a *AddPenalties) Value() int {
// return a.sum
// }
// MaxPenalties is an alternative function to aggregate break-penalties.
// Returns maximum of all penalties at each break position.
//
// The zero value is InfinitePenalty, thus MaxMerits calculates a semi-group fold
// of min(∞,n1)=n, min(n,n2)=n, … (remember, mertits are negative penalties)
//
// type MaxMerits struct {
// m int
// modified bool
// }
// func (a *MaxMerits) Aggregate(n int) {
// if !a.modified {
// a.m = InfinitePenalty
// }
// a.modified = true
// a.m = min(a.m, n)
// }
// func (a *MaxMerits) HasValue() bool {
// return a.modified
// }
// func (a *MaxMerits) Value() int {
// return a.m
// }
// SubscribeMe lets a client subscribe to a RunePublisher.
//
// Part of interface RunePublisher.
func (rpub *DefaultRunePublisher) SubscribeMe(rsub RuneSubscriber) RunePublisher {
// if rpub.aggregate == nil { // this is necessary as we allow uninitialzed DefaultRunePublishers
// rpub.aggregate = &AddPenalties{}
// }
rpub.Push(rsub)
return rpub
}
// ----------------------------------------------------------------------
func min(a int, b int) int {
if a < b {
return a
}
return b
}
func max(a int, b int) int {
if a > b {
return a
}
return b
}