-
Notifications
You must be signed in to change notification settings - Fork 94
/
strings.go
250 lines (220 loc) · 5.38 KB
/
strings.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
package jsluice
import (
"strconv"
"strings"
"unicode"
"unicode/utf8"
)
// An item is a single token produced while lexing a JavaScript string:
// either a run of literal text or the payload of one escape sequence.
type item struct {
	typ itemType // what kind of token this is
	val string   // raw token text as sliced out of the input by the lexer
}

// itemType identifies the kind of token held by an item.
type itemType int

const (
	itemString          itemType = iota // literal text, used verbatim
	itemSingleEscape                    // single character escape, e.g. \n or \"
	itemHexEscape                       // hex escape; val is parsed as base 16
	itemOctalEscape                     // octal escape; val is parsed as base 8
	itemUnicodeEscape                   // \uHHHH escape; val is parsed as base 16
	itemCodepointEscape                 // \u{...} escape; val is parsed as base 16
)

// String returns the decoded text that the token stands for.
//
// Literal text is returned unchanged. A single character escape is mapped to
// the control character it names; one that names no control character stands
// for itself (so a val of `"` yields `"`). Numeric escapes are parsed in
// their base and converted to the corresponding code point.
func (i item) String() string {
	switch i.typ {
	case itemSingleEscape:
		switch i.val {
		case "0":
			// \0 is the NUL character, not the digit zero
			return "\x00"
		case "b":
			return "\b"
		case "f":
			return "\f"
		case "n":
			return "\n"
		case "r":
			return "\r"
		case "t":
			return "\t"
		case "v":
			return "\v"
		}
		return i.val
	case itemHexEscape, itemUnicodeEscape, itemCodepointEscape:
		return i.decodeNumeric(16)
	case itemOctalEscape:
		return i.decodeNumeric(8)
	default:
		return i.val
	}
}

// decodeNumeric interprets val as an unsigned number in the given base and
// returns the code point with that value as a string.
//
// If val is not a number in that base (e.g. it is empty) it is returned
// unchanged. A value that is larger than the highest Unicode code point
// yields the replacement character U+FFFD rather than being truncated to
// some unrelated code point.
func (i item) decodeNumeric(base int) string {
	const (
		maxCodePoint = 0x10FFFF
		replacement  = "\uFFFD"
	)

	num, err := strconv.ParseUint(i.val, base, 32)
	if err != nil {
		// Too many digits to fit in 32 bits: well out of range, not malformed
		if numErr, ok := err.(*strconv.NumError); ok && numErr.Err == strconv.ErrRange {
			return replacement
		}
		return i.val
	}
	if num > maxCodePoint {
		return replacement
	}
	return string(rune(num))
}
// A stringLexer maintains the state needed to lex a string. The token
// currently being built is the slice str[start:pos]; Emit turns that slice
// into an item and Ignore discards it.
type stringLexer struct {
	str   string // The input string
	start int    // The start position (in bytes) of the current token being lexed
	pos   int    // The current position (in bytes) of the rune being looked at
	items []item // A slice of tokens that have been emitted, in input order
	done  bool   // Flag that's set by Next when it is called with no input left to read
}
// newStringLexer creates a stringLexer for in. Both position pointers start
// at the first byte, the token list is empty (but non-nil) and the done flag
// is cleared.
func newStringLexer(in string) *stringLexer {
	lexer := new(stringLexer) // start, pos and done begin at their zero values
	lexer.str = in
	lexer.items = []item{}
	return lexer
}
// Next decodes and returns the rune at the current position, then moves
// the position pointer past it. Positions are byte offsets, so the pointer
// advances by the encoded size of the rune rather than by one; JavaScript
// source regularly contains runes that take more than one byte.
//
// When there is no input left, Next sets the done flag and returns -1
// without moving the position pointer.
func (s *stringLexer) Next() rune {
	if s.pos < len(s.str) {
		r, size := utf8.DecodeRuneInString(s.str[s.pos:])
		s.pos += size
		return r
	}

	s.done = true
	return -1
}
// Backup steps the position pointer back over the rune that ends at the
// current position. It does nothing once the done flag is set (Next did not
// advance in that case, so there is nothing to undo) or when the pointer is
// already at the start of the input.
func (s *stringLexer) Backup() {
	if !s.done && s.pos > 0 {
		_, size := utf8.DecodeLastRuneInString(s.str[:s.pos])
		s.pos -= size
	}
}
// Peek reports the rune that the next call to Next would return, leaving
// the position pointer where it was. Because it is built on Next, peeking
// at the end of the input returns -1 and sets the done flag.
func (s *stringLexer) Peek() rune {
	// Undo the read once the rune has been captured as the return value
	defer s.Backup()
	return s.Next()
}
// Emit appends a token of type t to the lexer's list of tokens. The token's
// value is the input between the start pointer and the current position;
// afterwards the start pointer is moved up to the current position so the
// next token begins where this one ended.
func (s *stringLexer) Emit(t itemType) {
	from, to := s.start, s.pos
	tok := item{typ: t, val: s.str[from:to]}
	s.items = append(s.items, tok)
	s.start = to
}
// Ignore moves the start position pointer to the current
// position without emitting a token; effectively discarding
// the chunk of input between the end of the last token and
// the current position, so it never appears in the output.
func (s *stringLexer) Ignore() {
	s.start = s.pos
}
// Accept consumes the next rune if, and only if, it is one of the runes in
// valid, and reports whether it did so. On a mismatch the position pointer
// is put back where it was. At the end of the input it returns false (and,
// via Next, sets the done flag).
func (s *stringLexer) Accept(valid string) bool {
	r := s.Next()
	if !strings.ContainsRune(valid, r) {
		s.Backup()
		return false
	}
	return true
}
// AcceptN consumes exactly n runes from the set valid and reports whether
// it managed to do so.
//
// It is all-or-nothing: if fewer than n matching runes are available, the
// position pointer is rewound to where it was before the call, so a failed
// match never swallows part of the input. The done flag is rewound as well,
// because Accept sets it when it runs into the end of the input and a lexer
// that has been moved back is no longer at the end.
//
// A negative n never matches; an n of zero always does.
func (s *stringLexer) AcceptN(valid string, n int) bool {
	if n < 0 {
		return false
	}

	startPos, wasDone := s.pos, s.done
	for i := 0; i < n; i++ {
		if !s.Accept(valid) {
			s.pos, s.done = startPos, wasDone
			return false
		}
	}
	return true
}
// AcceptUntil consumes runes up to, but not including, the first
// occurrence of r. If r never occurs the rest of the input is consumed
// and the done flag ends up set.
func (s *stringLexer) AcceptUntil(r rune) {
	for {
		if s.Next() == r || s.done {
			break
		}
	}
	// Un-read the delimiter; a no-op if we stopped at the end of input
	s.Backup()
}
// AcceptRun consumes as many consecutive runes from the set valid as it
// can, stopping in front of the first rune that is not in the set (or at
// the end of the input).
func (s *stringLexer) AcceptRun(valid string) {
	for {
		if !strings.ContainsRune(valid, s.Next()) {
			break
		}
	}
	// Un-read the rune that ended the run
	s.Backup()
}
// String returns the unescaped representation of the input
// that has been lexed so far. Usually it would only be
// called after all the input has been processed.
//
// Tokens are decoded one at a time with item.String, with one exception:
// JavaScript strings are sequences of UTF-16 code units, so a character
// outside the Basic Multilingual Plane is commonly written as two unicode
// escapes, a high surrogate followed by a low surrogate (e.g. \uD83D\uDE00).
// Such a pair is combined into the single code point it encodes. A
// surrogate that is not part of a pair is decoded on its own as before.
func (s *stringLexer) String() string {
	// surrogate returns the code unit held by a unicode escape token,
	// provided the token is one and its value lies within [lo, hi].
	surrogate := func(tok item, lo, hi rune) (rune, bool) {
		if tok.typ != itemUnicodeEscape && tok.typ != itemCodepointEscape {
			return 0, false
		}
		n, err := strconv.ParseUint(tok.val, 16, 32)
		if err != nil || n < uint64(lo) || n > uint64(hi) {
			return 0, false
		}
		return rune(n), true
	}

	var out strings.Builder
	for idx := 0; idx < len(s.items); idx++ {
		tok := s.items[idx]

		if high, ok := surrogate(tok, 0xD800, 0xDBFF); ok {
			// DecodeString emits a literal text token in front of every
			// escape; it is empty when two escapes are directly adjacent,
			// so look past empty literals to find the partner.
			next := idx + 1
			for next < len(s.items) && s.items[next].typ == itemString && s.items[next].val == "" {
				next++
			}
			if next < len(s.items) {
				if low, ok := surrogate(s.items[next], 0xDC00, 0xDFFF); ok {
					out.WriteRune(0x10000 + (high-0xD800)<<10 + (low - 0xDC00))
					idx = next
					continue
				}
			}
		}

		out.WriteString(tok.String())
	}
	return out.String()
}
// DecodeString accepts a raw string as it might be found in some
// JavaScript source code, and converts any escape sequences. E.g:
// foo\x3dbar -> foo=bar // Hex escapes
// foo\u003Dbar -> foo=bar // Unicode escapes
// foo\u{003D}bar -> foo=bar // Braced unicode escapes
// foo\075bar -> foo=bar // Octal escape
// foo\"bar -> foo"bar // Single character escapes
func DecodeString(in string) string {
in = dequote(in)
l := newStringLexer(in)
validHex := "0123456789abcdefABCDEF"
for !l.done {
l.AcceptUntil('\\')
l.Emit(itemString)
if l.done {
break
}
// Ignore the backslash
l.Next()
l.Ignore()
switch l.Next() {
case 'b', 'f', 'n', 'r', 't', 'v', '\'', '"', '\\':
l.Emit(itemSingleEscape)
case '0':
// It's a \0 (null)
if !unicode.IsDigit(l.Peek()) {
l.Emit(itemSingleEscape)
continue
}
// It's an octal escape
l.AcceptRun("01234567")
l.Emit(itemOctalEscape)
case 'x':
// ignore the x
l.Ignore()
// Exactly 2 hex digits
if l.AcceptN(validHex, 2) {
l.Emit(itemHexEscape)
}
case 'u':
// ignore the u
l.Ignore()
// e.g. \u{00003d}
if l.Accept("{") {
l.Ignore()
l.AcceptRun(validHex)
l.Emit(itemCodepointEscape)
if l.Accept("}") {
l.Ignore()
}
}
// e.g. \u003d
if l.AcceptN(validHex, 4) {
l.Emit(itemUnicodeEscape)
}
}
}
return l.String()
}