tokenize.js
const {createTokenizer, regexRule} = require('doken')
const iconv = require('./iconv-lite')
const jschardet = require('./jschardet')
const {unescapeString} = require('./helper')

const encodingDetectionProps = [
  'EV',
  'GN',
  'GC',
  'AN',
  'BT',
  'WT',
  'PW',
  'PB',
  'C'
]

const _tokenize = createTokenizer({
  rules: [
    regexRule('_whitespace', /\s+/y, {lineBreaks: true}),
    regexRule('parenthesis', /(\(|\))/y),
    regexRule('semicolon', /;/y),
    regexRule('prop_ident', /[A-Za-z]+/y),
    regexRule('c_value_type', /\[([^\\\]]|\\[^])*\]/y, {lineBreaks: true}),
    {
      type: 'invalid',
      match: (input, position) => ({length: 1})
    }
  ]
})
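
// Illustration (not part of the original source, a sketch of how the rules
// above split an SGF fragment):
//
//   ';B[aa]'  ->  {type: 'semicolon',    value: ';'}
//                 {type: 'prop_ident',   value: 'B'}
//                 {type: 'c_value_type', value: '[aa]'}
//
// Anything no rule matches falls through to the one-character `invalid` rule,
// and `_whitespace` covers the whitespace between tokens.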

exports.tokenizeIter = function*(contents) {
  for (let token of _tokenize(contents)) {
    token.progress = token.pos / (contents.length - 1)
    delete token.length

    yield token
  }
}
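
// Example usage (an illustrative sketch, assuming this module is required as
// `./tokenize`; the SGF string is made up):
//
//   const {tokenizeIter} = require('./tokenize')
//
//   for (let token of tokenizeIter('(;B[aa];W[bb])')) {
//     console.log(token.type, token.value, token.progress)
//   }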

exports.tokenizeBufferIter = function*(buffer, {encoding = null} = {}) {
  if (encoding != null) {
    yield* exports.tokenizeIter(iconv.decode(buffer, encoding))
    return
  }

  // Guess encoding from the start of the buffer
  let detectedEncoding = jschardet.detect(buffer.slice(0, 300)).encoding
  let contents = iconv.decode(buffer, detectedEncoding)
  let tokens = exports.tokenizeIter(contents)

  // Search the leading tokens for a CA property that declares the actual encoding
  let prelude = []

  while (true) {
    let next = tokens.next()
    if (next.done) break

    let {type, value} = next.value
    let lastToken = prelude[prelude.length - 1]

    prelude.push(next.value)

    if (
      type === 'c_value_type' &&
      lastToken != null &&
      lastToken.type === 'prop_ident' &&
      lastToken.value === 'CA'
    ) {
      encoding = unescapeString(value.slice(1, -1))
      break
    }
  }

  // Re-decode with the declared encoding if it differs from the guess and is
  // supported; otherwise reuse the tokens produced so far
  if (
    encoding != null &&
    encoding != detectedEncoding &&
    iconv.encodingExists(encoding)
  ) {
    yield* exports.tokenizeIter(iconv.decode(buffer, encoding))
  } else {
    yield* prelude
    yield* tokens
  }
}

exports.tokenize = contents => [...exports.tokenizeIter(contents)]

exports.tokenizeBuffer = (buffer, opts) => [
  ...exports.tokenizeBufferIter(buffer, opts)
]
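
// Example usage (an illustrative sketch; the file path and encoding name are
// made up). `tokenizeBuffer` guesses the encoding from the buffer unless one is
// passed explicitly:
//
//   const fs = require('fs')
//   const {tokenizeBuffer} = require('./tokenize')
//
//   let buffer = fs.readFileSync('./game.sgf')
//   let tokens = tokenizeBuffer(buffer)
//   // encoding guessed, then corrected via a CA[...] property if one is present
//
//   let shiftJis = tokenizeBuffer(buffer, {encoding: 'Shift_JIS'})
//   // forced encoding, skips detection entirely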