Skip to content

Commit a58a092

Browse files
author
John Messerly
committed
Merge pull request #5 from dart-lang/encoding_bug
Fixes encoding parser to handle whitespace correctly.
2 parents fc79cd5 + 08c86f8 commit a58a092

File tree

2 files changed

+35
-16
lines changed

2 files changed

+35
-16
lines changed

lib/src/encoding_parser.dart

Lines changed: 9 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
library encoding_parser;
22

3-
import 'dart:collection';
43
import 'constants.dart';
54
import 'inputstream.dart';
65

@@ -9,14 +8,12 @@ import 'inputstream.dart';
98
/// String-like object with an associated position and various extra methods.
109
/// If the position is ever greater than the string length then an exception is
1110
/// raised.
12-
class EncodingBytes extends IterableBase<String> {
11+
class EncodingBytes {
1312
final String _bytes;
1413
int _position = -1;
1514

1615
EncodingBytes(this._bytes);
1716

18-
Iterator<String> get iterator => _bytes.split('').iterator;
19-
2017
int get length => _bytes.length;
2118

2219
String next() {
@@ -145,25 +142,21 @@ class EncodingParser {
145142
];
146143

147144
try {
148-
for (var byte in data) {
149-
var keepParsing = true;
145+
for (;;) {
150146
for (var dispatch in methodDispatch) {
151147
if (data.matchBytes(dispatch[0])) {
152-
try {
153-
keepParsing = dispatch[1]();
154-
break;
155-
} on StateError catch (e) {
156-
keepParsing = false;
157-
break;
158-
}
148+
var keepParsing = dispatch[1]();
149+
if (keepParsing) break;
150+
151+
// We found an encoding. Stop.
152+
return encoding;
159153
}
160154
}
161-
if (!keepParsing) {
162-
break;
163-
}
155+
data.position += 1;
164156
}
165157
} on StateError catch (e) {
166158
// Catch this here to match behavior of Python's StopIteration
159+
// TODO(jmesserly): refactor to not use exceptions
167160
}
168161
return encoding;
169162
}

test/parser_feature_test.dart

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@ import 'package:unittest/unittest.dart';
55
import 'package:html/dom.dart';
66
import 'package:html/parser.dart';
77
import 'package:html/src/constants.dart';
8+
import 'package:html/src/encoding_parser.dart';
89
import 'package:html/src/treebuilder.dart';
910

1011
main() {
@@ -291,4 +292,29 @@ On line 4, column 3 of ParseError: Unexpected DOCTYPE. Ignored.
291292
expect(c.text, 'qux');
292293
expect(e.text, 'bar');
293294
});
295+
296+
group('Encoding pre-parser', () {
297+
getEncoding(s) => new EncodingParser(s.codeUnits).getEncoding();
298+
299+
test('gets encoding from meta charset', () {
300+
expect(getEncoding('<meta charset="utf-16">'), 'utf-16');
301+
});
302+
303+
test('gets encoding from meta in head', () {
304+
expect(getEncoding('<head><meta charset="utf-16">'), 'utf-16');
305+
});
306+
307+
test('skips comments', () {
308+
expect(getEncoding('<!--comment--><meta charset="utf-16">'), 'utf-16');
309+
});
310+
311+
test('stops if no match', () {
312+
// missing closing tag
313+
expect(getEncoding('<meta charset="utf-16"'), null);
314+
});
315+
316+
test('ignores whitespace', () {
317+
expect(getEncoding(' <meta charset="utf-16">'), 'utf-16');
318+
});
319+
});
294320
}

0 commit comments

Comments
 (0)