forked from microsoft/WPF-Samples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHtmlLexicalAnalyzer.cs
745 lines (654 loc) · 28.7 KB
/
HtmlLexicalAnalyzer.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
// // Copyright (c) Microsoft. All rights reserved.
// // Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System;
using System.Diagnostics;
using System.IO;
using System.Text;
namespace HtmlToXamlDemo
{
/// <summary>
/// lexical analyzer class
/// recognizes tokens as groups of characters separated by arbitrary amounts of whitespace
/// also classifies tokens according to type
/// </summary>
internal class HtmlLexicalAnalyzer
{
// ---------------------------------------------------------------------
//
// Constructors
//
// ---------------------------------------------------------------------
#region Constructors
/// <summary>
/// Creates a lexical analyzer over the given text and primes it so that
/// NextCharacter already holds the first character of the input.
/// </summary>
/// <param name="inputTextString">
/// text to be tokenized
/// </param>
internal HtmlLexicalAnalyzer(string inputTextString)
{
    // Token state: nothing accumulated yet; leading whitespace is ignored
    _nextToken = new StringBuilder(100);
    NextTokenType = HtmlTokenType.Text;
    _ignoreNextWhitespace = true;

    // Character state: pretend a blank precedes the input
    _previousCharacter = ' ';
    NextCharacter = ' ';
    _nextCharacterCode = 0;

    // Load the lookahead slot with the first character code (-1 for empty input)
    _inputStringReader = new StringReader(inputTextString);
    _lookAheadCharacterCode = _inputStringReader.Read();
    _lookAheadCharacter = (char) _lookAheadCharacterCode;

    // Shift the lookahead into NextCharacter so the property has a real value
    GetNextCharacter();
}
#endregion Constructors
// ---------------------------------------------------------------------
//
// Internal methods
//
// ---------------------------------------------------------------------
#region Internal Methods
/// <summary>
/// Reads the next token from content (text outside of a tag) and stores its
/// text in NextToken and its type in NextTokenType.
/// Produces Eof at end of stream, OpeningTagStart or ClosingTagStart at the
/// beginning of a tag, the result of the directive readers for input starting
/// with an exclamation-mark directive, and Text otherwise.
/// </summary>
internal void GetNextContentToken()
{
    Debug.Assert(NextTokenType != HtmlTokenType.Eof);
    _nextToken.Length = 0;

    if (IsAtEndOfStream)
    {
        NextTokenType = HtmlTokenType.Eof;
        return;
    }

    if (IsAtTagStart)
    {
        // step over the opening angle bracket
        GetNextCharacter();
        if (NextCharacter != '/')
        {
            NextTokenType = HtmlTokenType.OpeningTagStart;
            _nextToken.Append("<");
            // Whitespace following an opening tag is insignificant
            _ignoreNextWhitespace = true;
            return;
        }

        _nextToken.Append("</");
        NextTokenType = HtmlTokenType.ClosingTagStart;
        // step over the slash
        GetNextCharacter();
        // Whitespace following a closing tag is significant
        _ignoreNextWhitespace = false;
        return;
    }

    if (IsAtDirectiveStart)
    {
        // move onto the '!' so that the lookahead tells the directive kinds apart
        GetNextCharacter();
        switch (_lookAheadCharacter)
        {
            case '[':
                // CDATA or another bracketed directive
                ReadDynamicContent();
                break;
            case '-':
                ReadComment();
                break;
            default:
                // something like DOCTYPE - skipped up to the next tag end
                ReadUnknownDirective();
                break;
        }
        return;
    }

    // Plain text: accumulate until a tag, a directive or the end of input
    NextTokenType = HtmlTokenType.Text;
    while (!(IsAtTagStart || IsAtEndOfStream || IsAtDirectiveStart))
    {
        if (NextCharacter == '<' && !IsNextCharacterEntity && _lookAheadCharacter == '?')
        {
            // processing directives do not contribute to the text
            SkipProcessingDirective();
            continue;
        }

        if (NextCharacter > ' ')
        {
            _nextToken.Append(NextCharacter);
            _ignoreNextWhitespace = false;
        }
        else
        {
            // A run of whitespace/control characters collapses to one blank,
            // and to nothing at all when whitespace is currently being ignored
            if (!_ignoreNextWhitespace)
            {
                _nextToken.Append(' ');
            }
            _ignoreNextWhitespace = true;
        }

        GetNextCharacter();
    }
}
/// <summary>
/// Unconditionally returns a token which is one of: TagEnd, EmptyTagEnd, Name, Atom or Eof.
/// Leading whitespace (and any directives handled by SkipWhiteSpace) is skipped first.
/// Does not guarantee token reader advancing.
/// </summary>
internal void GetNextTagToken()
{
    _nextToken.Length = 0;
    if (IsAtEndOfStream)
    {
        NextTokenType = HtmlTokenType.Eof;
        return;
    }

    SkipWhiteSpace();

    if (IsAtEndOfStream)
    {
        // Skipping whitespace may have consumed the rest of the input (for example
        // an unterminated tag followed only by blanks). Report Eof here: falling
        // through would append the end-of-stream marker character as an Atom and
        // then call GetNextCharacter at end of stream, which throws.
        NextTokenType = HtmlTokenType.Eof;
        return;
    }

    if (NextCharacter == '>' && !IsNextCharacterEntity)
    {
        // an entity-encoded '>' must not end a tag, hence the entity check
        NextTokenType = HtmlTokenType.TagEnd;
        _nextToken.Append('>');
        GetNextCharacter();
        // Note: _ignoreNextWhitespace must be set appropriately on tag start processing
    }
    else if (NextCharacter == '/' && _lookAheadCharacter == '>')
    {
        // closing of an empty tag
        NextTokenType = HtmlTokenType.EmptyTagEnd;
        _nextToken.Append("/>");
        GetNextCharacter();
        GetNextCharacter();
        _ignoreNextWhitespace = false; // Whitespace after no-scope tags is significant
    }
    else if (IsGoodForNameStart(NextCharacter))
    {
        NextTokenType = HtmlTokenType.Name;
        // Character entities are allowed inside a name.
        // If the end of stream is reached while reading the name, simply stop and
        // return what has been collected; the next call reports Eof.
        while (IsGoodForName(NextCharacter) && !IsAtEndOfStream)
        {
            _nextToken.Append(NextCharacter);
            GetNextCharacter();
        }
    }
    else
    {
        // Unexpected kind of token for a tag. Report one character as Atom, expecting that HtmlParser will ignore it.
        NextTokenType = HtmlTokenType.Atom;
        _nextToken.Append(NextCharacter);
        GetNextCharacter();
    }
}
/// <summary>
/// Always produces an EqualSign token whose text is "=".
/// A real equal sign is consumed from the input only if one follows the
/// skipped whitespace; otherwise the reader position is left on whatever
/// comes next, so the reader is not guaranteed to advance.
/// </summary>
internal void GetNextEqualSignToken()
{
    Debug.Assert(NextTokenType != HtmlTokenType.Eof);

    NextTokenType = HtmlTokenType.EqualSign;
    _nextToken.Length = 0;
    _nextToken.Append('=');

    SkipWhiteSpace();
    if (NextCharacter != '=')
    {
        return;
    }

    // '=' is not in the list of entities, so no entity check is needed here
    GetNextCharacter();
}
/// <summary>
/// Always produces an Atom token holding an attribute value, which may be empty.
/// A value that starts with a single or double quote runs to the matching
/// quote (or the end of input); an unquoted value runs to the next
/// whitespace character, '>' or the end of input.
/// Does not guarantee token reader advancing.
/// </summary>
internal void GetNextAtomToken()
{
    Debug.Assert(NextTokenType != HtmlTokenType.Eof);
    _nextToken.Length = 0;
    SkipWhiteSpace();
    NextTokenType = HtmlTokenType.Atom;

    var startsWithQuote = (NextCharacter == '\'' || NextCharacter == '"') && !IsNextCharacterEntity;
    if (!startsWithQuote)
    {
        // Unquoted value
        while (!(IsAtEndOfStream || char.IsWhiteSpace(NextCharacter) || NextCharacter == '>'))
        {
            _nextToken.Append(NextCharacter);
            GetNextCharacter();
        }
        return;
    }

    // Quoted value: remember which quote opened it and step past it
    var quote = NextCharacter;
    GetNextCharacter();

    // Take everything up to a matching quote that was not written as an entity
    while (!IsAtEndOfStream && (NextCharacter != quote || IsNextCharacterEntity))
    {
        _nextToken.Append(NextCharacter);
        GetNextCharacter();
    }

    // Step past the closing quote when there is one
    if (NextCharacter == quote)
    {
        GetNextCharacter();
    }

    // NOTE: this recovery differs from IE's.
    // IE keeps reading until it finds a closing quote or the end of file;
    // at end of file it treats the current value as text, and if it finds a
    // closing quote anywhere it eats everything between the quotes.
    // TODO: Suggestion: stop at end of file or at an angle bracket of any kind
    // and assume a quote was there, so that the attribute value may be
    // meaningless but is never treated as text.
}
#endregion Internal Methods
// ---------------------------------------------------------------------
//
// Internal Properties
//
// ---------------------------------------------------------------------
#region Internal Properties
/// <summary>
/// Type of the token produced by the most recent GetNext...Token call.
/// </summary>
internal HtmlTokenType NextTokenType { get; private set; }

/// <summary>
/// Text of the token produced by the most recent GetNext...Token call.
/// </summary>
internal string NextToken
{
    get { return _nextToken.ToString(); }
}
#endregion Internal Properties
// ---------------------------------------------------------------------
//
// Private methods
//
// ---------------------------------------------------------------------
#region Private Methods
/// <summary>
/// Advances the reading position by one character: the current lookahead
/// character becomes NextCharacter and a new lookahead character is read.
/// If the new NextCharacter is an ampersand that starts a character entity
/// (numeric, written as ampersand-hash-digits-semicolon, or named, written as
/// ampersand-name-semicolon), the whole entity is consumed, NextCharacter
/// becomes the decoded character and IsNextCharacterEntity is set to true.
/// </summary>
/// <remarks>
/// An ampersand followed by neither '#' nor a letter is left as a literal ampersand.
/// A malformed or unknown entity is dropped: the ampersand and the characters
/// scanned after it are discarded and NextCharacter becomes the character that
/// follows them.
/// </remarks>
/// <exception cref="InvalidOperationException">
/// Thrown when called while already at the end of the stream.
/// </exception>
private void GetNextCharacter()
{
    if (_nextCharacterCode == -1)
    {
        throw new InvalidOperationException("GetNextCharacter method called at the end of a stream");
    }

    _previousCharacter = NextCharacter;
    NextCharacter = _lookAheadCharacter;
    _nextCharacterCode = _lookAheadCharacterCode;
    // next character not an entity as of now
    IsNextCharacterEntity = false;
    ReadLookAheadCharacter();

    if (NextCharacter != '&')
    {
        return;
    }

    if (_lookAheadCharacter == '#')
    {
        // numeric entity - parse digits - &#DDDDD;
        var entityCode = 0;
        ReadLookAheadCharacter();
        // largest numeric entity is 7 characters.
        // Only ASCII digits are accepted: char.IsDigit would also accept other
        // Unicode decimal digits, for which subtracting '0' gives a wrong value.
        for (var i = 0; i < 7 && _lookAheadCharacter >= '0' && _lookAheadCharacter <= '9'; i++)
        {
            entityCode = 10*entityCode + (_lookAheadCharacter - '0');
            ReadLookAheadCharacter();
        }

        if (_lookAheadCharacter == ';')
        {
            // correct format - advance
            ReadLookAheadCharacter();
            // A code that does not fit into a char is replaced by '?' rather than
            // being silently truncated by the cast
            NextCharacter = entityCode <= char.MaxValue ? (char) entityCode : '?';
            _nextCharacterCode = NextCharacter;
            // as far as we are concerned, this is an entity
            IsNextCharacterEntity = true;
        }
        else
        {
            // not an entity: continue with the current lookahead character
            // (the ampersand, '#' and any digits read so far are dropped)
            NextCharacter = _lookAheadCharacter;
            _nextCharacterCode = _lookAheadCharacterCode;
            ReadLookAheadCharacter();
            IsNextCharacterEntity = false;
        }
    }
    else if (char.IsLetter(_lookAheadCharacter))
    {
        // entity is written as a name
        var entity = "";
        // maximum length of named entities is 10 characters
        for (var i = 0;
            i < 10 && (char.IsLetter(_lookAheadCharacter) || char.IsDigit(_lookAheadCharacter));
            i++)
        {
            entity += _lookAheadCharacter;
            ReadLookAheadCharacter();
        }

        if (_lookAheadCharacter == ';')
        {
            // advance past the semicolon
            ReadLookAheadCharacter();
            if (HtmlSchema.IsEntity(entity))
            {
                NextCharacter = HtmlSchema.EntityCharacterValue(entity);
                _nextCharacterCode = NextCharacter;
                IsNextCharacterEntity = true;
            }
            else
            {
                // unknown entity name - skip the whole thing
                // and move on to the next character
                NextCharacter = _lookAheadCharacter;
                _nextCharacterCode = _lookAheadCharacterCode;
                ReadLookAheadCharacter();
                // not an entity
                IsNextCharacterEntity = false;
            }
        }
        else
        {
            // no terminating semicolon: skip whatever was read after the ampersand
            // and move on. The character code must be updated together with the
            // character; otherwise an unterminated entity at the very end of the
            // input is not recognised as end of stream and the end-of-stream
            // marker character leaks into the token.
            NextCharacter = _lookAheadCharacter;
            _nextCharacterCode = _lookAheadCharacterCode;
            ReadLookAheadCharacter();
            IsNextCharacterEntity = false;
        }
    }
}
/// <summary>
/// Reads one more character code from the input into the lookahead slot.
/// Does nothing once the lookahead has reached the end of the input (code -1).
/// </summary>
private void ReadLookAheadCharacter()
{
    if (_lookAheadCharacterCode == -1)
    {
        return;
    }

    var code = _inputStringReader.Read();
    _lookAheadCharacterCode = code;
    _lookAheadCharacter = (char) code;
}
/// <summary>
/// Skips whitespace in the input, together with any directives that start with
/// an opening angle bracket followed by '?' or '!' (processing instructions,
/// comments, CDATA blocks).
/// Leaves the first character that is neither whitespace nor part of such a
/// directive in the NextCharacter property; this may be the end-of-stream
/// marker, no check is performed for it.
/// </summary>
private void SkipWhiteSpace()
{
    // TODO: handle character entities while processing comments, cdata, and directives
    // TODO: SUGGESTION: we could check if lookahead and previous characters are entities also
    while (true)
    {
        if (NextCharacter == '<' && (_lookAheadCharacter == '?' || _lookAheadCharacter == '!'))
        {
            // move onto the '?' / '!'
            GetNextCharacter();
            if (_lookAheadCharacter == '[')
            {
                // Skip CDATA block and DTDs(?): scan for the closing "]]>"
                while (!IsAtEndOfStream &&
                       !(_previousCharacter == ']' && NextCharacter == ']' && _lookAheadCharacter == '>'))
                {
                    GetNextCharacter();
                }

                if (!IsAtEndOfStream)
                {
                    // The scan stops ON the second ']' with '>' still in the lookahead,
                    // so two steps are needed to get past the terminator. Without them
                    // the "]" and ">" would be reported as tokens of the enclosing tag.
                    GetNextCharacter(); // now on '>'
                    GetNextCharacter(); // past '>'
                }
            }
            else
            {
                // Skip processing instruction, comments
                while (!IsAtEndOfStream && NextCharacter != '>')
                {
                    GetNextCharacter();
                }

                if (NextCharacter == '>')
                {
                    GetNextCharacter();
                }
            }

            // Look again from the new position: another directive may follow directly
            continue;
        }

        if (!char.IsWhiteSpace(NextCharacter))
        {
            break;
        }

        GetNextCharacter();
    }
}
/// <summary>
/// Tells whether a character may begin a name: a letter or an underscore.
/// </summary>
/// <param name="character">
/// character to be tested
/// </param>
/// <returns>
/// true if the character can be the first character of a name, false otherwise
/// </returns>
private bool IsGoodForNameStart(char character)
{
    return char.IsLetter(character) || character == '_';
}
/// <summary>
/// Tells whether a character may appear inside a name after its first character:
/// anything accepted by IsGoodForNameStart, a digit, one of '.', '-' and ':',
/// or a character accepted by IsCombiningCharacter or IsExtender.
/// </summary>
/// <param name="character">
/// character to be tested
/// </param>
/// <returns>
/// true if the character can be part of a name, false otherwise
/// </returns>
private bool IsGoodForName(char character)
{
    switch (character)
    {
        case '.':
        case '-':
        case ':':
            return true;
        default:
            return IsGoodForNameStart(character) ||
                   char.IsDigit(character) ||
                   IsCombiningCharacter(character) ||
                   IsExtender(character);
    }
}
/// <summary>
/// Tells whether a character is a combining character permitted in a name.
/// Placeholder: currently rejects every character.
/// TODO: compare against the list of combining characters in the XML documentation.
/// </summary>
/// <param name="character">
/// character to be tested
/// </param>
/// <returns>
/// always false in the current implementation
/// </returns>
private bool IsCombiningCharacter(char character)
{
    return false;
}
/// <summary>
/// Tells whether a character is an extender permitted in a name.
/// Placeholder: currently rejects every character.
/// TODO: compare against the list of extenders in the XML documentation.
/// </summary>
/// <param name="character">
/// character to be tested
/// </param>
/// <returns>
/// always false in the current implementation
/// </returns>
private bool IsExtender(char character)
{
    return false;
}
/// <summary>
/// Skips dynamic content that starts with "&lt;![" and ends with "]&gt;"
/// (CDATA included) and reports it as an empty Text token.
/// </summary>
private void ReadDynamicContent()
{
    // the caller must have positioned the reader on the '!' of "<!["
    Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' && _lookAheadCharacter == '[');

    // The skipped content is reported as empty text
    _nextToken.Length = 0;
    NextTokenType = HtmlTokenType.Text;

    // Two steps: onto the '[' and then onto the first character of the content
    GetNextCharacter();
    GetNextCharacter();

    // NOTE: 10/12/2004: the scan stops at "]>" and not at "]]>", because some
    // directives start with "<![" and simply end with "]>".
    // As a consequence CDATA, or anything else inside its own set of brackets
    // within the directive, cannot contain a "]>" sequence.
    while (!IsAtEndOfStream && (NextCharacter != ']' || _lookAheadCharacter != '>'))
    {
        GetNextCharacter();
    }

    if (IsAtEndOfStream)
    {
        return;
    }

    // onto the closing '>' ...
    GetNextCharacter();
    // ... and past it, to the first character after the directive
    GetNextCharacter();
}
/// <summary>
/// Reads a comment and reports it as a Comment token whose text is the comment content.
/// The comment is expected to start with "&lt;!-" and ends at the standard "--&gt;"
/// or at the nonstandard "!&gt;", which many actual html pages use.
/// A comment that is not terminated before the end of the input is returned
/// with whatever content was read.
/// </summary>
private void ReadComment()
{
    // verify that we are at a comment
    Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' && _lookAheadCharacter == '-');

    // Initialize a token
    NextTokenType = HtmlTokenType.Comment;
    _nextToken.Length = 0;

    // Step over the first '-', the second '-' and onto the first character of the
    // comment content. The input may end anywhere inside this prefix, so stop
    // stepping as soon as the end of stream is reached instead of letting
    // GetNextCharacter throw.
    for (var i = 0; i < 3 && !IsAtEndOfStream; i++)
    {
        GetNextCharacter();
    }

    while (!IsAtEndOfStream)
    {
        // Read text until a possible end of comment
        // Note that in many actual html pages comments end with "!>" (while xml standard is "-->")
        while (!IsAtEndOfStream &&
               !(NextCharacter == '-' && _lookAheadCharacter == '-' ||
                 NextCharacter == '!' && _lookAheadCharacter == '>'))
        {
            _nextToken.Append(NextCharacter);
            GetNextCharacter();
        }

        if (IsAtEndOfStream)
        {
            // Unterminated comment: keep what was read. Advancing further would
            // make GetNextCharacter throw.
            break;
        }

        // Step onto the second character of the candidate terminator
        GetNextCharacter();
        if (_previousCharacter == '-' && NextCharacter == '-' && _lookAheadCharacter == '>')
        {
            // Standard comment end. Eat it and exit the loop
            GetNextCharacter(); // get '>'
            break;
        }

        if (_previousCharacter == '!' && NextCharacter == '>')
        {
            // Nonstandard but possible comment end - '!>'. Exit the loop
            break;
        }

        // Not an end. Save the character that was stepped over and continue reading
        _nextToken.Append(_previousCharacter);
    }

    // Step past the '>' that closes the comment
    if (NextCharacter == '>')
    {
        GetNextCharacter();
    }
}
/// <summary>
/// Skips a directive that starts with "&lt;!" but is neither a comment nor
/// bracketed content (for example DOCTYPE) and reports it as an empty Text token.
/// Everything up to and including the next '&gt;' that was not written as an
/// entity is ignored.
/// </summary>
private void ReadUnknownDirective()
{
    // the caller must have positioned the reader on the '!' of an unknown directive
    Debug.Assert(_previousCharacter == '<' && NextCharacter == '!' &&
                 _lookAheadCharacter != '-' && _lookAheadCharacter != '[');

    // The skipped directive is reported as empty text
    _nextToken.Length = 0;
    NextTokenType = HtmlTokenType.Text;

    // step past the '!'
    GetNextCharacter();

    // run to the first real tag end
    while (!IsAtEndOfStream && (NextCharacter != '>' || IsNextCharacterEntity))
    {
        GetNextCharacter();
    }

    if (IsAtEndOfStream)
    {
        return;
    }

    // step past the tag end
    GetNextCharacter();
}
/// <summary>
/// Skips a processing directive that starts with "&lt;?" and ends with "?&gt;".
/// NOTE: 10/14/2004: IE also ends processing directives with "/&gt;", so that
/// ending is recognized as well.
/// </summary>
private void SkipProcessingDirective()
{
    // the reader must be positioned on the '<' of "<?"
    Debug.Assert(NextCharacter == '<' && _lookAheadCharacter == '?');

    // Two steps: onto the '?' and then onto the first character of the directive
    GetNextCharacter();
    GetNextCharacter();

    // No entity check is needed: '?' is not an entity, and although '>' can be
    // written as one, the lookahead character is never entity-decoded
    while (!IsAtEndOfStream &&
           !(_lookAheadCharacter == '>' && (NextCharacter == '?' || NextCharacter == '/')))
    {
        GetNextCharacter();
    }

    if (IsAtEndOfStream)
    {
        return;
    }

    // onto the closing '>' ...
    GetNextCharacter();
    // ... and past it, to the first character after the directive
    GetNextCharacter();
}
#endregion Private Methods
// ---------------------------------------------------------------------
//
// Private Properties
//
// ---------------------------------------------------------------------
#region Private Properties
// current character: read from the input but not yet consumed by a token
private char NextCharacter { get; set; }

// true once the current character code is the end-of-input marker (-1)
private bool IsAtEndOfStream
{
    get { return _nextCharacterCode == -1; }
}

// true on a literal '<' that is followed by '/' or by a name start character
private bool IsAtTagStart
{
    get
    {
        return !IsNextCharacterEntity &&
               NextCharacter == '<' &&
               (_lookAheadCharacter == '/' || IsGoodForNameStart(_lookAheadCharacter));
    }
}

// true on a literal '>' or on the '/' of a literal "/>"
private bool IsAtTagEnd
{
    get
    {
        return !IsNextCharacterEntity &&
               (NextCharacter == '>' || (NextCharacter == '/' && _lookAheadCharacter == '>'));
    }
}

// true on a literal '<' that is followed by '!'
private bool IsAtDirectiveStart
{
    get { return !IsNextCharacterEntity && NextCharacter == '<' && _lookAheadCharacter == '!'; }
}

// true when the current character was produced by decoding a character entity
private bool IsNextCharacterEntity { get; set; }
#endregion Private Properties
// ---------------------------------------------------------------------
//
// Private Fields
//
// ---------------------------------------------------------------------
#region Private Fields
// string reader which will move over input text
private readonly StringReader _inputStringReader;
// code of the current character (the one exposed as NextCharacter), i.e. the
// next character read from input that is not yet part of any token;
// -1 means the end of the input has been reached (see IsAtEndOfStream)
private int _nextCharacterCode;
// code of the character following the current one, as returned by the
// reader's Read method (-1 at the end of the input)
private int _lookAheadCharacterCode;
// _lookAheadCharacterCode cast to char; entities are not decoded at this stage
private char _lookAheadCharacter;
// value NextCharacter had before the most recent GetNextCharacter call
private char _previousCharacter;
// when true, whitespace met while reading text content is dropped instead of
// being emitted as a single blank
private bool _ignoreNextWhitespace;
// accumulates the text of the current token; exposed through the NextToken property
private readonly StringBuilder _nextToken;
#endregion Private Fields
}
}