-
Notifications
You must be signed in to change notification settings - Fork 17
/
tctok.h
2486 lines (2055 loc) · 82.6 KB
/
tctok.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* $Header: d:/cvsroot/tads/tads3/tctok.h,v 1.5 1999/07/11 00:46:59 MJRoberts Exp $ */
/*
* Copyright (c) 1999, 2002 Michael J. Roberts. All Rights Reserved.
*
* Please see the accompanying license file, LICENSE.TXT, for information
* on using and copying this software.
*/
/*
Name
tctok.h - TADS3 compiler tokenizer and preprocessor
Function
Notes
The tokenizer is layered with the preprocessor, so that the preprocessor
can deal with include files, macro expansion, and preprocessor directives.
Modified
04/12/99 MJRoberts - Creation
*/
#ifndef TCTOK_H
#define TCTOK_H
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "os.h"
#include "t3std.h"
#include "utf8.h"
#include "vmhash.h"
#include "vmerr.h"
#include "tcerr.h"
#include "tcerrnum.h"
/* ------------------------------------------------------------------------ */
/*
* Constants
*/
/*
 * Tokenizer size limits.  These are compile-time constants; they are
 * declared 'const' at namespace scope in this header.
 */
/* maximum length of a symbol name, in characters (not bytes) */
const size_t TOK_SYM_MAX_LEN = 80;
/*
 * Maximum buffer required to hold a symbol, in bytes. Each UTF-8
 * character may take up three bytes, plus we need a null terminator
 * byte.  With the 80-character limit above this works out to
 * 3*80 + 1 = 241 bytes.
 */
const size_t TOK_SYM_MAX_BUFFER = (3*TOK_SYM_MAX_LEN + 1);
/* maximum #if nesting level */
const size_t TOK_MAX_IF_NESTING = 100;
/* maximum number of parameters per macro */
const int TOK_MAX_MACRO_ARGS = 128;
/*
 * Special token flag characters - these are characters that can't
 * occur in an input file (we guarantee this by converting any
 * occurrences of these characters to a space on reading input). We use
 * these to flag certain special properties of tokens in the input
 * buffer.
 *
 * We use ASCII characters in the control range (0x01 (^A) through 0x1A
 * (^Z), excluding 0x09 (tab), 0x0A (LF), 0x0D (CR), and 0x0C (Page
 * Feed)); a well-formed source file would never use any of these
 * characters in input. Even if it does, we won't get confused, since
 * we'll always translate these to a space if we find them in input; but
 * choosing characters that *should* never occur in valid input will
 * ensure that we never alter the meaning of valid source by this
 * translation.
 *
 * The flags defined below occupy the contiguous values 0x01 through
 * 0x08.
 */
/*
 * Macro parameter flag - we use this in the internal storage of a
 * #define expansion to flag where the formal parameters are mentioned,
 * so that we can substitute the actuals when expanding the macro.
 */
const char TOK_MACRO_FORMAL_FLAG = 0x01;
/*
 * Token fully expanded flag. Whenever we detect that a particular
 * token has been fully expanded in the course of a particular macro
 * expansion, we'll insert this byte before the token; on subsequent
 * re-scans, whenever we see this flag, we'll realize that the token
 * needs no further consideration of expansion.
 */
const char TOK_FULLY_EXPANDED_FLAG = 0x02;
/*
 * Macro substitution end marker. Each time we expand a macro, we'll
 * insert immediately after the macro expansion a special pseudo-token,
 * consisting of this flag followed by a pointer to the symbol table
 * entry for the symbol expanded. As we expand macros, we'll check to
 * see if any of these special flags appear in the buffer after the
 * macro about to be expanded. If we find such a flag matching the
 * symbol about to be expanded, we'll know the symbol has already been
 * fully expanded on a previous scan and thus must not be expanded
 * again.
 */
const char TOK_MACRO_EXP_END = 0x03;
/*
 * End-of-line flag. This serves as a local end-of-file marker for
 * preprocessor lines. Because preprocessor lines must be considered in
 * isolation, we need some way when parsing one to tell the tokenizer
 * not to try to read another line when it reaches the end of the
 * current line. This flag serves this purpose: when the tokenizer
 * encounters one of these flags, it will simply return end-of-file
 * until the caller explicitly reads a new source line.
 */
const char TOK_END_PP_LINE = 0x04;
/*
 * "#foreach" marker flag. This marks the presence of a #foreach token in
 * a macro's expansion. We leave the text of the expansion area intact,
 * but we replace the #foreach token with this marker character.
 */
const char TOK_MACRO_FOREACH_FLAG = 0x05;
/*
 * "#argcount" marker flag. This marks the presence of a #argcount token
 * in a macro's expansion.
 */
const char TOK_MACRO_ARGCOUNT_FLAG = 0x06;
/*
 * "#ifempty" and "#ifnempty" marker flags.  These mark the presence of
 * a #ifempty or #ifnempty token, respectively, in a macro's expansion.
 */
const char TOK_MACRO_IFEMPTY_FLAG = 0x07;
const char TOK_MACRO_IFNEMPTY_FLAG = 0x08;
/*
 * Macro format version number. The compiler sets up a predefined macro
 * (__TADS_MACRO_FORMAT_VERSION) with this information. Since 3.1, the
 * macro table is visible to user code via t3GetGlobalSymbols() (using the
 * T3PreprocMacros selector), and this information includes the parsed
 * format with the embedded flag codes. The macro information can be used
 * in DynamicFunc compilation at run-time.
 *
 * If we ever make incompatible changes to the internal format, future
 * interpreters will have to recognize older versions so that they can
 * make the necessary translations. By embedding the version information
 * in the table, we make this recognition possible.
 *
 * This is a preprocessor symbol rather than a typed constant, and its
 * current value is the integer literal 1.
 */
#define TCTOK_MACRO_FORMAT_VERSION 1
/* ------------------------------------------------------------------------ */
/*
 * Macro table. This is a virtualized version of our basic hash table, to
 * allow specialized versions that provide views on top of other table
 * structures.
 *
 * Every operation is pure virtual, so this class is an interface only:
 * concrete subclasses supply the storage and the lookup behavior.
 */
class CTcMacroTable
{
public:
/* virtual destructor, so subclasses can be deleted via this interface */
virtual ~CTcMacroTable() { }
/* add an entry */
virtual void add(CVmHashEntry *entry) = 0;
/* remove an entry */
virtual void remove(CVmHashEntry *entry) = 0;
/*
 * find an entry - 'str' and 'len' give the text to look up as a
 * counted (pointer + length) string
 */
virtual class CVmHashEntry *find(const char *str, size_t len) = 0;
/*
 * enumerate entries - 'func' is the callback, which receives a context
 * pointer and an entry; 'ctx' is the caller's context pointer
 * (presumably passed through to 'func' unchanged - confirm against the
 * implementations)
 */
virtual void enum_entries(
void (*func)(void *ctx, class CVmHashEntry *entry), void *ctx) = 0;
/* dump the hash table for debugging purposes */
virtual void debug_dump() = 0;
};
/* ------------------------------------------------------------------------ */
/*
 * #if state.  Each value describes which part of a #if/#elif/#else
 * construct is currently being read and whether that part is live:
 * the "YES" states are branches being processed as true, the "NO"
 * states are branches being processed as false.
 */
enum tok_if_t
{
TOKIF_NONE, /* not in a #if block at all */
TOKIF_IF_YES, /* processing a true #if branch */
TOKIF_IF_NO, /* processing a false #if branch */
TOKIF_IF_DONE, /* done with true #if/#elif; skip #elif's and #else */
TOKIF_ELSE_YES, /* processing a true #else branch */
TOKIF_ELSE_NO /* processing a false #else branch */
};
/*
 * #if stack entry.  One of these records the state of a single #if
 * nesting level, together with the source location of the #if that
 * opened the level.
 */
struct tok_if_info_t
{
/* current state of this #if level */
tok_if_t state;
/* file descriptor and line number of starting #if */
class CTcTokFileDesc *desc;
long linenum;
};
/* ------------------------------------------------------------------------ */
/*
 * Token Types.
 *
 * The enumerators have no explicit values, so they are numbered
 * consecutively from zero in the order listed; TOKT_INVALID is
 * therefore zero.  The list is organized as: special/administrative
 * types, macro placeholder types, literals (symbols, numbers, strings),
 * punctuation and operators, and finally the language keywords.
 */
enum tc_toktyp_t
{
TOKT_INVALID, /* invalid token */
TOKT_NULLTOK, /* null token - caller should read another token */
TOKT_EOF, /* end of file */
TOKT_MACRO_FORMAL, /* formal parameter replacement placeholder */
TOKT_MACRO_FOREACH, /* macro varargs #foreach placeholder */
TOKT_MACRO_ARGCOUNT, /* macro varargs #argcount placeholder */
TOKT_MACRO_IFEMPTY, /* #ifempty macro placeholder */
TOKT_MACRO_IFNEMPTY, /* #ifnempty macro placeholder */
TOKT_SYM, /* symbolic name */
TOKT_INT, /* integer */
TOKT_SSTR, /* single-quoted string */
TOKT_SSTR_START, /* start of an sstring with embedding - '...<< */
TOKT_SSTR_MID, /* middle of an sstring with embedding - >>...<< */
TOKT_SSTR_END, /* end of an sstring with embedding - >>...' */
TOKT_DSTR, /* double-quoted string */
TOKT_DSTR_START, /* start of a dstring with embedding - "...<< */
TOKT_DSTR_MID, /* middle of a dstring with embedding - >>...<< */
TOKT_DSTR_END, /* end of a dstring with embedding - >>..." */
TOKT_RESTR, /* regular expression string - R'...' or R"..." */
TOKT_LPAR, /* left paren '(' */
TOKT_RPAR, /* right paren ')' */
TOKT_COMMA, /* comma ',' */
TOKT_DOT, /* period '.' */
TOKT_LBRACE, /* left brace '{' */
TOKT_RBRACE, /* right brace '}' */
TOKT_LBRACK, /* left square bracket '[' */
TOKT_RBRACK, /* right square bracket ']' */
TOKT_EQ, /* equals sign '=' */
TOKT_EQEQ, /* double-equals sign '==' */
TOKT_ASI, /* colon-equals assignment operator ':=' */
TOKT_PLUS, /* plus sign '+' */
TOKT_MINUS, /* minus sign '-' */
TOKT_TIMES, /* multiplication symbol '*' */
TOKT_DIV, /* division symbol '/' */
TOKT_MOD, /* modulo '%' */
TOKT_GT, /* greater-than sign '>' */
TOKT_LT, /* less-than sign '<' */
TOKT_GE, /* greater-or-equal sign '>=' */
TOKT_LE, /* less-or-equal sign '<=' */
TOKT_NE, /* not-equals sign '!=' or '<>' */
TOKT_ARROW, /* arrow symbol '->' */
TOKT_COLON, /* colon ':' */
TOKT_SEM, /* semicolon ';' */
TOKT_AND, /* bitwise AND '&' */
TOKT_ANDAND, /* logical AND '&&' */
TOKT_OR, /* bitwise OR '|' */
TOKT_OROR, /* logical OR '||' */
TOKT_XOR, /* bitwise XOR '^' */
TOKT_SHL, /* shift left '<<' */
TOKT_ASHR, /* arithmetic shift right '>>' */
TOKT_LSHR, /* logical shift right '>>>' */
TOKT_INC, /* increment '++' */
TOKT_DEC, /* decrement '--' */
TOKT_PLUSEQ, /* plus-equals '+=' */
TOKT_MINEQ, /* minus-equals '-=' */
TOKT_TIMESEQ, /* times-equals '*=' */
TOKT_DIVEQ, /* divide-equals '/=' */
TOKT_MODEQ, /* mod-equals '%=' */
TOKT_ANDEQ, /* and-equals '&=' */
TOKT_OREQ, /* or-equals '|=' */
TOKT_XOREQ, /* xor-equals '^=' */
TOKT_SHLEQ, /* shift-left-and-assign '<<=' */
TOKT_ASHREQ, /* arithmetic shift-right-and-assign '>>=' */
TOKT_LSHREQ, /* logical shift-right-and-assign '>>>=' */
TOKT_NOT, /* logical not '!' */
TOKT_BNOT, /* bitwise not '~' */
TOKT_POUND, /* pound '#' */
TOKT_POUNDPOUND, /* double-pound '##' */
TOKT_POUNDAT, /* pound-at '#@' */
TOKT_ELLIPSIS, /* ellipsis '...' */
TOKT_QUESTION, /* question mark '?' */
TOKT_QQ, /* double question mark '??' */
TOKT_COLONCOLON, /* double-colon '::' */
TOKT_FLOAT, /* floating-point number */
TOKT_BIGINT, /* an integer promoted to a float due to overflow */
TOKT_AT, /* at-sign '@' */
TOKT_DOTDOT, /* range marker '..' */
TOKT_FMTSPEC, /* sprintf format spec for <<%fmt expr>> */
/*
 * keywords - one token type per reserved word (the mapping from
 * keyword text to these types is not defined in this enum)
 */
TOKT_SELF,
TOKT_INHERITED,
TOKT_ARGCOUNT,
TOKT_IF,
TOKT_ELSE,
TOKT_FOR,
TOKT_WHILE,
TOKT_DO,
TOKT_SWITCH,
TOKT_CASE,
TOKT_DEFAULT,
TOKT_GOTO,
TOKT_BREAK,
TOKT_CONTINUE,
TOKT_FUNCTION,
TOKT_RETURN,
TOKT_LOCAL,
TOKT_OBJECT,
TOKT_NIL,
TOKT_TRUE,
TOKT_PASS,
TOKT_EXTERNAL,
TOKT_EXTERN,
TOKT_FORMATSTRING,
TOKT_CLASS,
TOKT_REPLACE,
TOKT_MODIFY,
TOKT_NEW,
TOKT_DELETE,
TOKT_THROW,
TOKT_TRY,
TOKT_CATCH,
TOKT_FINALLY,
TOKT_INTRINSIC,
TOKT_DICTIONARY,
TOKT_GRAMMAR,
TOKT_ENUM,
TOKT_TEMPLATE,
TOKT_STATIC,
TOKT_FOREACH,
TOKT_EXPORT,
TOKT_DELEGATED,
TOKT_TARGETPROP,
TOKT_PROPERTYSET,
TOKT_TARGETOBJ,
TOKT_DEFININGOBJ,
TOKT_TRANSIENT,
TOKT_REPLACED,
TOKT_PROPERTY,
TOKT_OPERATOR,
TOKT_METHOD,
TOKT_INVOKEE
/*
 * type names - formerly reserved but later withdrawn; the old
 * enumerators are kept here, commented out, for reference only
 */
// TOKT_VOID,
// TOKT_INTKW,
// TOKT_STRING,
// TOKT_LIST,
// TOKT_BOOLEAN,
// TOKT_ANY
};
/* ------------------------------------------------------------------------ */
/*
 *   Source Block.  Quoted strings and symbol names read from the source
 *   file may still be needed after the tokenizer has moved on and
 *   flushed its line buffer.  Text that must outlive the line buffer is
 *   copied into source blocks, which stay in memory for the whole
 *   compilation, so the saved strings can be referenced at any time.
 */

/* number of bytes of text storage in each source block */
const size_t TCTOK_SRC_BLOCK_SIZE = 50000;

/*
 *   One fixed-size chunk of saved source text.  Blocks form a singly
 *   linked list; each block owns its successor, so deleting a block
 *   deletes the remainder of the list as well.
 */
class CTcTokSrcBlock
{
public:
    /* a new block starts out as the last block in its list */
    CTcTokSrcBlock() : nxt_(0) { }

    /* tear down the rest of the chain (deleting a null link is a no-op) */
    ~CTcTokSrcBlock()
    {
        delete nxt_;
    }

    /* successor link: read it, or replace it */
    CTcTokSrcBlock *get_next() const { return nxt_; }
    void set_next(CTcTokSrcBlock *blk) { nxt_ = blk; }

    /* start of this block's TCTOK_SRC_BLOCK_SIZE-byte text area */
    char *get_buf() { return buf_; }

private:
    /* following block in the list, or null if this is the last one */
    CTcTokSrcBlock *nxt_;

    /* the text storage itself */
    char buf_[TCTOK_SRC_BLOCK_SIZE];
};
/* ------------------------------------------------------------------------ */
/*
 *   String Buffer.  We use these buffers for reading input lines and
 *   expanding macros.
 *
 *   The object owns a heap buffer that grows on demand in multiples of
 *   4k bytes.  Every mutator defined here leaves the text null-terminated,
 *   so get_text() can be used as a C string once any text has been
 *   stored.  Before the first allocation, get_text() returns null.
 */
class CTcTokString
{
public:
    CTcTokString()
    {
        /* no buffer yet */
        buf_ = 0;
        buf_len_ = 0;
        buf_size_ = 0;
    }

    virtual ~CTcTokString()
    {
        /* delete our buffer */
        if (buf_ != 0)
            t3free(buf_);
    }

    /*
     *   Ensure that the buffer can hold 'siz' bytes of text plus a null
     *   terminator.  Throws TCERR_NO_STRBUF_MEM if the memory can't be
     *   obtained; in that case the existing buffer and its recorded size
     *   are left untouched, so the object remains usable and the old
     *   buffer is still released by the destructor.
     */
    virtual void ensure_space(size_t siz)
    {
        /* make sure there's room for the requested size plus a null byte */
        if (buf_size_ < siz + 1)
        {
            /* round the required size up to the next 4k increment */
            size_t new_size = (siz + 4095 + 1) & ~(size_t)4095;
            char *new_buf;

            /* allocate or re-allocate, keeping the old pointer for now */
            if (buf_ == 0)
                new_buf = (char *)t3malloc(new_size);
            else
                new_buf = (char *)t3realloc(buf_, new_size);

            /* throw an error if that failed */
            if (new_buf == 0)
                err_throw(TCERR_NO_STRBUF_MEM);

            /* success - commit the new buffer and its size together */
            buf_ = new_buf;
            buf_size_ = new_size;
        }
    }

    /* expand the buffer - grows the capacity by at least 4k bytes */
    void expand()
    {
        ensure_space(buf_size_ + 4096);
    }

    /* get the text and the length of the text */
    const char *get_text() const { return buf_; }
    size_t get_text_len() const { return buf_len_; }

    /* get the end of the text (the position of the null terminator) */
    const char *get_text_end() const { return buf_ + buf_len_; }

    /* append a null-terminated string to the buffer */
    virtual void append(const char *p) { append(p, strlen(p)); }

    /* append 'len' bytes starting at 'p' to the buffer */
    virtual void append(const char *p, size_t len)
    {
        /* make sure we have space available */
        ensure_space(buf_len_ + len);

        /* copy the text onto the end of our buffer, if there is any */
        if (len != 0)
            memcpy(buf_ + buf_len_, p, len);

        /* add it to the length of the text */
        buf_len_ += len;

        /* null-terminate it */
        buf_[buf_len_] = '\0';
    }

    /* prepend a null-terminated string */
    virtual void prepend(const char *p) { prepend(p, strlen(p)); }

    /* prepend 'len' bytes starting at 'p' */
    virtual void prepend(const char *p, size_t len)
    {
        /* make sure we have enough space */
        ensure_space(buf_len_ + len);

        /* move the existing text up to make room for the new text */
        memmove(buf_ + len, buf_, buf_len_);

        /* copy the new text to the start of the buffer */
        if (len != 0)
            memcpy(buf_, p, len);

        /* count the new size */
        buf_len_ += len;

        /*
         *   Write the terminator explicitly rather than moving the old
         *   one: a buffer that was only just allocated has no terminator
         *   to move.
         */
        buf_[buf_len_] = '\0';
    }

    /*
     *   Append a string to the buffer, enclosing the text in single or
     *   double quote (as given by 'qu', which must be either '"' or '\'')
     *   and backslash-escaping any occurrences of the same quote character
     *   found within the string.  Only the quote character is escaped;
     *   all other bytes, including backslashes, are copied unchanged.
     */
    void append_qu(char qu, const char *p) { append_qu(qu, p, strlen(p)); }
    void append_qu(char qu, const char *p, size_t len)
    {
        const char *start;

        /* append the open quote */
        append(&qu, 1);

        /* scan for quotes we'll need to escape */
        while (len != 0)
        {
            size_t rem;

            /* skip to the next quote */
            for (start = p, rem = len ; rem != 0 && *p != qu ; ++p, --rem) ;

            /* insert the chunk up to the quote */
            if (p != start)
                append(start, p - start);

            /* if we did find a quote, append it with a backslash escape */
            if (rem != 0)
            {
                /* append the backslash and the quote */
                append("\\", 1);
                append(&qu, 1);

                /* skip the quote in the source */
                ++p;
                --rem;
            }

            /* we now only have 'rem' left to consider */
            len = rem;
        }

        /* finally, append the closing quote */
        append(&qu, 1);
    }

    /*
     *   Insert 'len' bytes from 'p' into the buffer at byte offset 'ofs'.
     *   An offset at or beyond the end of the text is treated as an
     *   append.  (A negative offset converts to a very large unsigned
     *   value and so is also treated as an append.)
     */
    virtual void insert(int ofs, const char *p, size_t len)
    {
        /* check to see if there's anything after the insertion point */
        if ((size_t)ofs >= buf_len_)
        {
            /* nothing follows the insertion point - simply append */
            append(p, len);
            return;
        }

        /* ensure there's space for the added text */
        ensure_space(buf_len_ + len);

        /* move the text after the insertion point to open a gap */
        memmove(buf_ + ofs + len, buf_ + ofs, buf_len_ - ofs);

        /* copy the new text in at the given offset */
        if (len != 0)
            memcpy(buf_ + ofs, p, len);

        /* include the new text in our length */
        buf_len_ += len;

        /* terminate explicitly; don't assume a terminator was present */
        buf_[buf_len_] = '\0';
    }

    /* copy text into the buffer, replacing existing text */
    virtual void copy(const char *p, size_t len)
    {
        /* ensure we have enough space */
        ensure_space(len);

        /* copy the text */
        if (len != 0)
            memcpy(buf_, p, len);

        /* set our length */
        buf_len_ = len;

        /* null-terminate it */
        buf_[buf_len_] = '\0';
    }

    /* clear any existing text (the buffer itself is kept) */
    virtual void clear_text()
    {
        /* zero the length */
        buf_len_ = 0;

        /* put a null terminator at the start of the buffer if possible */
        if (buf_size_ > 0)
            buf_[0] = '\0';
    }

    /* get the buffer, for copying text directly into it */
    virtual char *get_buf() const { return buf_; }

    /* get the allocated size of the buffer, in bytes */
    size_t get_buf_size() const { return buf_size_; }

    /*
     *   Set the text length - use this after copying directly into the
     *   buffer to set the length, excluding the null terminator.  We'll
     *   add a null terminator at the given length, provided that position
     *   lies within the allocated buffer.
     */
    virtual void set_text_len(size_t len)
    {
        /* set the new length */
        buf_len_ = len;

        /* add a null terminator after the new length */
        if (len < buf_size_)
            buf_[len] = '\0';
    }

protected:
    /* buffer */
    char *buf_;

    /* size of the buffer */
    size_t buf_size_;

    /* length of the text in the buffer (excluding trailing null) */
    size_t buf_len_;
};
/*
 *   String buffer subclass for a non-allocated string that merely
 *   references another buffer.  This can be used anywhere a CTcTokString
 *   is required, but does not require any allocation.
 *
 *   These objects can only be used in 'const' contexts: the underlying
 *   buffer cannot be changed or expanded, since we do not own the
 *   underlying buffer.
 */
class CTcTokStringRef: public CTcTokString
{
public:
    CTcTokStringRef()
    {
        /* start out referring to nothing */
        buf_len_ = 0;
        buf_size_ = 0;
        buf_ = 0;
    }

    ~CTcTokStringRef()
    {
        /*
         *   The referenced memory belongs to someone else.  Drop our
         *   pointer so that the base class destructor, which frees a
         *   non-null buffer, leaves it alone.
         */
        buf_ = 0;
    }

    /*
     *   Mutators are not supported on a reference.  A space request is
     *   silently ignored; every other attempt to modify the text, or to
     *   obtain a writable pointer to it, trips an assertion.
     */
    virtual void ensure_space(size_t) { }
    virtual void append(const char *) { assert(FALSE); }
    virtual void append(const char *, size_t) { assert(FALSE); }
    virtual void prepend(const char *) { assert(FALSE); }
    virtual void prepend(const char *, size_t) { assert(FALSE); }
    virtual void insert(int, const char *, size_t) { assert(FALSE); }
    virtual void copy(const char *, size_t) { assert(FALSE); }
    virtual void clear_text() { assert(FALSE); }
    virtual char *get_buf() const { assert(FALSE); return 0; }
    virtual void set_text_len(size_t) { assert(FALSE); }

    /*
     *   Point this object at 'len' bytes of text starting at 'buf'.  The
     *   recorded buffer size is one byte more than the text length.
     */
    void set_buffer(const char *buf, size_t len)
    {
        buf_len_ = len;
        buf_size_ = len + 1;
        buf_ = const_cast<char *>(buf);
    }
};
/* ------------------------------------------------------------------------ */
/*
 *   String embedding context.  This keeps track of the token structure
 *   for embedded expressions within strings using << >>.
 *
 *   tok_embed_level describes one nesting level: the string that
 *   encloses the embedded expression currently being read.
 */
struct tok_embed_level
{
    /* parenthesis depth within the expression */
    int parens;

    /* token type to switch back to on ending the string */
    tc_toktyp_t endtok;

    /* the quote character for the enclosing string */
    wchar_t qu;

    /* true -> the enclosing string is a triple-quoted string */
    int triple;

    /*
     *   Initialize this level for an expression embedded in a string
     *   delimited by 'qu'.  A double-quote delimiter selects the dstring
     *   end token; any other delimiter selects the sstring end token.
     */
    void enter(wchar_t qu, int triple)
    {
        /* remember the enclosing string's delimiter style */
        this->qu = qu;
        this->triple = triple;

        /* no parentheses are open yet at this level */
        this->parens = 0;

        /* pick the token type that will close the enclosing string */
        if (qu == '"')
            this->endtok = TOKT_DSTR_END;
        else
            this->endtok = TOKT_SSTR_END;
    }
};
/*
 *   Stack of embedding levels.  'level' counts the embedded expressions
 *   currently open, and 's' points to the stack entry for the innermost
 *   one (null when no expression is open).
 */
struct tok_embed_ctx
{
    tok_embed_ctx() { reset(); }

    /* return to the "not in any embedded expression" state */
    void reset()
    {
        s = 0;
        level = 0;
    }

    /* begin an embedded expression (defined out of line) */
    void start_expr(wchar_t qu, int triple, int report);

    /* finish the innermost embedded expression */
    void end_expr()
    {
        /* pop one level, never going below zero */
        if (level > 0)
            --level;

        /* if nothing is open any longer, there's no current entry */
        if (level == 0)
        {
            s = 0;
            return;
        }

        /*
         *   Re-aim the stack pointer at the new innermost entry, but
         *   only while the level is below the capacity of the stack
         *   array; otherwise leave the pointer where it is.
         */
        if (level < countof(stk))
            s = stk + level - 1;
    }

    /* are we in an embedded expression? */
    int in_expr() const { return level != 0; }

    /* paren nesting at current level (zero outside any expression) */
    int parens() const
    {
        if (!in_expr())
            return 0;
        return s->parens;
    }

    /* adjust the paren nesting at the current level, clamping at zero */
    void parens(int inc)
    {
        if (!in_expr())
            return;

        s->parens += inc;
        if (s->parens < 0)
            s->parens = 0;
    }

    /* ending token type at current level */
    tc_toktyp_t endtok() const
    {
        if (!in_expr())
            return TOKT_INVALID;
        return s->endtok;
    }

    /* ending quote at current level */
    wchar_t qu() const
    {
        if (!in_expr())
            return 0;
        return s->qu;
    }

    /* ending quote is triple quote at current level */
    int triple() const
    {
        if (!in_expr())
            return FALSE;
        return s->triple;
    }

    /* nesting level */
    int level;

    /* stack pointer */
    tok_embed_level *s;

    /* stack */
    tok_embed_level stk[10];
};
/* ------------------------------------------------------------------------ */
/*
* Token
*/
class CTcToken
{
public:
CTcToken() { }
CTcToken(tc_toktyp_t typ) : typ_(typ) { }
/* get/set the token type */
tc_toktyp_t gettyp() const { return typ_; }
void settyp(tc_toktyp_t typ) { typ_ = typ; }
/* get/set the fully-expanded flag */
int get_fully_expanded() const { return fully_expanded_; }
void set_fully_expanded(int flag) { fully_expanded_ = flag; }
/* get/set the text pointer */
const char *get_text() const { return text_; }
size_t get_text_len() const { return text_len_; }
void set_text(const char *txt, size_t len)
{
text_ = txt;
text_len_ = len;
}
/* get/set the integer value */
ulong get_int_val() const { return int_val_; }
void set_int_val(ulong val)
{
typ_ = TOKT_INT;
int_val_ = val;
}
/*
* compare the text to the given string - returns true if the text
* matches, false if not
*/
int text_matches(const char *txt, size_t len) const
{
return (len == text_len_ && memcmp(txt, text_, len) == 0);
}
int text_matches(const char *txt) const
{
return text_matches(txt, txt != 0 ? strlen(txt) : 0);
}
/* copy from a another token */
void set(const CTcToken &tok)
{
typ_ = tok.typ_;
text_ = tok.text_;
text_len_ = tok.text_len_;
int_val_ = tok.int_val_;
fully_expanded_ = tok.fully_expanded_;
}
private:
/* token type */
tc_toktyp_t typ_;
/*
* Pointer to the token's text. This is a pointer into the
* tokenizer's symbol table or into the token list itself, so this
* pointer is valid as long as the tokenizer and its token list are
* valid.
*/
const char *text_;
size_t text_len_;
/* integer value - valid when the token type is TOKT_INT */
ulong int_val_;
/*
* flag: the token has been fully expanded, and should not be
* expanded further on any subsequent rescan for macros
*/
uint fully_expanded_ : 1;
};
/*
 *   Token list entry.  This is a token that additionally carries forward
 *   and backward links, so that it can serve as an element of a doubly
 *   linked list of tokens.
 */
class CTcTokenEle: public CTcToken
{
public:
    /* a new element is not linked to any neighbors */
    CTcTokenEle() : nxt_(0), prv_(0) { }

    /* forward link: read it, or replace it */
    CTcTokenEle *getnxt() const { return nxt_; }
    void setnxt(CTcTokenEle *nxt) { nxt_ = nxt; }

    /* backward link: read it, or replace it */
    CTcTokenEle *getprv() const { return prv_; }
    void setprv(CTcTokenEle *prv) { prv_ = prv; }

protected:
    /* neighboring elements in the list (null where there is none) */
    CTcTokenEle *nxt_;
    CTcTokenEle *prv_;
};
/* ------------------------------------------------------------------------ */
/*
 *   Macro Expansion Resource object.  This bundles the working buffers
 *   needed for one macro expansion.  To avoid frequent allocating and
 *   freeing of these resources, a pool of these objects is kept around
 *   for re-use.  The pool is expanded dynamically as necessary, so it
 *   imposes no pre-set limits; it simply avoids lots of memory
 *   allocation activity.
 */
class CTcMacroRsc
{
public:
    /* a new resource object is not yet on either list */
    CTcMacroRsc() : next_avail_(0), next_(0) { }

    /* working text: expansion of the whole line */
    CTcTokString line_exp_;

    /* working text: expansion of the current macro on the line */
    CTcTokString macro_exp_;

    /* working text: expansion of an actual parameter value */
    CTcTokString actual_exp_buf_;

    /* link to the next object in the "available" list */
    CTcMacroRsc *next_avail_;

    /* link to the next object in the master list */
    CTcMacroRsc *next_;
};
/* ------------------------------------------------------------------------ */
/*
 *   Abstract token source interface.  This is used to allow external
 *   code to inject their own substreams into the main token stream.
 *
 *   Subclasses implement get_next_token().  The base class only records
 *   the bookkeeping set by set_enclosing_source(): the source that
 *   encloses this one, and a copy of the token that was current when
 *   this source was inserted.
 */
class CTcTokenSource
{
public:
    /*
     *   A new source has no enclosing source.  Initializing the link
     *   here means get_enclosing_source() returns null, rather than an
     *   indeterminate pointer, until set_enclosing_source() is called.
     */
    CTcTokenSource() : enclosing_src_(0) { }

    virtual ~CTcTokenSource() { }

    /*
     *   Get the next token from the source.  Returns null if there are
     *   no more tokens.
     */
    virtual const CTcToken *get_next_token() = 0;

    /*
     *   Set the enclosing external token source and current token.
     *   'tok' must not be null: the token it points to is copied into
     *   this object, so the caller's token need not remain valid
     *   afterwards.
     */
    void set_enclosing_source(CTcTokenSource *src, const CTcToken *tok)
    {
        /* remember the enclosing source */
        enclosing_src_ = src;

        /* remember the current token (by value) */
        enclosing_curtok_ = *tok;
    }

    /* get the enclosing external token source (null if none was set) */
    CTcTokenSource *get_enclosing_source() const
        { return enclosing_src_; }

    /* get the token that was current when this source was inserted */
    const CTcToken *get_enclosing_curtok() const
        { return &enclosing_curtok_; }

protected:
    /* the enclosing external token source */
    CTcTokenSource *enclosing_src_;

    /*
     *   the current token in effect enclosing this source - this is the
     *   token that comes immediately after the source's tokens, because
     *   a source is inserted before the current token
     */
    CTcToken enclosing_curtok_;
};
/* ------------------------------------------------------------------------ */
/*
 * Newline spacing modes. The newline spacing mode controls how line
 * breaks are handled within strings.
 *
 * The enumerators carry explicit numeric values (0, 1, 2), so the
 * numbering does not depend on the order in which they are listed.
 */
enum newline_spacing_mode_t
{
/* delete: a newline and immediately following whitespace are deleted */
NEWLINE_SPACING_DELETE = 0,
/*
 * collapse: a newline and immediately following whitespace are
 * replaced with a single space character
 */
NEWLINE_SPACING_COLLAPSE = 1,
/*
 * preserve: newlines and subsequent whitespace are preserved exactly
 * as written in the source code
 */
NEWLINE_SPACING_PRESERVE = 2
};
/* ------------------------------------------------------------------------ */
/*
* Tokenizer. This object reads a file and constructs a representation
* of the file as a token list in memory. The tokenizer interprets
* preprocessor directives and expands macros.
*/
class CTcTokenizer
{
friend class CTcHashEntryPpDefine;
public:
/*
* Create the tokenizer and start reading from the given file. The
* default character set is generally specified by the user (on the
* compiler command line, for example), or obtained from the
* operating system.
*/
CTcTokenizer(class CResLoader *res_loader, const char *default_charset);
/* destroy the tokenizer */
~CTcTokenizer();
/*
* Reset the tokenizer. Deletes the current source object and all
* saved token text. This can be used after compilation of a unit
* is completed and the intermediate parser state can be completely
* discarded.
*/
void reset();