-
Notifications
You must be signed in to change notification settings - Fork 2
/
Camltoken.ml
587 lines (508 loc) · 19.2 KB
/
Camltoken.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
(****************************************************************************)
(* *)
(* Objective Caml *)
(* *)
(* INRIA Rocquencourt *)
(* *)
(* Copyright 2006-2010 Institut National de Recherche en Informatique et *)
(* en Automatique. All rights reserved. This file is distributed under *)
(* the terms of the GNU Library General Public License, with the special *)
(* exception on linking described in LICENSE at the top of the Objective *)
(* Caml source tree. *)
(* *)
(****************************************************************************)
(* Authors:
* - Nicolas Pouillard: initial version
*)
type quotation =
{ q_name : string ;
q_loc : string ;
q_contents : string }
type blanks = string
type comment = string
type caml_token =
| LIDENT of string
| UIDENT of string
| SYMBOL of string
| INT of int * string
| INT32 of int32 * string
| INT64 of int64 * string
| NATIVEINT of nativeint * string
| FLOAT of float * string
| CHAR of char * string
| STRING of string * string
| LABEL of string
| OPTLABEL of string
| QUOTATION of quotation
| ANTIQUOT of string * string
| COMMENT of comment
| BLANKS of blanks
| NEWLINE of newline
| LINE_DIRECTIVE of line_directive
| WARNING of warning
| ERROR of string * error
and newline = LF | CR | CRLF
and warning =
| Comment_start
| Comment_not_end
| Illegal_escape_in_string of string * int
and error =
| Illegal_character of char
| Illegal_escape_in_character of string
| Unterminated of (Lexing.position * unterminated) list
| Literal_overflow of string
and unterminated =
| Ucomment
| Ustring
| Uquotation
| Uantiquot
and line_directive = {
l_blanks1 : blanks;
l_zeros : int;
l_linenum : int;
l_blanks2 : blanks;
l_filename : string option;
l_comment : comment;
l_newline : newline
}
let sf = Printf.sprintf
let string_of_unterminated =
function
| Ucomment -> "comment"
| Ustring -> "string literal"
| Uquotation -> "quotation"
| Uantiquot -> "antiquotation"
let string_of_unterminated_list us =
String.concat " of " (List.map string_of_unterminated us)
let string_of_error =
function
| Illegal_character c ->
sf "Illegal character (%s)" (Char.escaped c)
| Illegal_escape_in_character s ->
sf "Illegal backslash escape in character literal (%s)" s
| Unterminated [(_,u)] ->
sf "Unterminated %s" (string_of_unterminated u)
| Unterminated ((_,u) :: us) ->
let us = List.map snd us in
sf "This %s contains an unterminated %s" (string_of_unterminated_list us) (string_of_unterminated u)
| Unterminated [] ->
assert false
| Literal_overflow ty ->
sf "Integer literal exceeds the range of representable integers of type %s" ty
let show_unterminated =
function
| Ucomment -> ["Ucomment"]
| Ustring -> ["Ustring"]
| Uquotation -> ["Uquotation"]
| Uantiquot -> ["Uantiquot"]
let show_position { Lexing.pos_fname=fp; pos_lnum=ln; pos_bol=bol; pos_cnum=cn } =
[fp; string_of_int ln; string_of_int bol; string_of_int cn]
let show2 s1 s2 (x, y) = s1 x @ s2 y
(* meant to be used by show_token/strings_of_token *)
let show_error =
function
| Illegal_character _ ->
("Illegal_character", [])
| Illegal_escape_in_character s ->
("Illegal_escape_in_character", [s])
| Unterminated us ->
("Unterminated", List.concat (List.map (show2 show_position show_unterminated) us))
| Literal_overflow ty ->
("Literal_overflow", [ty])
let string_of_quotation {q_name=n; q_loc=l; q_contents=s} =
let locname = if l = "" then "" else sf "@%s" l in
if n = "" then sf "<%s<%s>>" locname s
else sf "<:%s%s<%s>>" n locname s
(* spec quotation_width = String.length <.> string_of_quotation *)
let quotation_width {q_name=n; q_loc=l; q_contents=s} =
let locname_width = if l = "" then 0 else String.length l + 1 in
if n = "" then String.length s + locname_width + 4
else String.length s + String.length n + locname_width + 5
let for_all p s =
let len = String.length s in
let rec loop i = i >= len || p s.[i] && loop (i+1)
in loop 0
let (<$>) f = function
| Some x -> Some (f x)
| None -> None
let blank c = List.mem c [' '; '\t'; '\012']
let newline c = List.mem c ['\r'; '\n']
let blanks = for_all blank
let no_newline = for_all (fun c -> not (newline c))
let string_of_newline = function
| LF -> "\n"
| CR -> "\r"
| CRLF -> "\r\n"
(* spec = String.length <.> string_of_newline *)
let newline_width = function
| LF | CR -> 1
| CRLF -> 2
let strings_of_warning = function
| Comment_start -> ["Comment_start"]
| Comment_not_end -> ["Comment_not_end"]
| Illegal_escape_in_string(s, i) ->
["Illegal_escape_in_string"; s; string_of_int i]
let message_of_warning = function
| Comment_start -> "this is the start of a comment"
| Comment_not_end -> "this is not the end of a comment"
| Illegal_escape_in_string(s,_) -> sf "Illegal backslash escape in string (\\%s)" s
(* spec String.length <.> string_of_int *)
let int_width =
let rec loop acc x =
let q = x / 10 in
if q > 0 then loop (1 + acc) q else acc
in loop 0
let string_of_line_directive x =
assert (blanks x.l_blanks1);
assert (blanks x.l_blanks2);
assert (x.l_zeros >= 0);
assert (no_newline x.l_comment);
let nl = string_of_newline x.l_newline in
let zeros = String.make x.l_zeros '0' in
match x.l_filename with
| Some s ->
sf "#%s%s%d%s\"%s\"%s%s" x.l_blanks1 zeros x.l_linenum x.l_blanks2 s x.l_comment nl
| None ->
sf "#%s%s%d%s%s%s" x.l_blanks1 zeros x.l_linenum x.l_blanks2 x.l_comment nl
let line_directive_width x =
let nl = newline_width x.l_newline in
let fn_w =
match x.l_filename with
| Some s -> String.length s + 2
| None -> 0
in
String.length x.l_blanks1 + x.l_zeros + int_width x.l_linenum + String.length x.l_blanks2 + fn_w + String.length x.l_comment + nl + 1
let string_of_token = function
SYMBOL s |
LIDENT s |
UIDENT s |
COMMENT s |
INT (_, s) |
FLOAT (_, s) |
ERROR (s, _) |
BLANKS s -> s
| LABEL s -> sf "~%s:" s
| OPTLABEL s -> sf "?%s:" s
| INT32 (_, s) -> sf "%sl" s
| INT64 (_, s) -> sf "%sL" s
| NATIVEINT (_, s) -> sf "%sn" s
(* 's' is already properly escaped *)
| CHAR (_, s) -> sf "'%s'" s
(* 's' is already properly escaped *)
| STRING (_, s) -> sf "\"%s\"" s
| ANTIQUOT ("", c) -> sf "$%s$" c
| ANTIQUOT (n, c) -> sf "$%s:%s$" n c
| QUOTATION q -> string_of_quotation q
| NEWLINE nl -> string_of_newline nl
| WARNING _ -> ""
| LINE_DIRECTIVE ld -> string_of_line_directive ld
(*
The spec is token_width = String.length <.> string_of_token, that is
the kind of code where the compiler could deduce the right code from
the spec.
*)
let token_width = function
SYMBOL s |
LIDENT s |
UIDENT s |
COMMENT s |
INT (_, s) |
FLOAT (_, s) |
ERROR (s, _) |
BLANKS s -> String.length s
| LABEL s |
OPTLABEL s |
CHAR (_, s) |
STRING (_, s) |
ANTIQUOT ("", s) -> String.length s + 2
| INT32 (_, s) |
INT64 (_, s) |
NATIVEINT (_, s) -> String.length s + 1
| ANTIQUOT (n, c) -> String.length n + String.length c + 3
| QUOTATION q -> quotation_width q
| NEWLINE nl -> newline_width nl
| WARNING _ -> 0
| LINE_DIRECTIVE ld -> line_directive_width ld
let strings_of_token = function
| SYMBOL s -> ("SYMBOL", [s])
| LIDENT s -> ("LIDENT", [s])
| UIDENT s -> ("UIDENT", [s])
| INT(_, s) -> ("INT", [s])
| INT32(_, s) -> ("INT32", [s])
| INT64(_, s) -> ("INT64", [s])
| NATIVEINT(_, s) -> ("NATIVEINT", [s])
| FLOAT(_, s) -> ("FLOAT", [s])
| CHAR(_, s) -> ("CHAR", [s])
| STRING(_, s) -> ("STRING", [s]) (* here we give the source string *)
| LABEL s -> ("LABEL", [s])
| OPTLABEL s -> ("OPTLABEL", [s])
| ANTIQUOT(n, s) -> ("ANTIQUOT", [n; s])
| QUOTATION x -> ("QUOTATION", [x.q_name; x.q_loc; x.q_contents])
| COMMENT s -> ("COMMENT", [s])
| BLANKS s -> ("BLANKS", [s])
| NEWLINE nl -> ("NEWLINE", [string_of_newline nl])
| WARNING w -> ("WARNING", strings_of_warning w)
| ERROR (tok, err) -> ("ERROR", tok :: let (x,xs) = show_error err in x :: xs)
| LINE_DIRECTIVE{l_blanks1=bl1;l_zeros=zeros;l_linenum=i;l_blanks2=bl2;
l_filename=sopt;l_comment=com;l_newline=nl} ->
let nl = string_of_newline nl in
match sopt with
| None -> ("LINE_DIRECTIVE", [bl1; string_of_int zeros; string_of_int i; bl2; com; nl])
| Some s -> ("LINE_DIRECTIVE", [bl1; string_of_int zeros; string_of_int i; bl2; s; com; nl])
let show_token t =
let (name, args) = strings_of_token t in
if args = [] then name else
sf "%s \"%s\"" name (String.concat "\" \"" (List.map String.escaped args))
module Eval = struct
exception Backtrack of int
let peek s i = if i < String.length s then Some s.[i] else None
let get s i =
let len = String.length s in
if i < len then s.[i] else raise (Backtrack (len - 1))
let rec skip_indent s i =
match peek s i with
| Some (' ' | '\t') -> skip_indent s (i+1)
| _ -> i
let skip_opt_linefeed s i =
match peek s i with
| Some '\n' -> i + 1
| _ -> i
let get_dec_digit s i =
match get s i with
| '0'..'9' as c -> Char.code c - 48
| _ -> raise (Backtrack i)
let get_hex_digit s i =
match get s i with
| '0'..'9' | 'a'..'f' | 'A'..'F' as c ->
let d = Char.code c in
if d >= 97 then d - 87
else if d >= 65 then d - 55
else d - 48
| _ -> raise (Backtrack i)
let escaped_char s i = match get s i with
| '\n' -> '\n', i + 1
| '\r' -> '\r', i + 1
| 'n' -> '\n', i + 1
| 'r' -> '\r', i + 1
| 't' -> '\t', i + 1
| 'b' -> '\b', i + 1
| '\\' -> '\\', i + 1
| '"' -> '"', i + 1
| '\'' -> '\'', i + 1
| ' ' -> ' ', i + 1
| '0'..'9' as c ->
let d1 = Char.code c - 48 in
let d2 = get_dec_digit s (i + 1) in
let d3 = get_dec_digit s (i + 2) in
let c = 100 * d1 + 10 * d2 + d3 in
if c < 0 || c > 255 then raise (Backtrack (i+2)) else (Char.chr c, i + 3)
| 'x' ->
let d1 = get_hex_digit s (i + 1) in
let d2 = get_hex_digit s (i + 2) in
Char.chr (16 * d1 + d2), i + 3
| _ -> raise (Backtrack i)
let eof s i = if peek s i <> None then raise (Backtrack (i + 1))
let char s =
match String.length s with
| 1 -> s.[0]
| 0 -> failwith "invalid char token"
| _ when s = "\r\n" -> '\n'
| _ when s.[0] <> '\\' -> failwith "invalid char token"
| _ -> try let (c, i) = escaped_char s 1 in
eof s i; c
with Backtrack _ -> failwith "invalid char token"
let backslash_in_string err store s i =
match peek s i with
| Some '\n' -> skip_indent s (i+1)
| Some '\r' -> skip_indent s (skip_opt_linefeed s (i+1))
| Some c ->
begin try let (x, i) = escaped_char s i in store x; i
with Backtrack j -> (store '\\'; store c; err j; i + 1)
end
| None -> err i; store '\\'; i
let string s =
let buf = Buffer.create 23 in
let store = Buffer.add_char buf in
let errors = ref [] in
let err i j = errors := (i, j - i + 1) :: !errors in
let rec parse i =
match peek s i with
| Some '\\' -> parse (backslash_in_string (err (i+1)) store s (i+1))
| Some c -> store c; parse (i+1)
| None -> Buffer.contents buf
in
let s = parse 0 in
s, List.rev !errors
end
let eval_char = Eval.char
let eval_string = Eval.string
(* To convert integer literals, allowing max_int + 1 (PR#4210) *)
let cvt_int_literal s =
- int_of_string ("-" ^ s)
let cvt_int32_literal s =
Int32.neg (Int32.of_string ("-" ^ String.sub s 0 (String.length s - 1)))
let cvt_int64_literal s =
Int64.neg (Int64.of_string ("-" ^ String.sub s 0 (String.length s - 1)))
let cvt_nativeint_literal s =
Nativeint.neg (Nativeint.of_string ("-" ^ String.sub s 0 (String.length s - 1)))
let mkWARNING w = WARNING w
let mkERROR s e = ERROR (s, e)
let literal_overflow tok ty = mkERROR tok (Literal_overflow ty)
let illegal_escape_in_character tok s = mkERROR tok (Illegal_escape_in_character s)
let illegal_escape_in_string s (ofs, len) =
let sub = try String.sub s ofs len with Invalid_argument _ -> assert false in
mkWARNING (Illegal_escape_in_string(sub, ofs))
let mkCHAR s = try CHAR(eval_char s, s)
with Failure _ -> illegal_escape_in_character (sf "'%s'" s) s
let mkSTRING s = let s', errs = eval_string s in
STRING(s', s), List.map (illegal_escape_in_string s) errs
let mkINT s = try INT(cvt_int_literal s, s)
with Failure _ -> literal_overflow s "int"
let mkINT32 s = try INT32(cvt_int32_literal s, s)
with Failure _ -> literal_overflow (s^"l") "int32"
let mkINT64 s = try INT64(cvt_int64_literal s, s)
with Failure _ -> literal_overflow (s^"L")"int64"
let mkNATIVEINT s = try NATIVEINT(cvt_nativeint_literal s, s)
with Failure _ -> literal_overflow (s^"n") "nativeint"
let mkFLOAT s = try FLOAT(float_of_string s, s)
with Failure _ -> literal_overflow s "float"
(* TODO add some assertions? *)
let mkLIDENT s = LIDENT s
let mkUIDENT s = UIDENT s
let mkSYMBOL s = SYMBOL s
let mkLABEL s = LABEL s
let mkOPTLABEL s = OPTLABEL s
let mkQUOTATION q = QUOTATION q
let mkANTIQUOT ?(name="") s = ANTIQUOT (name, s)
let mkCOMMENT com = COMMENT com
let mkBLANKS s = BLANKS s
let mkNEWLINE nl = NEWLINE nl
let mkLINE_DIRECTIVE ld = LINE_DIRECTIVE ld
let newline_of_string = function
| "\n" -> LF
| "\r" -> CR
| "\r\n" -> CRLF
| _ -> invalid_arg "newline_of_string"
(* not exported *)
let mkLINE_DIRECTIVE' ?(bl1="") ?(bl2="") ?(zeros="0") ?s ?(com="") ?(nl="\n") i =
assert (blanks bl1);
assert (blanks bl2);
assert (no_newline com);
let zeros = int_of_string zeros in
let nl = match nl with
| "\n" -> LF
| "\r" -> CR
| "\r\n" -> CRLF
| _ -> assert false
in
LINE_DIRECTIVE{l_blanks1=bl1;l_zeros=zeros;l_linenum=int_of_string i;
l_blanks2=bl2;l_filename=s;l_comment=com;l_newline=nl}
module ParseLib = struct
exception ParseFailure
let pure x input = (x, input)
let fail _input = raise ParseFailure
let parse_elt = function
| x :: xs -> (x, xs)
| input -> fail input
let pmap f p input =
let (x, input) = p input in
(f x, input)
let pmapf f p input =
let (x, input) = p input in
match f x with
| Some r -> (r, input)
| None -> fail input
let (<*>) pf px input =
let (f, input) = pf input in
let (x, input) = px input in
(f x, input)
let parse_int =
let safe_int_of_string x =
try Some (int_of_string x)
with Failure _ -> None
in
pmapf safe_int_of_string parse_elt
let try_parse p input =
match try Some (p input) with ParseFailure -> None with
| Some (x, input) -> (Some x, input)
| None -> (None, input)
let rec parse_list p input =
let (xopt, input) = try_parse p input in
match xopt with
| Some x -> pmap (fun xs -> x :: xs) (parse_list p) input
| None -> ([], input)
let parse_full p input =
let (x, input) = try_parse p input in
if input = [] then x else None
end
open ParseLib
let parse_unterminated =
let f = function
| "Ucomment" -> Some Ucomment
| "Ustring" -> Some Ustring
| "Uquotation" -> Some Uquotation
| "Uantiquot" -> Some Uantiquot
| _ -> None
in
pmapf f parse_elt
let parse_position =
let mk_position fp ln bol cn =
{ Lexing.pos_fname=fp; pos_lnum=ln; pos_bol=bol; pos_cnum=cn }
in
pmap mk_position parse_elt <*> parse_int <*> parse_int <*> parse_int
let parse_position_unterminated =
pmap (fun x y -> x,y) parse_position <*> parse_unterminated
let parse_position_unterminated_list =
parse_list parse_position_unterminated
let warning_of_strings = function
| ["Comment_start"] -> Some Comment_start
| ["Comment_not_end"] -> Some Comment_not_end
| ["Illegal_escape_in_string"; s; si] ->
Some (Illegal_escape_in_string(s, int_of_string si))
| _ -> None
let token_of_strings = function
| "SYMBOL", [s] -> Some (SYMBOL s)
| "LIDENT", [s] -> Some (LIDENT s)
| "UIDENT", [s] -> Some (UIDENT s)
| "INT", [s] -> Some (mkINT s)
| "INT32", [s] -> Some (mkINT32 s)
| "INT64", [s] -> Some (mkINT64 s)
| "NATIVEINT", [s] -> Some (mkNATIVEINT s)
| "FLOAT", [s] -> Some (mkFLOAT s)
| "CHAR", [s] -> Some (mkCHAR s)
| "STRING", [s] -> Some (fst (mkSTRING s))
| "LABEL", [s] -> Some (LABEL s)
| "OPTLABEL", [s] -> Some (OPTLABEL s)
| "ANTIQUOT", [n; s] -> Some (ANTIQUOT(n, s))
| "QUOTATION", [n;l;c] -> Some (QUOTATION{q_name=n;q_loc=l;q_contents=c})
| "COMMENT", [s] -> Some (COMMENT s)
| "BLANKS", [s] -> Some (BLANKS s)
| "NEWLINE", [] -> Some (NEWLINE LF) (* lax in the input *)
| "NEWLINE", ["\n"] -> Some (NEWLINE LF)
| "NEWLINE", ["\r"] -> Some (NEWLINE CR)
| "NEWLINE", ["\r\n"] -> Some (NEWLINE CRLF)
| "WARNING", xs -> mkWARNING <$> warning_of_strings xs
| "ERROR", (tok :: x :: xs) ->
(* Don't you see this code crying for the option monad? *)
let mk err = Some (ERROR (tok, err)) in
begin match x, xs with
| "Illegal_character", [] -> assert (tok <> ""); mk (Illegal_character tok.[0])
| "Illegal_escape_in_character", [s] -> mk (Illegal_escape_in_character s)
| "Literal_overflow", [t] -> mk (Literal_overflow t)
| "Unterminated", us ->
begin match parse_full parse_position_unterminated_list us with
| Some x -> mk (Unterminated x)
| None -> None
end
| _ -> None
end
| "LINE_DIRECTIVE", xs ->
begin match xs with
(* One are a bit lax in the input *)
| [i] -> Some (mkLINE_DIRECTIVE' i)
| [i; s] -> Some (mkLINE_DIRECTIVE' ~s i)
| [bl1;zeros;i;bl2;com;nl] -> Some (mkLINE_DIRECTIVE' ~bl1 ~zeros ~bl2 ~com ~nl i)
| [bl1;zeros;i;bl2;s;com;nl] -> Some (mkLINE_DIRECTIVE' ~bl1 ~zeros ~bl2 ~s ~com ~nl i)
| _ -> None
end
| _ -> None