Skip to content

Add a canonical encoding of identifiers as numbers and use it in clightgen #353

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 19, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 100 additions & 1 deletion exportclight/Clightdefs.v
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

(** All imports and definitions used by .v Clight files generated by clightgen *)

From Coq Require Import String List ZArith.
From Coq Require Import Ascii String List ZArith.
From compcert Require Import Integers Floats Maps Errors AST Ctypes Cop Clight.

Definition tvoid := Tvoid.
Expand Down Expand Up @@ -80,3 +80,102 @@ Definition mkprogram (types: list composite_definition)
prog_types := types;
prog_comp_env := ce;
prog_comp_env_eq := EQ |}.

(** The following encoding of character strings as positive numbers
must be kept consistent with the OCaml function [Camlcoq.pos_of_string]. *)

Definition append_bit_pos (b: bool) (p: positive) : positive :=
if b then xI p else xO p.

Definition append_char_pos_default (c: ascii) (p: positive) : positive :=
let '(Ascii b7 b6 b5 b4 b3 b2 b1 b0) := c in
xI (xI (xI (xI (xI (xI
(append_bit_pos b0 (append_bit_pos b1
(append_bit_pos b2 (append_bit_pos b3
(append_bit_pos b4 (append_bit_pos b5
(append_bit_pos b6 (append_bit_pos b7 p))))))))))))).

Definition append_char_pos (c: ascii) (p: positive) : positive :=
match c with
| "0"%char => xO (xO (xO (xO (xO (xO p)))))
| "1"%char => xI (xO (xO (xO (xO (xO p)))))
| "2"%char => xO (xI (xO (xO (xO (xO p)))))
| "3"%char => xI (xI (xO (xO (xO (xO p)))))
| "4"%char => xO (xO (xI (xO (xO (xO p)))))
| "5"%char => xI (xO (xI (xO (xO (xO p)))))
| "6"%char => xO (xI (xI (xO (xO (xO p)))))
| "7"%char => xI (xI (xI (xO (xO (xO p)))))
| "8"%char => xO (xO (xO (xI (xO (xO p)))))
| "9"%char => xI (xO (xO (xI (xO (xO p)))))
| "a"%char => xO (xI (xO (xI (xO (xO p)))))
| "b"%char => xI (xI (xO (xI (xO (xO p)))))
| "c"%char => xO (xO (xI (xI (xO (xO p)))))
| "d"%char => xI (xO (xI (xI (xO (xO p)))))
| "e"%char => xO (xI (xI (xI (xO (xO p)))))
| "f"%char => xI (xI (xI (xI (xO (xO p)))))
| "g"%char => xO (xO (xO (xO (xI (xO p)))))
| "h"%char => xI (xO (xO (xO (xI (xO p)))))
| "i"%char => xO (xI (xO (xO (xI (xO p)))))
| "j"%char => xI (xI (xO (xO (xI (xO p)))))
| "k"%char => xO (xO (xI (xO (xI (xO p)))))
| "l"%char => xI (xO (xI (xO (xI (xO p)))))
| "m"%char => xO (xI (xI (xO (xI (xO p)))))
| "n"%char => xI (xI (xI (xO (xI (xO p)))))
| "o"%char => xO (xO (xO (xI (xI (xO p)))))
| "p"%char => xI (xO (xO (xI (xI (xO p)))))
| "q"%char => xO (xI (xO (xI (xI (xO p)))))
| "r"%char => xI (xI (xO (xI (xI (xO p)))))
| "s"%char => xO (xO (xI (xI (xI (xO p)))))
| "t"%char => xI (xO (xI (xI (xI (xO p)))))
| "u"%char => xO (xI (xI (xI (xI (xO p)))))
| "v"%char => xI (xI (xI (xI (xI (xO p)))))
| "w"%char => xO (xO (xO (xO (xO (xI p)))))
| "x"%char => xI (xO (xO (xO (xO (xI p)))))
| "y"%char => xO (xI (xO (xO (xO (xI p)))))
| "z"%char => xI (xI (xO (xO (xO (xI p)))))
| "A"%char => xO (xO (xI (xO (xO (xI p)))))
| "B"%char => xI (xO (xI (xO (xO (xI p)))))
| "C"%char => xO (xI (xI (xO (xO (xI p)))))
| "D"%char => xI (xI (xI (xO (xO (xI p)))))
| "E"%char => xO (xO (xO (xI (xO (xI p)))))
| "F"%char => xI (xO (xO (xI (xO (xI p)))))
| "G"%char => xO (xI (xO (xI (xO (xI p)))))
| "H"%char => xI (xI (xO (xI (xO (xI p)))))
| "I"%char => xO (xO (xI (xI (xO (xI p)))))
| "J"%char => xI (xO (xI (xI (xO (xI p)))))
| "K"%char => xO (xI (xI (xI (xO (xI p)))))
| "L"%char => xI (xI (xI (xI (xO (xI p)))))
| "M"%char => xO (xO (xO (xO (xI (xI p)))))
| "N"%char => xI (xO (xO (xO (xI (xI p)))))
| "O"%char => xO (xI (xO (xO (xI (xI p)))))
| "P"%char => xI (xI (xO (xO (xI (xI p)))))
| "Q"%char => xO (xO (xI (xO (xI (xI p)))))
| "R"%char => xI (xO (xI (xO (xI (xI p)))))
| "S"%char => xO (xI (xI (xO (xI (xI p)))))
| "T"%char => xI (xI (xI (xO (xI (xI p)))))
| "U"%char => xO (xO (xO (xI (xI (xI p)))))
| "V"%char => xI (xO (xO (xI (xI (xI p)))))
| "W"%char => xO (xI (xO (xI (xI (xI p)))))
| "X"%char => xI (xI (xO (xI (xI (xI p)))))
| "Y"%char => xO (xO (xI (xI (xI (xI p)))))
| "Z"%char => xI (xO (xI (xI (xI (xI p)))))
| "_"%char => xO (xI (xI (xI (xI (xI p)))))
| _ => append_char_pos_default c p
end.

Fixpoint ident_of_string (s: string) : ident :=
match s with
| EmptyString => xH
| String c s => append_char_pos c (ident_of_string s)
end.

(** A convenient notation [$ "ident"] to force evaluation of
[ident_of_string "ident"] *)

Ltac ident_of_string s :=
let x := constr:(ident_of_string s) in
let y := eval compute in x in
exact y.

Notation "$ s" := (ltac:(ident_of_string s))
(at level 1, only parsing) : string_scope.
9 changes: 7 additions & 2 deletions exportclight/Clightgen.ml
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ Recognized source files:
.i or .p C source file that should not be preprocessed
Processing options:
-normalize Normalize the generated Clight code w.r.t. loads in expressions
-canonical-idents Use canonical numbers to represent identifiers (default)
-short-idents Use canonical numbers to represent identifiers
-E Preprocess only, send result to standard output
-o <file> Generate output in <file>
|} ^
Expand Down Expand Up @@ -142,6 +144,8 @@ let cmdline_actions =
(* Processing options *)
[ Exact "-E", Set option_E;
Exact "-normalize", Set option_normalize;
Exact "-canonical-idents", Set Camlcoq.use_canonical_atoms;
Exact "-short-idents", Unset Camlcoq.use_canonical_atoms;
Exact "-o", String(fun s -> option_o := Some s);
Prefix "-o", Self (fun s -> let s = String.sub s 2 ((String.length s) - 2) in
option_o := Some s);]
Expand Down Expand Up @@ -175,20 +179,21 @@ let cmdline_actions =
]

let _ =
try
try
Gc.set { (Gc.get()) with
Gc.minor_heap_size = 524288; (* 512k *)
Gc.major_heap_increment = 4194304 (* 4M *)
};
Printexc.record_backtrace true;
Camlcoq.use_canonical_atoms := true;
Frontend.init ();
parse_cmdline cmdline_actions;
if !option_o <> None && !num_input_files >= 2 then
fatal_error no_loc "Ambiguous '-o' option (multiple source files)";
if !num_input_files = 0 then
fatal_error no_loc "no input file";
perform_actions ()
with
with
| Sys_error msg
| CmdError msg -> error no_loc "%s" msg; exit 2
| Abort -> exit 2
Expand Down
35 changes: 21 additions & 14 deletions exportclight/ExportClight.ml
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,12 @@ let define_idents p =
string_of_atom
(fun (id, name) ->
try
fprintf p "Definition _%s : ident := %ld%%positive.@ "
(sanitize name) (P.to_int32 id)
if id = pos_of_string name then
fprintf p "Definition _%s : ident := $\"%s\".@ "
(sanitize name) name
else
fprintf p "Definition _%s : ident := %ld%%positive.@ "
(sanitize name) (P.to_int32 id)
with Not_an_identifier ->
());
iter_hashtbl_sorted
Expand All @@ -93,9 +97,11 @@ let define_idents p =
fprintf p "@ "

let name_temporary t =
let t1 = P.to_int t and t0 = P.to_int (first_unused_ident ()) in
if t1 >= t0 && not (Hashtbl.mem temp_names t)
then Hashtbl.add temp_names t (sprintf "_t'%d" (t1 - t0 + 1))
if not (Hashtbl.mem string_of_atom t) && not (Hashtbl.mem temp_names t)
then begin
let t1 = P.to_int t and t0 = P.to_int (first_unused_ident ()) in
Hashtbl.add temp_names t (sprintf "_t'%d" (t1 - t0 + 1))
end

let name_opt_temporary = function
| None -> ()
Expand Down Expand Up @@ -468,7 +474,7 @@ let print_assertion p (txt, targs) =
| Text _ -> ()
| Param n -> max_param := max n !max_param)
frags;
fprintf p " | \"%s\"%%string, " txt;
fprintf p " | \"%s\", " txt;
list_iteri
(fun i targ -> fprintf p "_x%d :: " (i + 1))
targs;
Expand All @@ -495,7 +501,8 @@ let print_assertions p =
let prologue = "\
From Coq Require Import String List ZArith.\n\
From compcert Require Import Coqlib Integers Floats AST Ctypes Cop Clight Clightdefs.\n\
Local Open Scope Z_scope.\n"
Local Open Scope Z_scope.\n\
Local Open Scope string_scope.\n"

(* Naming the compiler-generated temporaries occurring in the program *)

Expand Down Expand Up @@ -554,15 +561,15 @@ let name_program p =

let print_clightgen_info p sourcefile normalized =
fprintf p "@[<v 2>Module Info.";
fprintf p "@ Definition version := %S%%string." Version.version;
fprintf p "@ Definition build_number := %S%%string." Version.buildnr;
fprintf p "@ Definition build_tag := %S%%string." Version.tag;
fprintf p "@ Definition arch := %S%%string." Configuration.arch;
fprintf p "@ Definition model := %S%%string." Configuration.model;
fprintf p "@ Definition abi := %S%%string." Configuration.abi;
fprintf p "@ Definition version := %S." Version.version;
fprintf p "@ Definition build_number := %S." Version.buildnr;
fprintf p "@ Definition build_tag := %S." Version.tag;
fprintf p "@ Definition arch := %S." Configuration.arch;
fprintf p "@ Definition model := %S." Configuration.model;
fprintf p "@ Definition abi := %S." Configuration.abi;
fprintf p "@ Definition bitsize := %d." (if Archi.ptr64 then 64 else 32);
fprintf p "@ Definition big_endian := %B." Archi.big_endian;
fprintf p "@ Definition source_file := %S%%string." sourcefile;
fprintf p "@ Definition source_file := %S." sourcefile;
fprintf p "@ Definition normalized := %B." normalized;
fprintf p "@]@ End Info.@ @ "

Expand Down
79 changes: 76 additions & 3 deletions lib/Camlcoq.ml
Original file line number Diff line number Diff line change
Expand Up @@ -282,23 +282,96 @@ type atom = positive
let atom_of_string = (Hashtbl.create 17 : (string, atom) Hashtbl.t)
let string_of_atom = (Hashtbl.create 17 : (atom, string) Hashtbl.t)
let next_atom = ref Coq_xH
let use_canonical_atoms = ref false

(* If [use_canonical_atoms] is false, strings are numbered from 1 up
in the order in which they are encountered. This produces small
numbers, and is therefore efficient, but the number for a given
string may differ between the compilation of different units.

If [use_canonical_atoms] is true, strings are Huffman-encoded as bit
sequences, which are then encoded as positive numbers. The same
string is always represented by the same number in all compilation
units. However, the numbers are bigger than in the first
implementation. Also, this places a hard limit on the number of
fresh identifiers that can be generated starting with
[first_unused_ident]. *)

let rec append_bits_pos nbits n p =
if nbits <= 0 then p else
if n land 1 = 0
then Coq_xO (append_bits_pos (nbits - 1) (n lsr 1) p)
else Coq_xI (append_bits_pos (nbits - 1) (n lsr 1) p)

(* The encoding of strings as bit sequences is optimized for C identifiers:
- numbers are encoded as a 6-bit integer between 0 and 9
- lowercase letters are encoded as a 6-bit integer between 10 and 35
- uppercase letters are encoded as a 6-bit integer between 36 and 61
- the underscore character is encoded as the 6-bit integer 62
- all other characters are encoded as 6 "one" bits followed by
the 8-bit encoding of the character. *)

let append_char_pos c p =
match c with
| '0'..'9' -> append_bits_pos 6 (Char.code c - Char.code '0') p
| 'a'..'z' -> append_bits_pos 6 (Char.code c - Char.code 'a' + 10) p
| 'A'..'Z' -> append_bits_pos 6 (Char.code c - Char.code 'A' + 36) p
| '_' -> append_bits_pos 6 62 p
| _ -> append_bits_pos 6 63 (append_bits_pos 8 (Char.code c) p)

(* The empty string is represented as the positive "1", that is, [xH]. *)

let pos_of_string s =
let rec encode i accu =
if i < 0 then accu else encode (i - 1) (append_char_pos s.[i] accu)
in encode (String.length s - 1) Coq_xH

let fresh_atom () =
let a = !next_atom in
next_atom := Pos.succ !next_atom;
a

let intern_string s =
try
Hashtbl.find atom_of_string s
with Not_found ->
let a = !next_atom in
next_atom := Pos.succ !next_atom;
let a =
if !use_canonical_atoms then pos_of_string s else fresh_atom () in
Hashtbl.add atom_of_string s a;
Hashtbl.add string_of_atom a s;
a

let extern_atom a =
try
Hashtbl.find string_of_atom a
with Not_found ->
Printf.sprintf "$%d" (P.to_int a)

let first_unused_ident () = !next_atom
(* Ignoring the terminating "1" bit, canonical encodings of strings can
be viewed as lists of bits, formed by concatenation of 6-bit fragments
(for letters, numbers, and underscore) and 14-bit fragments (for other
characters). Hence, not all positive numbers are canonical encodings:
only those whose log2 is of the form [6n + 14m].

Here are the first intervals of positive numbers corresponding to strings:
- [1, 1] for the empty string
- [2^6, 2^7-1] for one "compact" character
- [2^12, 2^13-1] for two "compact" characters
- [2^14, 2^14-1] for one "escaped" character

Hence, between 2^7 and 2^12 - 1, we have 3968 consecutive positive
numbers that cannot be the encoding of a string. These are the positive
numbers we'll use as temporaries in the SimplExpr pass if canonical
atoms are in use.

If short atoms are used, we just number the temporaries consecutively
starting one above the last generated atom.
*)

let first_unused_ident () =
if !use_canonical_atoms
then P.of_int 128
else !next_atom

(* Strings *)

Expand Down