AbsInt · xavierleroy · May 19, 2020 · May 2, 2020
diff --git a/exportclight/Clightdefs.v b/exportclight/Clightdefs.v
@@ -15,7 +15,7 @@
 
 (** All imports and definitions used by .v Clight files generated by clightgen *)
 
-From Coq Require Import String List ZArith.
+From Coq Require Import Ascii String List ZArith.
 From compcert Require Import Integers Floats Maps Errors AST Ctypes Cop Clight.
 
 Definition tvoid := Tvoid.
@@ -80,3 +80,102 @@ Definition mkprogram (types: list composite_definition)
      prog_types := types;
      prog_comp_env := ce;
      prog_comp_env_eq := EQ |}.
+
+(** The following encoding of character strings as positive numbers
+    must be kept consistent with the OCaml function [Camlcoq.pos_of_string]. *)
+
+Definition append_bit_pos (b: bool) (p: positive) : positive :=
+  if b then xI p else xO p.
+
+Definition append_char_pos_default (c: ascii) (p: positive) : positive :=
+  let '(Ascii b7 b6 b5 b4 b3 b2 b1 b0) := c in
+  xI (xI (xI (xI (xI (xI
+    (append_bit_pos b0 (append_bit_pos b1
+    (append_bit_pos b2 (append_bit_pos b3
+    (append_bit_pos b4 (append_bit_pos b5
+    (append_bit_pos b6 (append_bit_pos b7 p))))))))))))).
+
+Definition append_char_pos (c: ascii) (p: positive) : positive :=
+  match c with
+  | "0"%char => xO (xO (xO (xO (xO (xO p)))))
+  | "1"%char => xI (xO (xO (xO (xO (xO p)))))
+  | "2"%char => xO (xI (xO (xO (xO (xO p)))))
+  | "3"%char => xI (xI (xO (xO (xO (xO p)))))
+  | "4"%char => xO (xO (xI (xO (xO (xO p)))))
+  | "5"%char => xI (xO (xI (xO (xO (xO p)))))
+  | "6"%char => xO (xI (xI (xO (xO (xO p)))))
+  | "7"%char => xI (xI (xI (xO (xO (xO p)))))
+  | "8"%char => xO (xO (xO (xI (xO (xO p)))))
+  | "9"%char => xI (xO (xO (xI (xO (xO p)))))
+  | "a"%char => xO (xI (xO (xI (xO (xO p)))))
+  | "b"%char => xI (xI (xO (xI (xO (xO p)))))
+  | "c"%char => xO (xO (xI (xI (xO (xO p)))))
+  | "d"%char => xI (xO (xI (xI (xO (xO p)))))
+  | "e"%char => xO (xI (xI (xI (xO (xO p)))))
+  | "f"%char => xI (xI (xI (xI (xO (xO p)))))
+  | "g"%char => xO (xO (xO (xO (xI (xO p)))))
+  | "h"%char => xI (xO (xO (xO (xI (xO p)))))
+  | "i"%char => xO (xI (xO (xO (xI (xO p)))))
+  | "j"%char => xI (xI (xO (xO (xI (xO p)))))
+  | "k"%char => xO (xO (xI (xO (xI (xO p)))))
+  | "l"%char => xI (xO (xI (xO (xI (xO p)))))
+  | "m"%char => xO (xI (xI (xO (xI (xO p)))))
+  | "n"%char => xI (xI (xI (xO (xI (xO p)))))
+  | "o"%char => xO (xO (xO (xI (xI (xO p)))))
+  | "p"%char => xI (xO (xO (xI (xI (xO p)))))
+  | "q"%char => xO (xI (xO (xI (xI (xO p)))))
+  | "r"%char => xI (xI (xO (xI (xI (xO p)))))
+  | "s"%char => xO (xO (xI (xI (xI (xO p)))))
+  | "t"%char => xI (xO (xI (xI (xI (xO p)))))
+  | "u"%char => xO (xI (xI (xI (xI (xO p)))))
+  | "v"%char => xI (xI (xI (xI (xI (xO p)))))
+  | "w"%char => xO (xO (xO (xO (xO (xI p)))))
+  | "x"%char => xI (xO (xO (xO (xO (xI p)))))
+  | "y"%char => xO (xI (xO (xO (xO (xI p)))))
+  | "z"%char => xI (xI (xO (xO (xO (xI p)))))
+  | "A"%char => xO (xO (xI (xO (xO (xI p)))))
+  | "B"%char => xI (xO (xI (xO (xO (xI p)))))
+  | "C"%char => xO (xI (xI (xO (xO (xI p)))))
+  | "D"%char => xI (xI (xI (xO (xO (xI p)))))
+  | "E"%char => xO (xO (xO (xI (xO (xI p)))))
+  | "F"%char => xI (xO (xO (xI (xO (xI p)))))
+  | "G"%char => xO (xI (xO (xI (xO (xI p)))))
+  | "H"%char => xI (xI (xO (xI (xO (xI p)))))
+  | "I"%char => xO (xO (xI (xI (xO (xI p)))))
+  | "J"%char => xI (xO (xI (xI (xO (xI p)))))
+  | "K"%char => xO (xI (xI (xI (xO (xI p)))))
+  | "L"%char => xI (xI (xI (xI (xO (xI p)))))
+  | "M"%char => xO (xO (xO (xO (xI (xI p)))))
+  | "N"%char => xI (xO (xO (xO (xI (xI p)))))
+  | "O"%char => xO (xI (xO (xO (xI (xI p)))))
+  | "P"%char => xI (xI (xO (xO (xI (xI p)))))
+  | "Q"%char => xO (xO (xI (xO (xI (xI p)))))
+  | "R"%char => xI (xO (xI (xO (xI (xI p)))))
+  | "S"%char => xO (xI (xI (xO (xI (xI p)))))
+  | "T"%char => xI (xI (xI (xO (xI (xI p)))))
+  | "U"%char => xO (xO (xO (xI (xI (xI p)))))
+  | "V"%char => xI (xO (xO (xI (xI (xI p)))))
+  | "W"%char => xO (xI (xO (xI (xI (xI p)))))
+  | "X"%char => xI (xI (xO (xI (xI (xI p)))))
+  | "Y"%char => xO (xO (xI (xI (xI (xI p)))))
+  | "Z"%char => xI (xO (xI (xI (xI (xI p)))))
+  | "_"%char => xO (xI (xI (xI (xI (xI p)))))
+  | _ => append_char_pos_default c p
+  end.
+
+Fixpoint ident_of_string (s: string) : ident :=
+  match s with
+  | EmptyString => xH
+  | String c s => append_char_pos c (ident_of_string s)
+  end.
+
+(** A convenient notation [$ "ident"] to force evaluation of
+    [ident_of_string "ident"] *)
+
+Ltac ident_of_string s :=
+  let x := constr:(ident_of_string s) in
+  let y := eval compute in x in
+  exact y.
+
+Notation "$ s" := (ltac:(ident_of_string s))
+                  (at level 1, only parsing) : string_scope.
diff --git a/exportclight/Clightgen.ml b/exportclight/Clightgen.ml
@@ -98,6 +98,8 @@ Recognized source files:
   .i or .p       C source file that should not be preprocessed
 Processing options:
   -normalize     Normalize the generated Clight code w.r.t. loads in expressions
+  -canonical-idents  Use canonical numbers to represent identifiers  (default)
+  -short-idents  Use canonical numbers to represent identifiers
   -E             Preprocess only, send result to standard output
   -o <file>      Generate output in <file>
 |} ^
@@ -142,6 +144,8 @@ let cmdline_actions =
 (* Processing options *)
  [ Exact "-E", Set option_E;
   Exact "-normalize", Set option_normalize;
+  Exact "-canonical-idents", Set Camlcoq.use_canonical_atoms;
+  Exact "-short-idents", Unset Camlcoq.use_canonical_atoms;
   Exact "-o", String(fun s -> option_o := Some s);
   Prefix "-o", Self (fun s -> let s = String.sub s 2 ((String.length s) - 2) in
                               option_o := Some s);]
@@ -175,20 +179,21 @@ let cmdline_actions =
   ]
 
 let _ =
-  try
+try
   Gc.set { (Gc.get()) with
               Gc.minor_heap_size = 524288; (* 512k *)
               Gc.major_heap_increment = 4194304 (* 4M *)
          };
   Printexc.record_backtrace true;
+  Camlcoq.use_canonical_atoms := true;
   Frontend.init ();
   parse_cmdline cmdline_actions;
   if !option_o <> None && !num_input_files >= 2 then
     fatal_error no_loc "Ambiguous '-o' option (multiple source files)";
   if !num_input_files = 0 then
     fatal_error no_loc "no input file";
   perform_actions ()
-      with
+with
   | Sys_error msg
   | CmdError msg -> error no_loc "%s" msg; exit 2
   | Abort -> exit 2

diff --git a/exportclight/ExportClight.ml b/exportclight/ExportClight.ml
@@ -81,8 +81,12 @@ let define_idents p =
     string_of_atom
     (fun (id, name) ->
       try
-        fprintf p "Definition _%s : ident := %ld%%positive.@ "
-                  (sanitize name) (P.to_int32 id)
+        if id = pos_of_string name then
+          fprintf p "Definition _%s : ident := $\"%s\".@ "
+                    (sanitize name) name
+        else
+          fprintf p "Definition _%s : ident := %ld%%positive.@ "
+                    (sanitize name) (P.to_int32 id)
       with Not_an_identifier ->
         ());
   iter_hashtbl_sorted
@@ -93,9 +97,11 @@ let define_idents p =
   fprintf p "@ "
 
 let name_temporary t =
-  let t1 = P.to_int t and t0 = P.to_int (first_unused_ident ()) in
-  if t1 >= t0 && not (Hashtbl.mem temp_names t)
-  then Hashtbl.add temp_names t (sprintf "_t'%d" (t1 - t0 + 1))
+  if not (Hashtbl.mem string_of_atom t) && not (Hashtbl.mem temp_names t)
+  then begin
+    let t1 = P.to_int t and t0 = P.to_int (first_unused_ident ()) in
+    Hashtbl.add temp_names t (sprintf "_t'%d" (t1 - t0 + 1))
+  end
 
 let name_opt_temporary = function
   | None -> ()
@@ -468,7 +474,7 @@ let print_assertion p (txt, targs) =
      | Text _ -> ()
      | Param n -> max_param := max n !max_param)
     frags;
-  fprintf p "  | \"%s\"%%string, " txt;
+  fprintf p "  | \"%s\", " txt;
   list_iteri
     (fun i targ -> fprintf p "_x%d :: " (i + 1))
     targs;
@@ -495,7 +501,8 @@ let print_assertions p =
 let prologue = "\
 From Coq Require Import String List ZArith.\n\
 From compcert Require Import Coqlib Integers Floats AST Ctypes Cop Clight Clightdefs.\n\
-Local Open Scope Z_scope.\n"
+Local Open Scope Z_scope.\n\
+Local Open Scope string_scope.\n"
 
 (* Naming the compiler-generated temporaries occurring in the program *)
 
@@ -554,15 +561,15 @@ let name_program p =
 
 let print_clightgen_info p sourcefile normalized =
   fprintf p "@[<v 2>Module Info.";
-  fprintf p "@ Definition version := %S%%string." Version.version;
-  fprintf p "@ Definition build_number := %S%%string." Version.buildnr;
-  fprintf p "@ Definition build_tag := %S%%string." Version.tag;
-  fprintf p "@ Definition arch := %S%%string." Configuration.arch;
-  fprintf p "@ Definition model := %S%%string." Configuration.model;
-  fprintf p "@ Definition abi := %S%%string." Configuration.abi;
+  fprintf p "@ Definition version := %S." Version.version;
+  fprintf p "@ Definition build_number := %S." Version.buildnr;
+  fprintf p "@ Definition build_tag := %S." Version.tag;
+  fprintf p "@ Definition arch := %S." Configuration.arch;
+  fprintf p "@ Definition model := %S." Configuration.model;
+  fprintf p "@ Definition abi := %S." Configuration.abi;
   fprintf p "@ Definition bitsize := %d." (if Archi.ptr64 then 64 else 32);
   fprintf p "@ Definition big_endian := %B." Archi.big_endian;
-  fprintf p "@ Definition source_file := %S%%string." sourcefile;
+  fprintf p "@ Definition source_file := %S." sourcefile;
   fprintf p "@ Definition normalized := %B." normalized;
   fprintf p "@]@ End Info.@ @ "  
 

diff --git a/lib/Camlcoq.ml b/lib/Camlcoq.ml
@@ -282,23 +282,96 @@ type atom = positive
 let atom_of_string = (Hashtbl.create 17 : (string, atom) Hashtbl.t)
 let string_of_atom = (Hashtbl.create 17 : (atom, string) Hashtbl.t)
 let next_atom = ref Coq_xH
+let use_canonical_atoms = ref false
+
+(* If [use_canonical_atoms] is false, strings are numbered from 1 up
+   in the order in which they are encountered.  This produces small
+   numbers, and is therefore efficient, but the number for a given
+   string may differ between the compilation of different units.
+
+   If [use_canonical_atoms] is true, strings are Huffman-encoded as bit
+   sequences, which are then encoded as positive numbers.  The same
+   string is always represented by the same number in all compilation
+   units.  However, the numbers are bigger than in the first
+   implementation.  Also, this places a hard limit on the number of
+   fresh identifiers that can be generated starting with
+   [first_unused_ident]. *)
+
+let rec append_bits_pos nbits n p =
+  if nbits <= 0 then p else
+  if n land 1 = 0
+  then Coq_xO (append_bits_pos (nbits - 1) (n lsr 1) p)
+  else Coq_xI (append_bits_pos (nbits - 1) (n lsr 1) p)
+
+(* The encoding of strings as bit sequences is optimized for C identifiers:
+   - numbers are encoded as a 6-bit integer between 0 and 9
+   - lowercase letters are encoded as a 6-bit integer between 10 and 35
+   - uppercase letters are encoded as a 6-bit integer between 36 and 61
+   - the underscore character is encoded as the 6-bit integer 62
+   - all other characters are encoded as 6 "one" bits followed by
+     the 8-bit encoding of the character. *)
+
+let append_char_pos c p =
+  match c with
+  | '0'..'9' -> append_bits_pos 6 (Char.code c - Char.code '0') p
+  | 'a'..'z' -> append_bits_pos 6 (Char.code c - Char.code 'a' + 10) p
+  | 'A'..'Z' -> append_bits_pos 6 (Char.code c - Char.code 'A' + 36) p
+  | '_'      -> append_bits_pos 6 62 p
+  | _        -> append_bits_pos 6 63 (append_bits_pos 8 (Char.code c) p)
+
+(* The empty string is represented as the positive "1", that is, [xH]. *)
+
+let pos_of_string s =
+  let rec encode i accu =
+    if i < 0 then accu else encode (i - 1) (append_char_pos s.[i] accu)
+  in encode (String.length s - 1) Coq_xH
+
+let fresh_atom () =
+  let a = !next_atom in
+  next_atom := Pos.succ !next_atom;
+  a
 
 let intern_string s =
   try
     Hashtbl.find atom_of_string s
   with Not_found ->
-    let a = !next_atom in
-    next_atom := Pos.succ !next_atom;
+    let a =
+      if !use_canonical_atoms then pos_of_string s else fresh_atom () in
     Hashtbl.add atom_of_string s a;
     Hashtbl.add string_of_atom a s;
     a
+
 let extern_atom a =
   try
     Hashtbl.find string_of_atom a
   with Not_found ->
     Printf.sprintf "$%d" (P.to_int a)
 
-let first_unused_ident () = !next_atom
+(* Ignoring the terminating "1" bit, canonical encodings of strings can
+   be viewed as lists of bits, formed by concatenation of 6-bit fragments
+   (for letters, numbers, and underscore) and 14-bit fragments (for other
+   characters).  Hence, not all positive numbers are canonical encodings:
+   only those whose log2 is of the form [6n + 14m].
+
+   Here are the first intervals of positive numbers corresponding to strings:
+   - [1, 1] for the empty string
+   - [2^6, 2^7-1] for one "compact" character
+   - [2^12, 2^13-1] for two "compact" characters
+   - [2^14, 2^14-1] for one "escaped" character
+
+   Hence, between 2^7 and 2^12 - 1, we have 3968 consecutive positive
+   numbers that cannot be the encoding of a string.  These are the positive
+   numbers we'll use as temporaries in the SimplExpr pass if canonical
+   atoms are in use.
+
+   If short atoms are used, we just number the temporaries consecutively
+   starting one above the last generated atom.
+*)
+
+let first_unused_ident () =
+  if !use_canonical_atoms
+  then P.of_int 128
+  else !next_atom
 
 (* Strings *)