cil/doc/cil.tex at develop · gatesn/cil · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
\documentclass[letterpaper]{article}

\setlength{\pdfpagewidth}{\paperwidth}
\setlength{\pdfpageheight}{\paperheight}

\usepackage{hevea}
\usepackage{fullpage}
\usepackage{longtable}


% cilversion.tex is generated automatically to define \cilversion
\include{cil.version}

\def\secref#1{Section~\ref{sec-#1}}
\def\chref#1{Chapter~\ref{ch-#1}}

\def\apiref#1#2#3{\ahref{api/#1.html\##2#3}{#1.#3}}
\def\moduleref#1{\ahref{api/#1.html}{#1}}

% Use this to refer to a Cil type/val
\def\ciltyperef#1{\apiref{Cil}{TYPE}{#1}}
\def\cilvalref#1{\apiref{Cil}{VAL}{#1}}
\def\cilvisit#1{\apiref{Cil.cilVisitor}{#1}}
\def\cilprinter#1{\apiref{Cil.cilPrinter}{#1}}

% Use this to refer to a type/val in the Pretty module
\def\ptyperef#1{\apiref{Pretty}{TYPE}{#1}}
\def\pvalref#1{\apiref{Pretty}{VAL}{#1}}

% Use this to refer to a type/val in the Errormsg module
\def\etyperef#1{\apiref{Errormsg}{TYPE}{#1}}
\def\evalref#1{\apiref{Errormsg}{VAL}{#1}}

\def\formatcilvalref#1{\apiref{Formatcil}{VAL}{#1}}
\def\cfgref#1{\apiref{Cfg}{VAL}{#1}}


%----------------------------------------------------------------------
% MACROS

\newcommand{\hsp}{\hspace{0.5in}}
\def\t#1{{\tt #1}}
\newcommand\codecolor{\ifhevea\blue\else\fi}
\renewcommand\c[1]{{\codecolor #1}} % Use for code fragments

%%% Define an environment for code
%% Unfortunately since hevea is not quite TeX you have to use this as follows
%\begin{code}
% ...
%\end{verbatim}\end{code}
\def\code{\begingroup\codecolor\begin{verbatim}}
\def\endcode{\endgroup}

%use this for links to external pages.  It will open pages in the
%top frame.
\newcommand\ahreftop[2]{{\ahref{javascript:loadTop('#1')}{#2}}}

%----------------------------------------------------------------------

% Make sure that most documents show up in the main frame,
% and define javascript:loadTop for those links that should fill the window.
\makeatletter
\let\oldmeta=\@meta
\def\@meta{%
\oldmeta
\begin{rawhtml}
<base target="main">
<script language="JavaScript">
<!-- Begin
function loadTop(url) {
  parent.location.href= url;
}
// -->
</script>
\end{rawhtml}}
\makeatother

\begin{document}
\begin{latexonly}
\title{CIL: Infrastructure for C Program Analysis and Transformation}
\end{latexonly}
\maketitle


\section{Introduction}

CIL has a Source Forge page:
   \ahreftop{http://sourceforge.net/projects/cil}
            {http://sourceforge.net/projects/cil}.

 CIL ({\bf C} {\bf I}ntermediate {\bf L}anguage) is a high-level representation
along with a set of tools that permit easy analysis and source-to-source
transformation of C programs.

 CIL is both lower-level than abstract-syntax trees, by clarifying ambiguous
constructs and removing redundant ones, and also higher-level than typical
intermediate languages designed for compilation, by maintaining types and a
close relationship with the source program. The main advantage of CIL is that
it compiles all valid C programs into a few core constructs with a very clean
semantics. Also CIL has a syntax-directed type system that makes it easy to
analyze and manipulate C programs. Furthermore, the CIL front-end is able to
process not only ANSI-C programs but also those using Microsoft C or GNU C
extensions. If you do not use CIL and want instead to use just a C parser and
analyze programs expressed as abstract-syntax trees then your analysis will
have to handle a lot of ugly corners of the language (let alone the fact that
parsing C itself is not a trivial task). See \secref{simplec} for some
examples of such extreme programs that CIL simplifies for you.

 In essence, CIL is a highly-structured, ``clean'' subset of C. CIL features a
reduced number of syntactic and conceptual forms. For example, all looping
constructs are reduced to a single form, all function bodies are given
explicit {\tt return} statements, syntactic sugar like {\tt "->"} is
eliminated and function arguments with array types become pointers. (For an
extensive list of how CIL simplifies C programs, see \secref{cabs2cil}.)
This reduces the number of cases that must be considered when manipulating a C
program. CIL also separates type declarations from code and flattens scopes
within function bodies. This structures the program in a manner more amenable
to rapid analysis and transformation. CIL computes the types of all program
expressions, and makes all type promotions and casts explicit. CIL supports
all GCC and MSVC extensions except for nested functions and complex numbers.
Finally, CIL organizes C's imperative features into expressions, instructions
and statements based on the presence and absence of side-effects and
control-flow. Every statement can be annotated with successor and predecessor
information. Thus CIL provides an integrated program representation that can
be used with routines that require an AST (e.g. type-based analyses and
pretty-printers), as well as with routines that require a CFG (e.g., dataflow
analyses). CIL also supports even lower-level representations (e.g.,
three-address code), see \secref{Extension}.

 CIL comes accompanied by a number of Perl scripts that perform generally
useful operations on code:
\begin{itemize}
\item A \ahrefloc{sec-driver}{driver} which behaves as either the \t{gcc} or
Microsoft VC compiler and can invoke the preprocessor followed by the CIL
application. The advantage of this script is that you can easily use CIL and
the analyses written for CIL with existing make files.
\item A \ahrefloc {sec-merger}{whole-program merger} that you can use as a
replacement for your compiler and it learns all the files you compile when you
make a project and merges all of the preprocessed source files into a single
one. This makes it easy to do whole-program analysis.
\item A \ahrefloc{sec-patcher}{patcher} makes it easy to create modified
copies of the system include files. The CIL driver can then be told to use
these patched copies instead of the standard ones.
\end{itemize}

 CIL has been tested very extensively. It is able to process the SPECINT95
benchmarks, the Linux kernel, GIMP and other open-source projects. All of
these programs are compiled to the simple CIL and then passed to \t{gcc} and
they still run! We consider the compilation of Linux a major feat especially
since Linux contains many of the ugly GCC extensions (see \secref{ugly-gcc}).
This adds to about 1,000,000 lines of code that we tested it on. It is also
able to process the few Microsoft NT device drivers that we have had access
to. CIL was tested against GCC's c-torture testsuite and (except for the tests
involving complex numbers and inner functions, which CIL does not currently
implement) CIL passes most of the tests. Specifically CIL fails 23 tests out
of the 904 c-torture tests that it should pass. GCC itself fails 19 tests. A
total of 1400 regression test cases are run automatically on each change to
the CIL sources.

 CIL is relatively independent on the underlying machine and compiler. When
you build it CIL will configure itself according to the underlying compiler.
However, CIL has only been tested on Intel x86 using the gcc compiler on Linux
and cygwin and using the MS Visual C compiler. (See below for specific
versions of these compilers that we have used CIL for.)

 The largest application we have used CIL for is
\ahreftop{../ccured/index.html}{CCured}, a compiler that compiles C code into
type-safe code by analyzing your pointer usage and inserting runtime checks in
the places that cannot be guaranteed statically to be type safe.

 You can also use CIL to ``compile'' code that uses GCC extensions (e.g. the
Linux kernel) into standard C code.

 CIL also comes accompanies by a growing library of extensions (see
\secref{Extension}). You can use these for your projects or as examples of
using CIL.

\t{PDF} versions of \ahref{CIL.pdf}{this manual} and the
\ahref{CIL-API.pdf}{CIL API} are available. However, we recommend the
\t{HTML} versions because the postprocessed code examples are easier to
view.

 If you use CIL in your project, we would appreciate letting us know. If you
want to cite CIL in your research writings, please refer to the paper ``CIL:
Intermediate Language and Tools for Analysis and Transformation of C
Programs'' by George C. Necula, Scott McPeak, S.P. Rahul and Westley Weimer,
in ``Proceedings of Conference on Compilier Construction'', 2002.

\section{Installation}

 You need the following tools to build CIL:
\begin{itemize}
\item A Unix-like shell environment (with bash, perl, make, mv, cp,
  etc.). On Windows, you will need cygwin with those packages.
\item An ocaml compiler. You will need OCaml release 3.08 or higher to build
CIL. CIL has been tested on Linux and on Windows (where it can behave as
either Microsoft Visual C or gcc). On Windows, you can build CIL both with the
cygwin version of ocaml (preferred) and with the Win32 version of ocaml.
\item An underlying C compiler, which can be either gcc or Microsoft Visual C.
\end{itemize}

\begin{enumerate}
\item Get the source code.
\begin{itemize}
\item {\em Official distribution} (Recommended):
\begin{enumerate}
\item Download the CIL
    \ahref{http://sourceforge.net/projects/cil/files/cil/}{distribution}
    (latest version is\\
    \ahrefurl{http://sourceforge.net/projects/cil/files/cil/cil-\cilversion.tar.gz}).
\item Unzip and untar the source distribution. This will create a directory
      called \t{cil} whose structure is explained below. \\
      \t{~~~~tar xvfz cil-\cilversion.tar.gz}
\end{enumerate}
\item {\em Git Repository}: \\
  Alternately, you can download an up to the minute version of CIL
  from our git repository at:
\begin{verbatim}
    git clone git://git.code.sf.net/p/cil/code cil
\end{verbatim}
There is also a Github mirror:
\begin{verbatim}
    git clone git://github.com/kerneis/cil.git
\end{verbatim}
However, the Git version may be less stable than the released
version.

\end{itemize}
\item Enter the \t{cil} directory and run the \t{configure} script and then
      GNU make to build the distribution. If you are on Windows, at least the
      \t{configure} step must be run from within \t{bash}. \\
      \hsp\verb!cd cil!\\
      \hsp\verb!./configure!\\
      \hsp\verb!make!\\
      \hsp\verb!make quicktest!\\

\item You should now find \t{cilly.native.exe} in \t{bin}.
\end{enumerate}

 The \t{configure} script tries to find appropriate defaults for your system.
You can control its actions by passing the following arguments:
\begin{itemize}
\item \t{CC=foo} Specifies the path for the \t{gcc} executable. By default
whichever version is in the PATH is used. If \t{CC} specifies the Microsoft
\t{cl} compiler, then that compiler will be set as the default one. Otherwise,
the \t{gcc} compiler will be the default.
\end{itemize}

 CIL requires an underlying C compiler and preprocessor. CIL depends on the
underlying compiler and machine for the sizes and alignment of types. The
installation procedure for CIL queries the underlying compiler for
architecture and compiler dependent configuration parameters, such as the size
of a pointer or the particular alignment rules for structure fields. (This
means, of course, that you should re-run \t{./configure} when you move CIL to
another machine.)

We have tested CIL on the following compilers:

\begin{itemize}
\item On Windows, \t{cl} compiler version 12.00.8168 (MSVC 6),
  13.00.9466 (MSVC .Net), and 13.10.3077 (MSVC .Net 2003).  Run \t{cl}
  with no arguments to get the compiler version.
\item On Windows, using \t{cygwin} and \t{gcc} version 2.95.3, 3.0,
  3.2, 3.3, and 3.4.
\item On Linux, using \t{gcc} version 2.95.3, 3.0, 3.2, 3.3, 4.0, and 4.1.
\end{itemize}

Others have successfully used CIL on x86 processors with Mac OS X,
FreeBSD and OpenBSD; on amd64 processors with FreeBSD; on SPARC
processors with Solaris; and on PowerPC processors with Mac OS X.  If
you make any changes to the build system in order to run CIL on your
platform, please send us a patch.

  \subsection{Building CIL on Windows with Microsoft Visual C}

 Some users might want to build a standalone CIL executable on Windows (an
executable that does not require cygwin.dll to run). You will need cygwin for
the build process only. Here is how we do it

\begin{enumerate}
\item Start with a clean CIL directory
\item Start a command-line window setup with the environment variables for
  Microsoft Visual Studio. You can do this by choosing Programs/Microsoft
  Visual Studio/Tools/Command Prompt. Check that you can run \t{cl}.
\item Ensure that \t{ocamlc} refers to a Win32 version of ocaml. Run \t{ocamlc
  -v} and look at the path to the standard library.

\item Run \t{bash -c "./configure CC=cl"}.
\item Run \t{bash -c "make WIN32=1 quickbuild"}
\item Run \t{bash -c "make WIN32=1 NATIVECAML=1 cilly}
\item Run \t{bash -c "make WIN32=1 doc}
\item Run \t{bash -c "make WIN32=1 bindistrib-nocheck}
\end{enumerate}

 The above steps do not build the CIL library, but just the executable. The
last step will create a subdirectory \t{TEMP\_cil-bindistrib} that contains
everything that you need to run CIL on another machine. You will have to edit
manually some of the files in the \t{bin} directory to replace \t{CILHOME}.
The resulting CIL can be run with ActiveState Perl also.


  \section{Distribution Contents}

The file \ahrefurl{distrib/cil-\cilversion.tar.gz}
contains the complete source CIL distribution,
consisting of the following files:

\begin{longtable}{lp{4in}}
\emph{Filename}   & \emph{Description} \\
\endhead
\endfoot
\t{Makefile.in}                 & \t{configure} source for the
                                  Makefile that builds CIL/ \\
\t{configure}                   & The configure script. \\
\t{configure.ac}                & The \t{autoconf} source for \t{configure}. \\
\t{config.guess} 		& Stuff required by \t{configure}. \\
\t{config.sub} 			& idem \\
\t{install-sh}	 		& idem \\
\\
\t{doc/}                        & HTML documentation of the CIL API. \\
\t{\_build/}                        & Directory that will contain the compiled
                                   CIL modules and executables.\\
\t{bin/cilly.in}                & The \t{configure} source for a Perl script
                                  that can be invoked with the
                                  same arguments as either \t{gcc} or
                                  Microsoft Visual C and will convert the
                                  program to CIL, perform some simple
                                  transformations, emit it and compile it as
                                  usual. \\
\t{lib/patcher}              & A Perl script that applies specified patches
                                  to standard include files.\\
\\
\t{src/check.ml,mli}            & Checks the well-formedness of a CIL file. \\
\t{src/cil.ml,mli}              & Definition of CIL abstract syntax and
                                   utilities for manipulating it.\\
\t{src/clist.ml,mli}            & Utilities for efficiently managing lists
                                   that need to be concatenated often.\\
\t{src/errormsg.ml,mli}         & Utilities for error reporting. \\
\t{src/ext/heapify.ml}          & A CIL transformation that moves array local
                                   variables from the stack to the heap. \\
\t{src/ext/logcalls.ml,mli}     & A CIL transformation that logs every
                                   function call. \\
\t{src/ext/sfi.ml}              & A CIL transformation that can log every
                                   memory read and write. \\
\t{src/frontc/clexer.mll}       & The lexer. \\
\t{src/frontc/cparser.mly}      & The parser. \\
\t{src/frontc/cabs.ml}          & The abstract syntax. \\
\t{src/frontc/cprint.ml}        & The pretty printer for CABS. \\
\t{src/frontc/cabs2cil.ml}      & The elaborator to CIL. \\
\t{src/main.ml}                 & The \t{cilly} application. \\
\t{src/pretty.ml,mli}           & Utilities for pretty printing. \\
\t{src/rmtmps.ml,mli}           & A CIL tranformation that removes unused
                                  types, variables and inlined functions. \\
\t{src/stats.ml,mli}            & Utilities for maintaining timing statistics.
\\
\t{src/trace.ml,mli}            & Utilities useful for printing debugging
                                   information.\\
\\
\t{ocamlutil/}                  & Miscellaneous libraries that are not
                                  specific to CIL. \\
\\
\t{\_build/feature\_config.ml}  & File generated by the Makefile
                                      describing which extra ``features''
                                      to compile. See \secref{cil}. \\
\t{\_build/machdep.ml}     & File generated by the Makefile containing
                                  information about your architecture,
                                  such as the size of a pointer. \\
\t{src/machdep-ml.c}            & C program that generates
                                  \t{machdep.ml} files. \\
\end{longtable}


\section{Compiling C to CIL}\label{sec-cabs2cil}

 In this section we try to describe a few of the many transformations that are
applied to a C program to convert it to CIL. The module that implements this
conversion is about 5000 lines of OCaml code. In contrast a simple program
transformation that instruments all functions to keep a shadow stack of the
true return address (thus preventing stack smashing) is only 70 lines of code.
This example shows that the analysis is so much simpler because it has to
handle only a few simple C constructs and also because it can leverage on CIL
infrastructure such as visitors and pretty-printers.

 In no particular order these are a few of the most significant ways in which
C programs are compiled into CIL:
\begin{enumerate}
\item CIL will eliminate all declarations for unused entities. This means that
just because your hello world program includes \t{stdio.h} it does not mean
that your analysis has to handle all the ugly stuff from \t{stdio.h}.

\item Type specifiers are interpreted and normalized:
\begin{cilcode}[global]
int long signed x;
signed long extern x;
long static int long y;

// Some code that uses these declaration, so that CIL does not remove them
int main() { return x + y; }
\end{cilcode}

\item Anonymous structure and union declarations are given a name.
\begin{cilcode}[global]
 struct { int x; } s;
\end{cilcode}

\item Nested structure tag definitions are pulled apart. This means that all
structure tag definitions can be found by a simple scan of the globals.

\begin{cilcode}[global]
struct foo {
   struct bar {
      union baz {
          int x1;
          double x2;
      } u1;
      int y;
   } s1;
   int z;
} f;
\end{cilcode}

\item All structure, union, enumeration definitions and the type definitions
from inners scopes are moved to global scope (with appropriate renaming). This
facilitates moving around of the references to these entities.

\begin{cilcode}[global]
int main() {
  struct foo {
        int x; } foo;
  {
     struct foo {
        double d;
     };
     return foo.x;
  }
}
\end{cilcode}

\item Prototypes are added for those functions that are called before being
defined. Furthermore, if a prototype exists but does not specify the type of
parameters that is fixed. But CIL will not be able to add prototypes for those
functions that are neither declared nor defined (but are used!).
\begin{cilcode}[global]
  int f();  // Prototype without arguments
  int f(double x) {
      return g(x);
  }
  int g(double x) {
     return x;
  }
\end{cilcode}

\item Array lengths are computed based on the initializers or by constant
folding.
\begin{cilcode}[global]
  int a1[] = {1,2,3};
  int a2[sizeof(int) >= 4 ? 8 : 16];
\end{cilcode}

\item Enumeration tags are computed using constant folding:
\begin{cilcode}[global]
int main() {
  enum {
     FIVE = 5,
     SIX, SEVEN,
     FOUR = FIVE - 1,
     EIGHT = sizeof(double)
  } x = FIVE;
 return x;
}

\end{cilcode}

\item Initializers are normalized to include specific initialization for the
missing elements:
\begin{cilcode}[global]
  int a1[5] = {1,2,3};
  struct foo { int x, y; } s1 = { 4 };
\end{cilcode}

\item Initializer designators are interpreted and eliminated. Subobjects are
properly marked with braces. CIL implements
the whole ISO C99 specification for initializer (neither GCC nor MSVC do) and
a few GCC extensions.
\begin{cilcode}[global]
  struct foo {
     int x, y;
     int a[5];
     struct inner {
        int z;
     } inner;
  } s = { 0, .inner.z = 3, .a[1 ... 2] = 5, 4, y : 8 };
\end{cilcode}

\item String initializers for arrays of characters are processed

\begin{cilcode}[global]
char foo[] = "foo plus bar";
\end{cilcode}

\item String constants are concatenated

\begin{cilcode}[global]
char *foo = "foo " " plus " " bar ";
\end{cilcode}

\item Initializers for local variables are turned into assignments. This is in
order to separate completely the declarative part of a function body from the
statements. This has the unfortunate effect that we have to drop the \t{const}
qualifier from local variables !

\begin{cilcode}[local]
  int x = 5;
  struct foo { int f1, f2; } a [] = {1, 2, 3, 4, 5 };
\end{cilcode}

\item Local variables in inner scopes are pulled to function scope (with
appropriate renaming). Local scopes thus disappear. This makes it easy to find
and operate on all local variables in a function.

\begin{cilcode}[global]
  int x = 5;
  int main() {
    int x = 6;
    {
      int x = 7;
      return x;
    }
    return x;
  }
\end{cilcode}

\item Global declarations in local scopes are moved to global scope:
\begin{cilcode}[global]
  int x = 5;
  int main() {
    int x = 6;
    {
      static int x = 7;
      return x;
    }
    return x;
  }
\end{cilcode}

\item Return statements are added for functions that are missing them. If the
return type is not a base type then a \t{return} without a value is added.
The guaranteed presence of return statements makes it easy to implement a
transformation that inserts some code to be executed immediately before
returning from a function.
\begin{cilcode}[global]
  int foo() {
    int x = 5;
  }
\end{cilcode}

\item One of the most significant transformations is that expressions that
contain side-effects are separated into statements.

\begin{cilcode}[local]
   int x, f(int);
   return (x ++ + f(x));
\end{cilcode}

 Internally, the \t{x ++} statement is turned into an assignment which the
pretty-printer prints like the original. CIL has only three forms of basic
statements: assignments, function calls and inline assembly.

\item Shortcut evaluation of boolean expressions and the \t{?:} operator are
compiled into explicit conditionals:
\begin{cilcode}[local]
  int x;
  int y = x ? 2 : 4;
  int z = x || y;
  // Here we duplicate the return statement
  if(x && y) { return 0; } else { return 1; }
  // To avoid excessive duplication, CIL uses goto's for
  // statement that have more than 5 instructions
  if(x && y || z) { x ++; y ++; z ++; x ++; y ++; return z; }
\end{cilcode}

\item GCC's conditional expression with missing operands are also compiled
into conditionals:
\begin{cilcode}[local]
  int f();;
  return f() ? : 4;
\end{cilcode}

\item All forms of loops (\t{while}, \t{for} and \t{do}) are compiled
internally as a single \t{while(1)} looping construct with explicit \t{break}
statement for termination. For simple \t{while} loops the pretty printer is
able to print back the original:
\begin{cilcode}[local]
   int x, y;
   for(int i = 0; i<5; i++) {
      if(i == 5) continue;
      if(i == 4) break;
      i += 2;
   }
   while(x < 5) {
     if(x == 3) continue;
     x ++;
   }
\end{cilcode}

\item GCC's block expressions are compiled away. (That's right there is an
infinite loop in this code.)

\begin{cilcode}[local]
   int x = 5, y = x;
   int z = ({ x++; L: y -= x; y;});
   return ({ goto L; 0; });
\end{cilcode}

\item CIL contains support for both MSVC and GCC inline assembly (both in one
internal construct)

\item CIL compiles away the GCC extension that allows many kinds of constructs
to be used as lvalues:

\begin{cilcode}[local]
   int x, y, z;
   return &(x ? y : z) - & (x ++, x);
\end{cilcode}

\item All types are computed and explicit casts are inserted for all
promotions and conversions that a compiler must insert:

\item CIL will turn old-style function definition (without prototype) into
new-style definitions. This will make the compiler less forgiving when
checking function calls, and will catch for example cases when a function is
called with too few arguments. This happens in old-style code for the purpose
of implementing variable argument functions.

\item Since CIL sees the source after preprocessing the code after CIL does
not contain the comments and the preprocessing directives.

\item CIL will remove from the source file those type declarations, local
variables and inline functions that are not used in the file. This means that
your analysis does not have to see all the ugly stuff that comes from the
header files:
\begin{cilcode}[global]
#include <stdio.h>

typedef int unused_type;

static char unused_static (void) { return 0; }

int main() {
  int unused_local;
  printf("Hello world\n"); // Only printf will be kept from stdio.h
}
\end{cilcode}

\end{enumerate}

\section{How to Use CIL}\label{sec-cil}\cutname{cilly.html}

There are two predominant ways to use CIL to write a program analysis or
transformation. The first is to phrase your analysis as a module that is
called by our existing driver. The second is to use CIL as a stand-alone
library. We highly recommend that you use \t{cilly}, our driver.

\subsection{Using \t{cilly}, the CIL driver}

The most common way to use CIL is to write an Ocaml module containing your
analysis and transformation, which you then link into our boilerplate
driver application called \t{cilly}. \t{cilly} is a Perl script that
processes and mimics \t{GCC} and \t{MSVC} command-line arguments and then
calls \t{cilly.byte.exe} or \t{cilly.native.exe} (CIL's Ocaml executable).

An example of such module is \t{logwrites.ml}, a transformation that is
distributed with CIL and whose purpose is to instrument code to print the
addresses of memory locations being written. (We plan to release a
C-language interface to CIL so that you can write your analyses in C
instead of Ocaml.) See \secref{Extension} for a survey of other example
modules.

Assuming that you have written \t{/home/necula/logwrites.ml},
here is how you use it:

 \begin{enumerate}

 \item Modify \t{logwrites.ml} so that it includes a CIL ``feature
 descriptor'' like this:
\begin{verbatim}
let feature : featureDescr =
  { fd_name = "logwrites";
    fd_enabled = ref false;
    fd_description = "generation of code to log memory writes";
    fd_extraopt = [];
    fd_doit =
    (function (f: file) ->
      let lwVisitor = new logWriteVisitor in
      visitCilFileSameGlobals lwVisitor f)
  }
\end{verbatim}
  The \t{fd\_name} field names the feature and its associated
  command-line arguments. The \t{fd\_enabled} field is a \t{bool ref}.
  ``\t{fd\_doit}'' will be invoked if \t{!fd\_enabled} is true after
  argument parsing, so initialize the ref cell to true if you want
  this feature to be enabled by default.

  When the user passes the \t{-{}-{}dologwrites}
  command-line option to \t{cilly}, the variable associated with the
  \t{fd\_enabled} flag is set and the \t{fd\_doit} function is called
  on the \t{Cil.file} that represents the merger (see \secref{merger}) of
  all C files listed as arguments.

 \item Invoke \t{configure} with the arguments
\begin{verbatim}
./configure EXTRASRCDIRS=/home/necula EXTRAFEATURES=logwrites
\end{verbatim}

 This step works if each feature is packaged into its own ML file, and the
name of the entry point in the file is \t{feature}.

 An alternative way to specify the new features is to change the build files
yourself, as explained below.  You'll need to use this method if a single
feature is split across multiple files.

\begin{enumerate}
 \item Put \t{logwrites.ml} in the \t{src} or \t{src/ext} directory. This
 will make sure that \t{make} can find it. If you want to put it in some
 other directory, modify \t{Makefile.in} and add to \t{SOURCEDIRS} your
 directory.  Alternately, you can create a symlink from \t{src} or
 \t{src/ext} to your file.

 \item Modify the \t{Makefile.in} and add your module to the
 \t{CILLY\_MODULES} or
 \t{CILLY\_LIBRARY\_MODULES} variables. The order of the modules matters. Add
 your modules somewhere after \t{cil} and before \t{main}.

 \item If you have any helper files for your module, add those to
 the makefile in the same way.  e.g.:

\begin{verbatim}
CILLY_MODULES = $(CILLY_LIBRARY_MODULES) \
                myutilities1 myutilities2 logwrites \
                main
\end{verbatim}
  % $  <- emacs hack

 Again, order is important: \t{myutilities2.ml} will be able to refer
 to Myutilities1 but not Logwrites.  If you have any ocamllex or ocamlyacc
 files, add them to both \t{CILLY\_MODULES} and either \t{MLLS} or
 \t{MLYS}.

 \item Modify \t{main.ml} so that your new feature descriptor appears in
 the global list of CIL features.
\begin{verbatim}
let features : C.featureDescr list =
  [ Logcalls.feature;
    Oneret.feature;
    Heapify.feature1;
    Heapify.feature2;
    makeCFGFeature;
    Partial.feature;
    Simplemem.feature;
    Logwrites.feature;  (* add this line to include the logwrites feature! *)
  ]
  @ Feature_config.features
\end{verbatim}

 Features are processed in the order they appear on this list.  Put
 your feature last on the list if you plan to run any of CIL's
 built-in features (such as makeCFGfeature) before your own.

\end{enumerate}

 Standard code in \t{cilly} takes care of adding command-line arguments,
 printing the description, and calling your function automatically.
 Note: do not worry about introducing new bugs into CIL by adding a single
 line to the feature list.

 \item Now you can invoke the \t{cilly} application on a preprocessed file, or
 instead use the \t{cilly} driver which provides a convenient compiler-like
 interface to \t{cilly}. See \secref{driver} for details using \t{cilly}.
 Remember to enable your analysis by passing the right argument (e.g.,
 \t{-{}-{}dologwrites}).

\end{enumerate}

\subsection{Using CIL as a library}

CIL can also be built as a library that is called from your stand-alone
application. Add \t{cil/src}, \t{cil/src/frontc}, \t{cil/obj/x86\_LINUX}
(or \t{cil/obj/x86\_WIN32}) to your Ocaml project \t{-I} include paths.
Building CIL will also build the library \t{cil/obj/*/cil.cma} (or
\t{cil/obj/*/cil.cmxa}). You can then link your application against that
library.

You can call the \t{Frontc.parse: string -> unit -> Cil.file} function with
the name of a file containing the output of the C preprocessor.
The \t{Mergecil.merge: Cil.file list -> string -> Cil.file} function merges
multiple files. You can then invoke your analysis function on the resulting
\t{Cil.file} data structure.  You might want to call
\t{Rmtmps.removeUnusedTemps} first to clean up the prototypes and variables
that are not used. Then you can call the function \t{Cil.dumpFile:
cilPrinter -> out\_channel -> Cil.file -> unit} to print the file to a
given output channel. A good \t{cilPrinter} to use is
\t{defaultCilPrinter}.

Check out \t{src/main.ml} and \t{bin/cilly} for other good ideas
about high-level file processing. Again, we highly recommend that you just
our \t{cilly} driver so that you can avoid spending time re-inventing the
wheel to provide drop-in support for standard \t{makefile}s.

Here is a concrete example of compiling and linking your project against
CIL. Imagine that your program analysis or transformation is contained in
the single file \t{main.ml}.

\begin{verbatim}
$ ocamlopt -c -I $(CIL)/obj/x86_LINUX/ main.ml
$ ocamlopt -ccopt -L$(CIL)/obj/x86_LINUX/ -o main unix.cmxa str.cmxa \
        $(CIL)/obj/x86_LINUX/cil.cmxa main.cmx
\end{verbatim} % $

The first line compiles your analysis, the second line links it against CIL
(as a library) and the Ocaml Unix library. For more information about
compiling and linking Ocaml programs, see the Ocaml home page
at \ahreftop{http://caml.inria.fr/ocaml/}{http://caml.inria.fr/ocaml/}.

In the next section we give an overview of the API that you can use
to write your analysis and transformation.

\section{CIL API Documentation}\label{sec-api}

 The CIL API is documented in the file \t{src/cil.mli}. We also have an
\ahref{api/index.html}{online documentation} extracted from
\t{cil.mli} and other useful modules. We
index below the main types that are used to represent C programs in CIL:

\begin{itemize}
\item \ahref{api/index\_types.html}{An index of all types}
\item \ahref{api/index\_values.html}{An index of all values}
\item \ciltyperef{file} is the representation of a file.
\item \ciltyperef{global} is the representation of a global declaration or
definitions. Values for \ahref{api/Cil.html\#VALemptyFunction}{operating on globals}.
\item \ciltyperef{typ} is the representation of a type.
Values for \ahref{api/Cil.html\#VALvoidType}{operating on types}.
\item \ciltyperef{compinfo} is the representation of a structure or a union
type
\item \ciltyperef{fieldinfo} is the representation of a field in a structure
or a union
\item \ciltyperef{enuminfo} is the representation of an enumeration type.
\item \ciltyperef{varinfo} is the representation of a variable
\item \ciltyperef{fundec} is the representation of a function
\item \ciltyperef{lval} is the representation of an lvalue.
Values for \ahref{api/Cil.html\#VALmakeVarinfo}{operating on lvalues}.
\item \ciltyperef{exp} is the representation of an expression without
side-effects.
Values for \ahref{api/Cil.html\#VALzero}{operating on expressions}.
\item \ciltyperef{instr} is the representation of an instruction (with
side-effects but without control-flow)
\item \ciltyperef{stmt} is the representation of a control-flow statements.
Values for \ahref{api/Cil.html\#VALmkStmt}{operating on statements}.
\item \ciltyperef{attribute} is the representation of attributes.
Values for \ahref{api/Cil.html\#TYPEattributeClass}{operating on attributes}.
\end{itemize}


 \subsection{Using the visitor}\label{sec-visitor}

 One of the most useful tools exported by the CIL API is an implementation of
the visitor pattern for CIL programs. The visiting engine scans depth-first
the structure of a CIL program and at each node it queries a user-provided
visitor structure whether it should do one of the following operations:
\begin{itemize}
\item Ignore this node and all its descendants
\item Descend into all of the children and when done rebuild the node if any
of the children have changed.
\item Replace the subtree rooted at the node with another tree.
\item Replace the subtree with another tree, then descend into the children
and rebuild the node if necessary and then invoke a user-specified function.
\item In addition to all of the above actions the visitor can specify that
some instructions should be queued to be inserted before the current
instruction or statement being visited.
\end{itemize}

 By writing visitors you can customize the program traversal and
transformation. One major limitation of the visiting engine is that it does
not propagate information from one node to another. Each visitor must use its
own private data to achieve this effect if necessary.

 Each visitor is an object that is an instance of a class of type \cilvisit{}.
The most convenient way to obtain such classes is to specialize the
\apiref{Cil.nopCilVisitor}{} class (which just traverses the tree doing
nothing). Any given specialization typically overrides only a few of the
methods. Take a look for example at the visitor defined in the module
\t{logwrites.ml}. Another, more elaborate example of a visitor is the
[copyFunctionVisitor] defined in \t{cil.ml}.

 Once you have defined a visitor you can invoke it with one of the functions:
\begin{itemize}
\item \cilvalref{visitCilFile} or \cilvalref{visitCilFileSameGlobals} - visit a file
\item \cilvalref{visitCilGlobal} - visit a global
\item \cilvalref{visitCilFunction} - visit a function definition
\item \cilvalref{visitCilExp} - visit an expression
\item \cilvalref{visitCilLval} - visit an lvalue
\item \cilvalref{visitCilInstr} - visit an instruction
\item \cilvalref{visitCilStmt} - visit a statement
\item \cilvalref{visitCilType} - visit a type. Note that this does not visit
the files of a composite type. use visitGlobal to visit the [GCompTag] that
defines the fields.
\end{itemize}

Some transformations may want to use visitors to insert additional
instructions before statements and instructions. To do so, pass a list of
instructions to the  \cilvalref{queueInstr} method of the specialized
object. The instructions will automatically be inserted before that
instruction in the transformed code. The \cilvalref{unqueueInstr} method
should not normally be called by the user.

 \subsection{Interpreted Constructors and Deconstructors}

 Interpreted constructors and deconstructors are a facility for constructing
and deconstructing CIL constructs using a pattern with holes that can be
filled with a variety of kinds of elements. The pattern is a string that uses
the C syntax to represent C language elements. For example, the following
code:
\begin{code}
Formatcil.cType "void * const (*)(int x)"
\end{verbatim}\end{code}

 is an alternative way to construct the internal representation of the type of pointer to function
with an integer argument and a {void * const} as result:
\begin{code}
TPtr(TFun(TVoid [Attr("const", [])],
          [ ("x", TInt(IInt, []), []) ], false, []), [])
\end{verbatim}\end{code}

 The advantage of the interpreted constructors is that you can use familiar C
syntax to construct CIL abstract-syntax trees.

 You can construct this way types, lvalues, expressions, instructions and
statements. The pattern string can also contain a number of placeholders that
are replaced during construction with CIL items passed as additional argument
to the construction function. For example, the \t{\%e:id} placeholder means
that the argument labeled ``id'' (expected to be of form \t{Fe exp}) will
supply the expression to replace the placeholder. For example, the following
code constructs an increment instruction at location \t{loc}:
\begin{code}
Formatcil.cInstr "%v:x = %v:x + %e:something"
        loc
        [ ("something", Fe some_exp);
          ("x", Fv some_varinfo) ]
\end{verbatim}\end{code}

 An alternative way to construct the same CIL instruction is:
\begin{code}
Set((Var some_varinfo, NoOffset),
    BinOp(PlusA, Lval (Var some_varinfo, NoOffset),
          some_exp, intType),
    loc)
\end{verbatim}\end{code}

 See \ciltyperef{formatArg} for a definition of the placeholders that are
understood.

 A dual feature is the interpreted deconstructors. This can be used to test
whether a CIL construct has a certain form:
\begin{code}
Formatcil.dType "void * const (*)(int x)" t
\end{verbatim}\end{code}

 will test whether the actual argument \t{t} is indeed a function pointer of
the required type. If it is then the result is \t{Some []} otherwise it is
\t{None}. Furthermore, for the purpose of the interpreted deconstructors
placeholders in patterns match anything of the right type. For example,
\begin{code}
Formatcil.dType "void * (*)(%F:t)" t
\end{verbatim}\end{code}

 will match any function pointer type, independent of the type and number of
the formals. If the match succeeds the result is \t{Some [ FF forms ]} where
\t{forms} is a list of names and types of the formals. Note that each member
in the resulting list corresponds positionally to a placeholder in the
pattern.

 The interpreted constructors and deconstructors do not support the complete C
syntax, but only a substantial fragment chosen to simplify the parsing. The
following is the syntax that is supported:
\begin{verbatim}
Expressions:
  E ::= %e:ID | %d:ID | %g:ID | n | L | ( E ) | Unop E | E Binop E
        | sizeof E | sizeof ( T ) | alignof E  | alignof ( T )
        | & L | ( T ) E

Unary operators:
  Unop ::= + | - | ~ | %u:ID

Binary operators:
  Binop ::= + | - | * | / | << | >> | & | ``|'' | ^
          | == | != | < | > | <= | >= | %b:ID

Lvalues:
  L ::= %l:ID | %v:ID Offset | * E | (* E) Offset | E -> ident Offset

Offsets: