;;;; Note: This file is not intended to be loaded directly from Lisp,
;;;; but to be executed expression by expression.
;;;; If you are using Emacs/Lem, you can simply press `C-c C-c` while your cursor is over an expression to evaluate it.
(unless (find-package :caten)
(ql:quickload :caten))
(defpackage :getting-started
(:use :cl :caten/air :caten/aasm :caten/api :caten/runtime))
(in-package :getting-started)
;;; [Introduction]
;;; Welcome to Caten.
;;; Caten is a tensor library written in Common Lisp.
;;; Our ultimate goal is to generate a fast deep learning model inference runtime across a wide range of devices with minimal cost.
;;;
;;; We need contributors. This document was created to help new contributors quickly understand the design of Caten.
;;; To learn how Caten works, you first need to understand the following 4 main components:
;;; - 1. caten/api | High-Level Graph Interface (API)
;;; - 2. caten/air | Low-Level Graph Interface
;;; - 3. caten/codegen | AIR Graph => Kernel Generator
;;; - 4. caten/runtime | AIR Graph Runner + Buffer managements
;;; **All other systems are built on top of these packages.**
;;; ~~~[1. High Level Interface (caten/api)]~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;;; The main role of `caten/api` is to provide a matrix operation API **with the same interface as Numpy/PyTorch**.
;;; Like Petalisp/tinygrad, Caten uses lazy evaluation.
;; For example, creating a 3x3 matrix filled with 1.0 via make-tensor does not trigger any computation; this is what we call lazy evaluation.
(defparameter *tensor1* (make-tensor `(3 3) :initial-element 1.0))
(print *tensor1*) ;; Nothing is computed yet: the printed tensor has no contents because it is evaluated lazily.
;; If you want to see the result, compile and run the kernel with the function `proceed`.
(print (proceed *tensor1*))
;; Let's define another tensor for the next experiments
(defparameter *tensor2* (make-tensor `(3 3) :initial-element 1.0))
;; To execute a previously compiled graph without recompiling, create a `GraphRuntime` using the `caten` function, then execute it with forward.
;; The graph below consists of a single matrix multiplication applied to the two tensors defined above.
;; The `caten` function compiles the lazy matmul into a low-level graph (an AST) and wraps it in a GraphRuntime.
(print (caten (!matmul *tensor1* *tensor2*)))
;; The compiled GraphRuntime is executed with `forward`.
(print (forward (caten (!matmul *tensor1* *tensor2*))))
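;; Since `caten` returns the GraphRuntime itself, you can keep it around and call `forward`
;; repeatedly without recompiling. (A minimal sketch using only the API shown above.)
(let ((runtime (caten (!matmul *tensor1* *tensor2*))))
  (print (forward runtime))  ;; first execution
  (print (forward runtime))) ;; reuses the already compiled graph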
;; `proceed` is a shorthand: it compiles the tensor into the lower-level AST and immediately executes it with `forward`.
(print (proceed (!matmul *tensor1* *tensor2*)))
;; Of course, Caten is designed so that all graphs can be compiled with dynamic shapes. There's no need to recompile every time the batch_size changes.
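;; A minimal sketch of dynamic shapes, assuming `forward` accepts (symbol . value) pairs to
;; bind symbolic dimensions: the graph below is compiled once and then executed with two
;; different values of batch_size.
(let ((model (caten (!add (make-tensor `(batch_size 3) :initial-element 1.0)
                          (make-tensor `(batch_size 3) :initial-element 2.0)))))
  (print (forward model `(batch_size . 2)))
  (print (forward model `(batch_size . 4))))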
;;; The goal of `caten/api` is to prevent bugs by wrapping the low-level interface commands (described later) in a high-level API.
;;; Various optimizations become available once the `caten/api` AST is lowered into `caten/air`!
;;; => In the next section, we will learn about `caten/air`.
;;; ~~~[Low Level Interface (caten/air)]~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;;; In general, a deep learning compiler is a program that lowers and transforms operations expressed as a DAG (Directed Acyclic Graph).
;;; Caten implements a general-purpose DAG processing library called `caten/air`.
;;; Any DAG-shaped data structure in Caten's source code should be represented as a `caten/air:Graph`.
(print (make-node :BinaryOps :ADD (list 'a) (list 'b 'c) :reduction t))
(print (make-graph (make-node :BinaryOps :ADD (list 'a) (list 'b 'c))))
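;; A Graph is simply a container of nodes: `graph-nodes` (also used later in this file) returns them as a list.
;; A quick sketch:
(print (graph-nodes (make-graph (make-node :BinaryOps :ADD (list 'a) (list 'b 'c)))))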
;;; Any function whose name starts with `%` is a low-level operation that creates an AIR node.
;;; By wrapping such graph constructions with the with-context macro, Caten automatically organizes them into an executable graph.
;;; - Use caten/runtime:realize-graph to run the graph in a GraphRuntime.
;;; - Use caten/aasm:->dot to open a visualization of the graph in your browser.
;; Example:
;; The following defines a low-level graph in which two constants are created and added together.
;; Note: `->dot` requires Graphviz to be installed!
(let ((graph
(with-context
(x (%fconst 1.0))
(y (%fconst 2.0))
(out (%add x y)))))
(print graph)
(print (realize-graph graph :buffer-type 'caten/byoc/lisp:LispBuffer))
;;(->dot graph)
)
;;; - Nodes can be defined using defnode.
;;; - Pattern Matcher can be defined using `defsimplifier`
;;; - `caten/aasm` provides the set of instructions used by the GraphRuntime
;;; ~~~[A Bridge between AIR and API]~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;;; In this section, we will explain the mechanism of lowering the high-level interface (API) to the low-level interface (AIR Graph).
;; utility function for presentation
(defun present (&rest tensors)
"Present compiles and executes the given tensor, then prints the result."
(format t "~{~& =>~% ~A~}" (multiple-value-list (apply #'proceed tensors))))
;;; To lower `caten/api` to a Graph, implement the `lower` method on a `Func` (the computation-node class of `caten/api`) and describe the lowered computation graph there.
;;; Let’s try defining a `Func` to compute `Sin(Cos(x))` as an example.
(defclass SinCos (Func) nil
(:documentation "The func SinCos computes sin(cos(x))"))
;; `forward` creates the lazy output tensor(s) for the next computation.
;; Instead of constructing them by hand, you can declare the shape relation with the `st` macro:
;; here "A[~] -> A[~]" declares that the output has the same shape as the input A.
(defmethod forward ((op SinCos) &rest tensors) (st "A[~] -> A[~]" (tensors)))
;; Backward is optional (skipped this time)
(defmethod backward ((op SinCos) &optional prev-grad) (declare (ignore prev-grad)) nil)
;; `lower` describes the lowered (caten/air) expression of `SinCos`.
;; Note that cos(x) is expressed as sin(x + pi/2), so the graph below computes sin(cos(x)).
(defmethod lower ((op SinCos) &rest inputs)
(let ((x (car inputs)))
(with-context
(a (%sin (%add x (%fconst (/ pi 2)))))
(b (%sin a)))))
;; The next lazy tensor is created by calling the `forward` method.
;; Let's wrap this in a utility function named `!sincos`.
(defun !sincos (tensor)
(forward (make-instance 'SinCos) tensor))
;; Calling !sincos (note that the printed tensor's :op is SINCOS)
(print (!sincos (make-tensor `(3 3))))
;; Running SinCos
(present
(!sincos (make-tensor `(3 3) :initial-element 1.0))) ;; = 0.51439524
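;; Because SinCos is just another Func, it composes freely with the rest of caten/api.
;; A small sketch combining it with `!add` (an existing caten/api op):
(present
 (!sincos (!add (make-tensor `(3 3) :initial-element 1.0)
                (make-tensor `(3 3) :initial-element 1.0)))) ;; = sin(cos(2.0)) ≈ -0.40424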
;;; ~~~~[caten/codegen]~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
(defpackage :codegen-example
(:use :cl :caten/air :caten/aasm :caten/api :caten/runtime :caten/codegen/expr-cache)
;; Import some low-level APIs
(:import-from
:caten/codegen/scheduler
#:graph-schedule)
(:import-from
:caten/codegen/rewriting-rules
#:apply-rewriting-rules
#:schedule-item-write-define-global)
(:import-from
:caten/codegen/shape-inference
#:run-type-infer)
(:import-from
:caten/codegen/blueprint
#:lower-schedule-item
#:print-blueprint)
(:import-from
:caten/codegen/renderer
#:CStyle-Renderer
#:%render-kernel
#:%compile-kernel)
(:import-from
:caten/codegen/jit
#:schedule-graph->avm-graph))
(in-package :codegen-example)
;; Defining some utils ...
(defun run-shape-inference (runtime)
(run-type-infer runtime)
(apply-rewriting-rules runtime))
(defparameter *width* 120)
(defun saying (number title object)
(dotimes (i *width*) (princ "="))
(fresh-line)
(format t "~a. ~a~%~a~%" number title object)
object)
(defun try-codegen! (graph outputs)
"Utility for visualizing the process of compiling the `graph`."
(optimize-aasm graph)
(let ((vm (make-runtime graph :fw-outputs outputs)))
(fresh-line)
(saying 1 "Compiling the following initial computation graph:" graph)
(saying 2 "Created GraphRuntime with the computation graph:" vm)
(run-shape-inference vm)
(let ((schedule-graph (graph-schedule (runtime-graph vm)))
(*expr-cache* (make-expr-cache))
(renderer (make-instance 'CStyle-Renderer)))
(saying 3 "Generated the schedule-graph from the computation graph:" schedule-graph)
(dolist (item (graph-nodes schedule-graph))
;; If the schedule-item is labelled as jitable, it can be lowered:
(when (getattr item :jitable)
(lower-schedule-item item (runtime-graph vm) schedule-graph)
(saying 4 "Lowered schedule item to a blueprint suitable for code generation:" (print-blueprint (getattr item :blueprint) nil))
(schedule-item-write-define-global item)
(let ((c-kernel (%render-kernel renderer item)))
(saying 5 "Generated C code from the blueprint:" c-kernel)
(setf (getattr item :rendered-object) c-kernel))))
;; Invoking gcc ...
(%compile-kernel renderer (graph-nodes schedule-graph) nil)
;; Overwrite the base graph with the compiled graph
(let ((optimized-graph (schedule-graph->avm-graph (runtime-graph vm) schedule-graph)))
(make-runtime optimized-graph :fw-outputs (graph-outputs schedule-graph) :buffer-type 'caten/byoc/lisp:LispBuffer)))))
;; Example 1. Lowering the addition of two matrices into a C kernel.
(let* ((graph
;; This is a graph to compile: Z(3 3) = X(3 3) + Y(3 3)
(with-context
(x (%make-tensor `(3 3) :dtype :float32 :from 'x))
(y (%make-tensor `(3 3) :dtype :float32 :from 'y))
(c (%add x y :id 'z))))
;; Simplifying the input graph
(_ (optimize-aasm graph))
;; Wrap the graph as an instance of GraphRuntime to manage allocations
(vm (make-runtime graph :fw-outputs (list 'z))))
(declare (ignore _))
(fresh-line)
(saying 1 "Compiling the following initial
computation graph:" graph)
(saying 2 "Created GraphRunner with the computation graph:" vm)
;; caten/codegen requires the shape of all computation nodes to be known!
(run-shape-inference vm)
;; Now we are ready to run the scheduler.
;; Pass the runtime's graph to `graph-schedule`, which partitions it into smaller units called `schedule-items`.
(let ((schedule-graph (graph-schedule (runtime-graph vm)))
(*expr-cache* (make-expr-cache))
(renderer (make-instance 'CStyle-Renderer)))
(saying 3 "Generated the schedule-graph from the computation graph:" schedule-graph)
;; Once the `schedule-items` are created, each item is lowered into a blueprint suitable for code generation.
(dolist (item (graph-nodes schedule-graph))
;; If the schedule-item is labelled as jitable, it can be lowered:
(when (getattr item :jitable)
;; Each item is lowered using the `lower-schedule-item` function
(lower-schedule-item item (runtime-graph vm) schedule-graph)
(saying 4 "Lowered schedule item to a blueprint suitable for code generation:" (print-blueprint (getattr item :blueprint) nil))
(schedule-item-write-define-global item)
;; With the blueprint code, a C kernel is generated
(let ((c-kernel (%render-kernel renderer item)))
(saying 5 "Generated C code from the blueprint:" c-kernel)
(setf (getattr item :rendered-object) c-kernel))))
;; Invoking gcc ...
(%compile-kernel renderer (graph-nodes schedule-graph) nil)
;; Overwrite the base graph with the compiled graph
(let* ((optimized-graph (schedule-graph->avm-graph (runtime-graph vm) schedule-graph))
(vm (make-runtime optimized-graph :fw-outputs (graph-outputs schedule-graph) :buffer-type 'caten/byoc/lisp:LispBuffer)))
;; Try axpy!
;; Finally, the C code is executed
(saying
6 "Running the computation X(3x3) + Y(3x3), the result is:"
(%run vm `(x . ,(linspace `(3 3) 1 0)) `(y . ,(linspace `(3 3) 1 0)))))))
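;; As a quick cross-check (a sketch using only the high-level API from the first section),
;; the same X + Y can be computed through caten/api; it should print the same 3x3 result:
(print (proceed (!add (linspace `(3 3) 1 0) (linspace `(3 3) 1 0))))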
;; Example 2. Lowering a square matrix multiplication (gemm) into a C kernel.
(defun %make-squared-gemm (N out)
"a @ b.T"
(optimize-aasm
(with-context
(a (%make-tensor `(,n ,n)))
(b (%make-tensor `(,n ,n)))
(a1 (%view a `(,n ,n ,n) `(0 0 0) `(,n ,n ,n) `(1 1 1) `(nil t nil) (%stride `(,n 1 ,n) :row)))
(b1 (%view b `(,n ,n ,n) `(0 0 0) `(,n ,n ,n) `(1 1 1) `(t nil nil) (%stride `(1 ,n ,n) :row)))
(o (%mul a1 b1))
(c (%load (%make-tensor `(,n ,n 1)) 0.0))
(c (%view c `(,n ,n 1) `(0 0 0) `(,n ,n ,n) `(1 1 1) `(nil nil t) (%stride `(,n ,n 1) :row)))
(c (%add c o :reduction t))
(c (%reshape c `(,n ,n) :id out)))))
;; Try matmul
(try-codegen!
(%make-squared-gemm 128 'out)
(list 'out))
#+(or nil)
(try-codegen!
(make-graph
;; your code follows ...
)
(list ... ))
;; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;;; Caten's standard APIs use the JIT pipeline described above under the hood.
;;; - Set JIT=1 to enable it.
;;; - Setting JIT_DEBUG>=2 gives you debugging output similar to what we printed manually above.
(defun present-beautiful (tensor)
(ctx:with-contextvar (:BACKEND "clang" :JIT_DEBUG 3)
(with-no-grad (caten tensor))))
;; auto-scheduler is WIP as of this writing!
(present-beautiful
(!matmul (make-tensor `(128 512)) (make-tensor `(512 1024))))
(present-beautiful
(!matmul (make-tensor `(a b)) (make-tensor `(b c))))
(present-beautiful
(forward (caten/nn:Embedding 10 20) (make-tensor `(b 20))))