-
Notifications
You must be signed in to change notification settings - Fork 0
/
swargol.py
435 lines (349 loc) · 15.5 KB
/
swargol.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
import sdl2
import os
import time
from queue import Queue, Empty
from threading import Thread
from multiprocessing import Pipe, Process, Event
from typing import List
from dataclasses import dataclass
"""
┌───────────┐ Graphics Process
│ ┌────┐ │ ┌───────────────────────────────┐
│ │ ┌─▼────┴─┐ │ ┌─────────┐ ┌────────────┐ │
│ │ │ Life ├─────► Blitter ├──► │ │
│ │ └─┬────▲─┘ │ └─────────┘ │ │ │
▼ │ ┌─▼────┴─┐ │ ┌─────────┐ │ │ │
│ ▲ │ Life ├─────► Blitter ├──► │ │
│ │ └─┬────▲─┘ │ └─────────┘ │ GUI │ │
│ │ ┌─▼────┴─┐ │ ┌─────────┐ │ Renderer │ │
│ │ │ Life ├─────► Blitter ├──► │ │
│ │ └─┬────▲─┘ │ └─────────┘ │ │ │
▼ │ ┌─▼────┴─┐ │ ┌─────────┐ │ │ │
│ ▲ │ Life ├─────► Blitter ├──► │ │
│ │ └─┬────▲─┘ │ └─────────┘ └────────────┘ │
│ └────┘ │ └───────────────────────────────┘
└───────────┘
"Life" threads implement the SWAR life algorithm, for a horizontal strip of
the overall canvas.
Blitter threads use SDL2 functions to unpack the SWAR buffers into RGBA8888
surfaces, which are passed to the main Renderer thread.
The renderer thread is responsible for uploading the surfaces to a GPU texture,
and making it show up on the screen. It's also responsible for dealing with SDL
events.
Each Life thread lives in its own process, to avoid the GIL. They
talk to each other (for overlap/wraparound), and to the Blitter threads (to
report their results), using Pipes.
Everything else happens in the main process, so the Blitters can talk to the
main thread using standard Queues - the hard work here is done inside SDL2,
which does not hold the GIL (meaning we can use multithreading, as opposed
to multiprocessing).
"""
# I prefer the way scaling works by default on x11, and can't be bothered to fix it under wayland
os.environ["SDL_VIDEODRIVER"] = "x11"
SURFACE_FMT = sdl2.SDL_PIXELFORMAT_ARGB8888
# at time of writing, SDL2 blits INDEX4LSB surfaces as if they were actually INDEX4MSB
# Setting this to False should give a small perf boost, but will render incorrectly on
# not-bleeding-edge SDL2 builds
INDEX4LSB_WORKAROUND = True
WIDTH_PADDING = 16
GLIDER_TEST = False
# RGBA
COLOUR_OFF = (40, 40, 40, 255)
COLOUR_ON = (255, 255, 255, 255)
@dataclass(kw_only=True)
class LifeConfig:
"""
Render Conway's Game of Life via SDL2, unreasonably quickly.
:param width: framebuffer width
:param height: framebuffer height
:param vsync: enable vsync
:param fullscreen: enable fullscreen
:param drylife: use the non-standard "drylife" algorithm
:param slow: use the very slow implementation (for benchmark comparisons)
:param frameskip: only render 1-in-n frames to the screen
:param num_procs: degree of parallelism (NB: number of actual threads will be 2n+1)
:param bench_frames: render a certain number of frames and then exit
""" # this docstring is used by clize
width: int = 1280
height: int = 720
vsync: bool = True
fullscreen: bool = False
drylife: bool = True
slow: bool = False
frameskip: int = 1
num_procs: int = 8
bench_frames: int = 0
# return all remaining items in a queue
def queue_purge(queue: Queue):
try:
while True:
yield queue.get_nowait()
except Empty:
pass
# This is the de-optimised version of the `life_thread` function
# It exists solely for reference purposes, and performance comparisons
def life_thread_naive(cfg: LifeConfig, i, width, height, packed_pipe, pipe_top, pipe_bottom):
assert(width % 2 == 0)
width += WIDTH_PADDING
# 4 bits per cell, to match the buffer format of the optimised version
state = bytearray((width * (height + 2)) // 2) # an extra row on top and bottom to handle wraparound
next_state = bytearray(len(state))
state[width//2:-width//2] = [x & 0x11 for x in os.urandom((width * height) // 2)]
def get_cell(state, x, y):
bit_idx = (y * width + x) * 4
return (state[bit_idx // 8] >> (bit_idx % 8)) & 1
def set_cell(state, x, y, val):
bit_idx = (y * width + x) * 4
state[bit_idx // 8] = (state[bit_idx // 8] & ~(1 << (bit_idx % 8))) | (val << (bit_idx % 8))
while True:
pipe_top.send_bytes(state[width//2:width])
pipe_bottom.send_bytes(state[-width:-width//2])
state[:width//2] = pipe_top.recv_bytes()
state[-width//2:] = pipe_bottom.recv_bytes()
for y in range(1, height + 1):
for x in range(width):
neighbor_count = sum(
get_cell(state, (x + dx) % width, y + dy)
for dx, dy in [
(-1, -1), (0, -1), (1, -1),
(-1, 0), (1, 0),
(-1, 1), (0, 1), (1, 1)
]
)
this_cell = get_cell(state, x, y)
next_value = neighbor_count == 3 or (this_cell and neighbor_count == 2)
if cfg.drylife: # another opportunity for dead cells to come alive
next_value |= (not this_cell) and neighbor_count == 7
set_cell(next_state, x, y, next_value)
state, next_state = next_state, state # swap buffers
packed_state = state[width//2:-width//2]
packed_pipe.send_bytes(packed_state[::-1] if INDEX4LSB_WORKAROUND else packed_state)
def life_thread(cfg: LifeConfig, i, width, height, packed_pipe, pipe_top, pipe_bottom):
assert(width % 2 == 0)
STRIDE = width + WIDTH_PADDING
STATE_BYTE_LENGTH = (STRIDE * height) // 2
COLSHIFT = STRIDE * 4
WRAPSHIFT = STRIDE * height * 4
BIAS = (STRIDE + 2) * 4
MASK_1 = int.from_bytes(b"\x11" * STATE_BYTE_LENGTH, "little") << BIAS
MASK_CANVAS = int.from_bytes((b"\x11" * (width // 2) + b"\x00" * (WIDTH_PADDING // 2)) * height, "little") << BIAS
MASK_WRAP_LEFT = int.from_bytes((b"\x11" * ((WIDTH_PADDING // 2) // 2) + b"\x00" * ((width - WIDTH_PADDING // 2) // 2) + b"\x00" * (WIDTH_PADDING // 2)) * (height + 2), "little") << (2 * 4)
MASK_WRAP_RIGHT = int.from_bytes((b"\x00" * ((width - WIDTH_PADDING // 2) // 2) + b"\x11" * ((WIDTH_PADDING // 2) // 2) + b"\x00" * (WIDTH_PADDING // 2)) * (height + 2), "little") << (2 * 4)
MASK_NOT_3 = MASK_1 * (15 ^ 3)
MASK_NOT_4 = MASK_1 * (15 ^ 4)
MASK_NOT_7 = MASK_1 * (15 ^ 7)
if not GLIDER_TEST:
seed_bytes = os.urandom(STATE_BYTE_LENGTH)
else:
# glider test
seed_bytes = bytearray(STATE_BYTE_LENGTH)
if i == 0:
seed_bytes[(STRIDE//2)*4+3:(STRIDE//2)*4+3+2] = b"\x10\x00"
seed_bytes[(STRIDE//2)*5+3:(STRIDE//2)*5+3+2] = b"\x00\x01"
seed_bytes[(STRIDE//2)*6+3:(STRIDE//2)*6+3+2] = b"\x11\x01"
state = (int.from_bytes(seed_bytes, "little") << BIAS) & MASK_CANVAS
pipe_top.send_bytes(seed_bytes[:STRIDE//2]) # send up our top row
pipe_bottom.send_bytes(seed_bytes[-STRIDE//2:]) # send down our bottom row
framectr = 0
try:
while True: # we'll keep going until killed
"""
if we include ourself as a neighbor:
alive = (exactly 3 neighbors) or (alive and 4 neighbors)
"""
# implement wraparound
# vertical wrap
state |= int.from_bytes(pipe_top.recv_bytes(), "little") | (int.from_bytes(pipe_bottom.recv_bytes(), "little") << (WRAPSHIFT + BIAS))
# horizontal wrap
state |= ((state & MASK_WRAP_LEFT) << (width * 4)) | ((state & MASK_WRAP_RIGHT) >> (width * 4))
# count neighbors
summed = state
summed += (summed >> 4) + (summed << 4)
summed += (summed >> COLSHIFT) + (summed << COLSHIFT)
# check if there are exactly 3 neighbors
has_3_neighbors = summed ^ MASK_NOT_3 # at this point, a value of all 1s means it was initially 3
has_3_neighbors &= has_3_neighbors >> 2 # fold in half
has_3_neighbors &= has_3_neighbors >> 1 # fold in half again
# check if there are exactly 4 neighbors
has_4_neighbors = summed ^ MASK_NOT_4 # at this point, a value of all 1s means it was initially 4
has_4_neighbors &= has_4_neighbors >> 2 # fold in half
has_4_neighbors &= has_4_neighbors >> 1 # fold in half again
if cfg.drylife:
# check if there are exactly 7 neighbors
has_7_neighbors = summed ^ MASK_NOT_7 # at this point, a value of all 1s means it was initially 7
has_7_neighbors &= has_7_neighbors >> 2 # fold in half
has_7_neighbors &= has_7_neighbors >> 1 # fold in half again
# variable name here is misleading...
has_3_neighbors |= (~state) & has_7_neighbors
# apply game-of-life rules
state = (has_3_neighbors | (state & has_4_neighbors)) & MASK_CANVAS
packed_state = (state>>BIAS).to_bytes(STATE_BYTE_LENGTH, "little")
pipe_top.send_bytes(packed_state[:STRIDE//2+1])
pipe_bottom.send_bytes(packed_state[-(STRIDE//2+1):])
framectr += 1
if framectr % cfg.frameskip:
continue
packed_pipe.send_bytes(packed_state[-WIDTH_PADDING//2::-1] if INDEX4LSB_WORKAROUND else packed_state)
except KeyboardInterrupt: # this should only happen if the user pressed Ctrl+C
print("life_thread SIGINT")
while True:
packed_pipe.send_bytes(bytes(STATE_BYTE_LENGTH)) # unblock any readers, until we get killed
def blit_thread(cfg: LifeConfig, i: int, section_height :int, stopped: Event, packed_queue, blitted_queue: Queue):
while not stopped.is_set():
packed_frame = packed_queue.recv_bytes() # this needs to stay in scope until SDL_ConvertSurfaceFormat is complete!
surface = sdl2.SDL_CreateRGBSurfaceWithFormatFrom(
packed_frame,
cfg.width, section_height,
4, # bit-depth
(cfg.width + WIDTH_PADDING) // 2, # pitch
sdl2.SDL_PIXELFORMAT_INDEX4MSB if INDEX4LSB_WORKAROUND else sdl2.SDL_PIXELFORMAT_INDEX4LSB
)
sdl2.SDL_SetPaletteColors(surface.contents.format.contents.palette, sdl2.SDL_Color(*COLOUR_OFF), 0, 1)
sdl2.SDL_SetPaletteColors(surface.contents.format.contents.palette, sdl2.SDL_Color(*COLOUR_ON), 1, 1)
blitted_surface = sdl2.SDL_ConvertSurfaceFormat(surface, SURFACE_FMT, 0)
if not blitted_surface:
raise Exception("SDL_ConvertSurfaceFormat: " + sdl2.SDL_GetError().decode())
blitted_queue.put(blitted_surface)
sdl2.SDL_FreeSurface(surface)
print(f"blit_thread {i}: graceful exit")
packed_queue.recv_bytes() # do a final read to un-block the writer
packed_queue.close() # close our end of the Pipe
def gui_thread(cfg: LifeConfig, section_heights: List[int], blitted_queues: List[Queue]):
window = sdl2.SDL_CreateWindow(
b"pyswargol",
sdl2.SDL_WINDOWPOS_UNDEFINED, sdl2.SDL_WINDOWPOS_UNDEFINED,
cfg.width, cfg.height,
sdl2.SDL_WINDOW_SHOWN
)
if not window:
raise Exception("Failed to create SDL2 Window")
renderer = sdl2.SDL_CreateRenderer(window, -1, sdl2.SDL_RENDERER_ACCELERATED | (sdl2.SDL_RENDERER_PRESENTVSYNC if cfg.vsync else 0))
if not renderer:
raise Exception("Failed to create SDL2 Renderer")
if cfg.fullscreen:
sdl2.SDL_SetWindowFullscreen(window, sdl2.SDL_WINDOW_FULLSCREEN)
sdl2.SDL_ShowCursor(sdl2.SDL_DISABLE) # hide cursor
textures = []
for h in section_heights:
texture = sdl2.SDL_CreateTexture(
renderer,
SURFACE_FMT,
sdl2.SDL_TEXTUREACCESS_STREAMING,
cfg.width, h
)
if not texture:
raise Exception("Failed to create SDL2 Texture")
textures.append(texture)
if INDEX4LSB_WORKAROUND:
blitted_queues.reverse()
textures.reverse()
prev_times = [time.time()]*60
frame_ctr = 0
running = True
try:
while running:
e = sdl2.SDL_Event()
while sdl2.SDL_PollEvent(e):
if e.type == sdl2.SDL_QUIT:
running = False
if e.type == sdl2.SDL_KEYDOWN:
if e.key.keysym.sym == sdl2.SDLK_ESCAPE:
running = False
if e.key.keysym.sym == sdl2.SDLK_q:
running = False
if not running:
break
y = 0
for surface_queue, texture in zip(blitted_queues, textures):
surface = surface_queue.get()
sdl2.SDL_UpdateTexture(
texture, None,
surface.contents.pixels,
surface.contents.pitch
)
h = surface.contents.h
sdl2.SDL_FreeSurface(surface)
sdl2.SDL_RenderCopy(
renderer, texture, None,
sdl2.SDL_Rect(0, y, cfg.width, h)
)
y += h
sdl2.SDL_RenderPresent(renderer)
now = time.time()
fps = len(prev_times)/(now-prev_times[frame_ctr % len(prev_times)])
msg = f"{round(fps)}fps ({round(fps*cfg.frameskip)}tps)" if frame_ctr > len(prev_times) else "??fps (??tps)"
sdl2.SDL_SetWindowTitle(window, (f"pyswargol - {cfg.width}x{cfg.height} - " + msg).encode())
prev_times[frame_ctr % len(prev_times)] = now
frame_ctr += 1
if cfg.bench_frames and frame_ctr > cfg.bench_frames:
os._exit(0) # exit immediately, don't waste time cleaning up
finally:
for texture in textures:
sdl2.SDL_DestroyTexture(texture)
sdl2.SDL_DestroyRenderer(renderer)
sdl2.SDL_DestroyWindow(window)
sdl2.SDL_Quit()
def main(cfg: LifeConfig):
# init sdl2 here so we can query screen size for fullscreen mode
if sdl2.SDL_Init(sdl2.SDL_INIT_VIDEO) < 0:
raise Exception("Failed to init SDL2")
if cfg.fullscreen:
dm = sdl2.SDL_DisplayMode()
sdl2.SDL_GetDesktopDisplayMode(0, dm)
print(f"Overriding fb size to match fullscreen resolution: {dm.w}x{dm.h}")
cfg.width, cfg.height = dm.w, dm.h
stopped = Event()
# vertically split the framebuffer into close-to-equal sized chunks
baseheight, rem = divmod(cfg.height, cfg.num_procs)
section_heights = [baseheight] * (cfg.num_procs - rem) + [baseheight + 1] * rem
assert(sum(section_heights) == cfg.height)
blitted_queues = [Queue(1) for _ in range(cfg.num_procs)]
wraparound_pipes = [Pipe() for _ in range(cfg.num_procs)]
packed_result_pipes = [Pipe(duplex=False) for _ in range(cfg.num_procs)]
life_procs = [
Process(target=life_thread_naive if cfg.slow else life_thread, args=[cfg, i, cfg.width, h, packed_result_pipes[i][1], wraparound_pipes[i][0], wraparound_pipes[(i+1)%cfg.num_procs][1]])
for i, h in enumerate(section_heights)
]
for proc in life_procs:
proc.start()
blitter_threads = [
Thread(target=blit_thread, args=[cfg, i, h, stopped, packed_result_pipes[i][0], blitted_queues[i]])
for i, h in enumerate(section_heights)
]
for thread in blitter_threads:
thread.start()
try:
gui_thread(cfg, section_heights, blitted_queues)
except KeyboardInterrupt:
print("Looks like you pressed Ctrl+C!")
# The shutdown process is surprisingly fiddly to get right, without deadlocks
print("Shutting down...")
stopped.set() # tell the blitter threads to stop
# unblock the blitters so they can "notice" the stop event
for queue in blitted_queues:
for surface in queue_purge(queue):
sdl2.SDL_FreeSurface(surface)
# wait for blitter threads to exit gracefully.
for thread in blitter_threads:
thread.join()
print("Stopped blitters.")
# final pass queue purge
for queue in blitted_queues:
for surface in queue_purge(queue):
sdl2.SDL_FreeSurface(surface)
# forcefully kill the life procs, trying to do this
# cleanly without races is proving too difficult...
for proc in life_procs:
proc.kill()
print("Stopped life procs.")
# clean up the remaining Pipes
# I think the gc will do this anyway but hey, nice to be explicit
for a, b in wraparound_pipes:
a.close()
b.close()
print("Bye!")
if __name__ == "__main__":
from dataclass_argparser import parse_args_for_dataclass_or_exit
cfg = parse_args_for_dataclass_or_exit(LifeConfig)
print("Config:", cfg)
main(cfg)