34
34
inductorconfig .coordinate_descent_check_all_directions = True
35
35
inductorconfig .allow_buffer_reuse = False
36
36
37
+ # torch._dynamo.config.capture_dynamic_output_shape_ops = True
37
38
torch ._dynamo .config .capture_dynamic_output_shape_ops = True
38
39
39
40
@@ -173,7 +174,7 @@ def masks_to_rle_dict(masks):
173
174
174
175
# Queue to hold incoming requests
175
176
request_queue = asyncio .Queue ()
176
- batch_interval = 0.1 # Time interval to wait before processing a batch
177
+ batch_interval = 0.01 # Time interval to wait before processing a batch
177
178
178
179
179
180
def process_batch (batch , mask_generator ):
@@ -186,7 +187,7 @@ def process_batch(batch, mask_generator):
186
187
print (f"Processing batch of len { len (batch )} using generate_batch" )
187
188
masks = mask_generator .generate_batch (image_tensors )
188
189
print (f"Took avg. { (time .time () - t ) / len (batch )} s per batch entry" )
189
- max_memory_allocated ()
190
+ # max_memory_allocated()
190
191
return masks
191
192
192
193
@@ -220,17 +221,17 @@ async def lifespan(app: FastAPI):
220
221
task .cancel ()
221
222
222
223
223
def benchmark_fn(func, inp, mask_generator, warmup=3, runs=10):
    """Time repeated calls of ``func(inp, mask_generator)`` and print the mean.

    The CUDA cache is emptied and the peak-memory statistics are reset before
    any call is made. ``func`` is then invoked ``warmup`` times untimed,
    followed by ``runs`` timed invocations. The mean wall-clock time per timed
    invocation is printed, and ``max_memory_allocated()`` is called afterwards.

    Args:
        func: Callable invoked as ``func(inp, mask_generator)``.
        inp: First argument forwarded to ``func`` on every call.
        mask_generator: Second argument forwarded to ``func`` on every call.
        warmup: Number of untimed calls made before measuring.
        runs: Number of timed calls; must be at least 1.

    Raises:
        ValueError: If ``runs`` is smaller than 1.
    """
    # Reject a non-positive run count up front: it would otherwise surface as
    # a ZeroDivisionError (or a meaningless negative average) only after the
    # warmup iterations have already been executed.
    if runs < 1:
        raise ValueError(f"runs must be at least 1, got {runs}")

    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

    # Lazy %-style arguments so the actual counts are logged; the previous
    # messages were plain strings that printed "{warmup}" / "{runs}" verbatim.
    logging.info("Running %s warmup iterations.", warmup)
    for _ in range(warmup):
        func(inp, mask_generator)

    logging.info("Running %s benchmark iterations.", runs)
    t = time.time()
    for _ in range(runs):
        func(inp, mask_generator)
    print(f"Benchmark took {(time.time() - t) / runs}s per iteration.")
    max_memory_allocated()
235
236
236
237
@@ -244,11 +245,11 @@ def max_memory_allocated():
244
245
245
246
def unittest_fn(masks, ref_masks, order_by_area=False, verbose=False):
    """Compare ``masks`` against ``ref_masks`` and print the outcome.

    Delegates to ``compare_rle_lists.compare_masks``, which returns a pair
    unpacked here as ``(miou, equal_count)``. When ``equal_count`` equals
    ``len(masks)`` an exact-match message is printed; otherwise the reported
    mIoU and the equal count are printed.

    Args:
        masks: Masks under test; must support ``len()``.
        ref_masks: Reference masks to compare against.
        order_by_area: Forwarded unchanged to ``compare_masks``.
        verbose: Forwarded unchanged to ``compare_masks``.
    """
    # Imported lazily, as in the original, so the module is only required
    # when this check actually runs.
    from compare_rle_lists import compare_masks

    miou, equal_count = compare_masks(
        masks,
        ref_masks,
        order_by_area=order_by_area,
        verbose=verbose,
    )
    total = len(masks)

    if equal_count != total:
        print(f"mIoU is {miou} with equal count {equal_count} out of {total}")
        return
    print("Masks exactly match reference.")
252
253
253
254
254
255
def main (checkpoint_path ,
@@ -290,7 +291,7 @@ def main(checkpoint_path,
290
291
291
292
logging .info (f"Loading model { sam2_checkpoint } with config { model_cfg } " )
292
293
sam2 = build_sam2 (model_cfg , sam2_checkpoint , device = device , apply_postprocessing = False )
293
-
294
+
294
295
logging .info (f"Using { points_per_batch } points_per_batch" )
295
296
mask_generator = SAM2AutomaticMaskGenerator (sam2 , points_per_batch = points_per_batch , output_mode = "uncompressed_rle" )
296
297
@@ -299,18 +300,31 @@ def main(checkpoint_path,
299
300
# TODO: Using CUDA graphs can cause numerical differences?
300
301
mask_generator .predictor .model .image_encoder = torch .compile (
301
302
mask_generator .predictor .model .image_encoder ,
302
- # mode="max-autotune-no-cudagraphs",
303
303
mode = "max-autotune" ,
304
304
fullgraph = True ,
305
305
dynamic = False ,
306
306
)
307
307
308
- mask_generator ._process_batch_fullgraph = torch .compile (
309
- mask_generator ._process_batch_fullgraph ,
308
+ mask_generator .predictor .model .sam_prompt_encoder .forward = torch .compile (
309
+ mask_generator .predictor .model .sam_prompt_encoder .forward ,
310
+ mode = "max-autotune" ,
311
+ fullgraph = True ,
312
+ dynamic = False ,
313
+ )
314
+
315
+ mask_generator .predictor ._predict_masks = torch .compile (
316
+ mask_generator .predictor ._predict_masks ,
317
+ mode = "max-autotune" ,
310
318
fullgraph = True ,
311
- dynamic = True ,
319
+ dynamic = False ,
312
320
)
313
321
322
+ # mask_generator.predictor._predict_masks_postprocess = torch.compile(
323
+ # mask_generator.predictor._predict_masks_postprocess,
324
+ # fullgraph=True,
325
+ # dynamic=True,
326
+ # )
327
+
314
328
if furious :
315
329
mask_generator .predictor .model .image_encoder = mask_generator .predictor .model .image_encoder .to (torch .float16 )
316
330
# NOTE: Not baseline feature
@@ -340,27 +354,28 @@ def main(checkpoint_path,
340
354
unittest_fn (masks , ref_masks , order_by_area = True , verbose = verbose )
341
355
342
356
if benchmark :
357
+ print (f"batch size { batch_size } dog benchmark" )
343
358
if batch_size == 1 :
344
- print ("batch size 1 test" )
345
359
benchmark_fn (image_tensor_to_masks , image_tensor , mask_generator )
346
- benchmark_fn (image_tensor_to_masks , torch .tensor (image_tensor ).transpose (0 , 1 ).numpy (), mask_generator )
347
360
else :
348
- print (f"batch size { batch_size } test" )
349
361
benchmark_fn (image_tensors_to_masks , [image_tensor ] * batch_size , mask_generator )
350
362
351
- print (f"batch size { batch_size } example shapes test" )
352
- random_images = [np .random .randint (0 , 256 , size = size , dtype = np .uint8 ) for size in example_shapes ()]
353
- random_images = random_images [:batch_size ]
354
- benchmark_fn (image_tensors_to_masks , random_images , mask_generator )
363
+ for i , shapes in enumerate ([example_shapes (), example_shapes_2 ()]):
364
+ print (f"batch size { batch_size } example shapes { i } benchmark" )
365
+ random_images = [np .random .randint (0 , 256 , size = size , dtype = np .uint8 ) for size in shapes ]
355
366
356
- print (f"batch size { batch_size } example shapes 2 test" )
357
- random_images = [np .random .randint (0 , 256 , size = size , dtype = np .uint8 ) for size in example_shapes_2 ()]
358
- random_images = random_images [:batch_size ]
359
- benchmark_fn (image_tensors_to_masks , random_images , mask_generator )
367
+ if batch_size == 1 :
368
+ [benchmark_fn (image_tensor_to_masks , r , mask_generator ) for r in random_images ]
369
+ else :
370
+ random_images = random_images [:batch_size ]
371
+ benchmark_fn (image_tensors_to_masks , random_images , mask_generator )
360
372
361
373
if profile is not None :
362
374
print (f"Saving profile under { profile } " )
363
- profiler_runner (profile , image_tensors_to_masks , [image_tensor ] * batch_size , mask_generator )
375
+ if batch_size == 1 :
376
+ profiler_runner (profile , image_tensor_to_masks , image_tensor , mask_generator )
377
+ else :
378
+ profiler_runner (profile , image_tensors_to_masks , [image_tensor ] * batch_size , mask_generator )
364
379
365
380
if dry :
366
381
return
@@ -406,7 +421,8 @@ async def upload_image(image: UploadFile = File(...)):
406
421
return StreamingResponse (buf , media_type = "image/png" )
407
422
408
423
409
- uvicorn .run (app , host = host , port = port , log_level = "info" )
424
+ # uvicorn.run(app, host=host, port=port, log_level="info")
425
+ uvicorn .run (app , host = host , port = port )
410
426
411
427
if __name__ == "__main__" :
412
428
fire .Fire (main )
0 commit comments