@@ -226,16 +226,23 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
226226    }
227227
228228    if  (best_fit_block  ==  -1 ) {
229-         // no suitable block found, try the last block (this will grow a chunks size) 
229+         // no suitable block found, try the last block (this may grow a chunk's size) 
230+         int64_t  best_reuse  =  INT64_MIN ;
230231        for  (int  c  =  0 ; c  <  alloc -> n_chunks ; ++ c ) {
231232            struct  tallocr_chunk  *  chunk  =  alloc -> chunks [c ];
232233            if  (chunk -> n_free_blocks  >  0 ) {
233234                struct  free_block  *  block  =  & chunk -> free_blocks [chunk -> n_free_blocks  -  1 ];
234235                max_avail  =  MAX (max_avail , block -> size );
235-                 if  (block -> size  >= size ) {
236+                 int64_t  reuse_factor  =  chunk -> max_size  -  block -> offset  -  size ;
237+                 // reuse_factor < 0 : amount of extra memory that needs to be allocated 
238+                 // reuse_factor = 0 : allocated free space exactly matches tensor size 
239+                 // reuse_factor > 0 : superfluous memory that will remain unused 
240+                 bool  better_reuse  =  best_reuse  <  0  &&  reuse_factor  >  best_reuse ;
241+                 bool  better_fit  =  reuse_factor  >= 0  &&  reuse_factor  <  best_reuse ;
242+                 if  (block -> size  >= size  &&  (better_reuse  ||  better_fit )) {
236243                    best_fit_chunk  =  c ;
237244                    best_fit_block  =  chunk -> n_free_blocks  -  1 ;
238-                     break ;
245+                     best_reuse   =   reuse_factor ;
239246                }
240247            }
241248        }
@@ -268,7 +275,7 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
268275#ifdef  GGML_ALLOCATOR_DEBUG 
269276    add_allocated_tensor (alloc , addr , tensor );
270277    size_t  cur_max  =  addr .offset  +  size ;
271-     if  (cur_max  >  alloc -> max_size [ addr . chunk ] ) {
278+     if  (cur_max  >  chunk -> max_size ) {
272279        // sort allocated_tensors by chunk/offset 
273280        for  (int  i  =  0 ; i  <  1024 ; i ++ ) {
274281            for  (int  j  =  i  +  1 ; j  <  1024 ; j ++ ) {
0 commit comments