@@ -815,6 +815,21 @@ struct llama_mmap {
 
     llama_mmap(const llama_mmap &) = delete;
 
+    static void align_offset(size_t & offset, size_t & len, size_t page_size) {
+        // align offset to the next page
+        size_t offset_in_page = offset & (page_size - 1);
+        size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
+        offset += offset_to_page;
+
+        if (offset_to_page >= len) {
+            len = 0;
+        } else {
+            len -= offset_to_page;
+            // align len to the previous page
+            len -= len & (page_size - 1);
+        }
+    }
+
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;
 
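The new `align_offset` helper shrinks an arbitrary `(offset, len)` byte range to the largest page-aligned range it fully contains, so the `unmap` implementations below never release a page that is partially in use. A minimal standalone sketch of its behaviour, assuming 4 KiB pages (the helper is the one from the patch; the `main` harness is illustrative only):

```cpp
#include <cstddef>
#include <cstdio>

// copied from the patch: rounds offset up and len down to page boundaries
static void align_offset(size_t & offset, size_t & len, size_t page_size) {
    size_t offset_in_page = offset & (page_size - 1);
    size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
    offset += offset_to_page;

    if (offset_to_page >= len) {
        len = 0; // the range does not cover even one whole page
    } else {
        len -= offset_to_page;
        len -= len & (page_size - 1);
    }
}

int main() {
    size_t offset = 5000, len = 20000; // covers bytes [5000, 25000)
    align_offset(offset, len, 4096);
    printf("offset=%zu len=%zu\n", offset, len); // offset=8192 len=16384, i.e. [8192, 24576)
}
```

Note that the bit masks assume `page_size` is a power of two, which holds for the values returned by `sysconf(_SC_PAGESIZE)` and `GetSystemInfo`.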
@@ -849,6 +864,24 @@ struct llama_mmap {
         }
     }
 
+    void unmap(size_t offset, size_t len) {
+        int page_size = sysconf(_SC_PAGESIZE);
+        align_offset(offset, len, page_size);
+        if (len < (size_t) page_size) {
+            return;
+        }
+
+        void * next_page_start = (uint8_t *) addr + offset;
+        // unmap and discard the pages
+        if (munmap(next_page_start, len)) {
+            fprintf(stderr, "warning: munmap failed: %s\n", strerror(errno));
+        }
+        if (posix_madvise(next_page_start, len, POSIX_MADV_DONTNEED)) {
+            fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_DONTNEED) failed: %s\n",
+                    strerror(errno));
+        }
+    }
+
     ~llama_mmap() {
         munmap(addr, size);
     }
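`munmap` on a page-aligned sub-range is valid POSIX and punches a hole in the larger mapping, leaving the surrounding pages intact; once it succeeds, the follow-up `posix_madvise(POSIX_MADV_DONTNEED)` on the same (now unmapped) range will typically fail with `ENOMEM`, so its warning is best read as a fallback hint rather than an error. A minimal sketch of the hole-punching pattern on an anonymous mapping (illustrative only, not part of the patch):

```cpp
#include <sys/mman.h>
#include <cerrno>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    const size_t page = 4096;
    // map 4 pages, then release the middle 2 without touching the ends
    uint8_t * base = (uint8_t *) mmap(NULL, 4 * page, PROT_READ | PROT_WRITE,
                                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (base == MAP_FAILED) { perror("mmap"); return 1; }

    // punch a hole: the sub-range must be page-aligned, as align_offset guarantees
    if (munmap(base + page, 2 * page)) {
        fprintf(stderr, "warning: munmap failed: %s\n", strerror(errno));
    }

    base[0]        = 1; // first page is still mapped
    base[3 * page] = 1; // last page is still mapped
    munmap(base, page);
    munmap(base + 3 * page, page);
    return 0;
}
```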
@@ -898,6 +931,20 @@ struct llama_mmap {
         }
     }
 
+    void unmap(size_t offset, size_t len) {
+        SYSTEM_INFO si;
+        GetSystemInfo(&si);
+        DWORD page_size = si.dwAllocationGranularity;
+        align_offset(offset, len, page_size);
+
+        if (len < (size_t) page_size) {
+            return;
+        }
+
+        void * next_page_start = (uint8_t *) addr + offset;
+        VirtualAlloc(next_page_start, len, MEM_RESET, PAGE_NOACCESS);
+    }
+
     ~llama_mmap() {
         if (!UnmapViewOfFile(addr)) {
             fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
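On Windows a view created by `MapViewOfFile` cannot be partially unmapped (`UnmapViewOfFile` only accepts the view's base address), so the patch uses `VirtualAlloc` with `MEM_RESET` instead: it tells the memory manager the page contents are no longer of interest and may be discarded rather than written back. Per the `VirtualAlloc` documentation the protection argument is ignored with `MEM_RESET` but must still be a valid value, hence `PAGE_NOACCESS`; the patch also aligns to `dwAllocationGranularity` (typically 64 KiB) rather than `dwPageSize`, presumably as the conservative choice. A minimal sketch of `MEM_RESET` on an anonymous committed region (illustrative only):

```cpp
#include <windows.h>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    SYSTEM_INFO si;
    GetSystemInfo(&si);
    const SIZE_T page = si.dwPageSize;

    uint8_t * base = (uint8_t *) VirtualAlloc(NULL, 4 * page,
                                              MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
    if (!base) { return 1; }
    memset(base, 0xab, 4 * page);

    // MEM_RESET: contents of the range become undefined, but the address
    // range stays reserved and committed, unlike munmap on POSIX
    if (!VirtualAlloc(base + page, 2 * page, MEM_RESET, PAGE_NOACCESS)) {
        fprintf(stderr, "warning: VirtualAlloc(MEM_RESET) failed: %lu\n", GetLastError());
    }

    VirtualFree(base, 0, MEM_RELEASE);
    return 0;
}
```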
@@ -914,6 +961,13 @@ struct llama_mmap {
 
         throw std::runtime_error(std::string("mmap not supported"));
     }
+
+    void unmap(size_t offset, size_t len) {
+        (void) offset;
+        (void) len;
+
+        throw std::runtime_error(std::string("mmap not supported"));
+    }
 #endif
 };
 
@@ -2243,7 +2297,9 @@ struct llama_model_loader {
         return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
     }
 
-    void init_mapping(struct ggml_context * ctx) {
+    void init_mapping() {
+        /*
+        // prefetch only CPU tensors
         if (use_mmap) {
             size_t size_pref = 0; // prefetch
 
@@ -2256,6 +2312,9 @@ struct llama_model_loader {
             }
             mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, ggml_is_numa()));
         }
+        */
+        // prefetch the whole file - all the data is needed anyway
+        mapping.reset(new llama_mmap(&file, -1, ggml_is_numa()));
     }
 
     // for backwards compatibility only
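`init_mapping` no longer needs the `ggml_context` argument, and the old "prefetch only up to the last CPU tensor" logic is kept in a comment block: since the loader now reads every tensor once and unmaps the GPU-only ranges afterwards, prefetching the whole file is simpler. The `-1` wraps to `SIZE_MAX` for the `size_t` prefetch parameter, which the `llama_mmap` constructor presumably clamps to the file size. On POSIX such a prefetch typically amounts to an `madvise`-style read-ahead hint over the mapping; a minimal sketch of that pattern (illustrative, not the patch's constructor):

```cpp
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) { return 1; }
    int fd = open(argv[1], O_RDONLY);
    if (fd < 0) { perror("open"); return 1; }

    struct stat st;
    fstat(fd, &st);

    void * addr = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
    if (addr == MAP_FAILED) { perror("mmap"); return 1; }

    // hint the kernel to start reading the whole file into the page cache
    if (posix_madvise(addr, st.st_size, POSIX_MADV_WILLNEED)) {
        perror("posix_madvise");
    }

    munmap(addr, st.st_size);
    close(fd);
    return 0;
}
```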
@@ -2292,19 +2351,25 @@ struct llama_model_loader {
 
         std::vector<no_init<uint8_t>> read_buf;
 
-        size_t done_size = 0;
+        size_t size_done = 0;
+
+        size_t mmap_first = -1;
+        size_t mmap_last  = 0;
+
         for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
             struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
             GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
             const size_t offs = file_offset(ggml_get_name(cur));
 
             if (!legacy_offload || cur->backend == GGML_BACKEND_CPU) {
-                if (use_mmap) {
+                if (use_mmap && mapping) {
                     if (buf_mmap) {
                         ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
                         if (lmlock) {
                             lmlock->grow_to(offs + ggml_nbytes(cur));
                         }
+                        mmap_first = std::min(mmap_first, offs);
+                        mmap_last  = std::max(mmap_last,  offs + ggml_nbytes(cur));
                     } else {
                         ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
                     }
@@ -2323,7 +2388,7 @@ struct llama_model_loader {
                 // HACK: mark tensor as allocated
                 cur->data = (void *)(uintptr_t) 1;
                 void * data;
-                if (use_mmap) {
+                if (use_mmap && mapping) {
                     data = (uint8_t *) mapping->addr + offs;
                 } else {
                     read_buf.resize(ggml_nbytes(cur));
@@ -2343,14 +2408,22 @@ struct llama_model_loader {
 #endif
             }
 
-            done_size += ggml_nbytes(cur);
+            size_done += ggml_nbytes(cur);
 
             if (progress_callback) {
-                progress_callback((float) done_size / size_data, progress_callback_user_data);
+                progress_callback((float) size_done / size_data, progress_callback_user_data);
             }
         }
 
-        // TODO: unmap GPU tensors
+        // unmap GPU tensors
+        if (use_mmap && mapping) {
+            // unmap from 0 to mmap_first
+            printf("unmapping from 0 - %zu bytes\n", mmap_first);
+            mapping->unmap(0, mmap_first);
+            // unmap from mmap_last to end
+            printf("unmapping from %zu - %zu bytes\n", mmap_last, mapping->size - mmap_last);
+            mapping->unmap(mmap_last, mapping->size - mmap_last);
+        }
     }
 };
 
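The loop tracks the lowest and highest file offsets of tensors that stay backed by the mapping, so `[mmap_first, mmap_last)` is the only window that must remain resident; everything before and after it was read once to populate backend buffers and can be released, resolving the old `// TODO: unmap GPU tensors`. Initialising `mmap_first` to `-1` (`SIZE_MAX` for a `size_t`) makes the `std::min` over an empty set come out right. A small sketch of the bookkeeping with invented offsets (tensor layout hypothetical, for illustration only):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
    // hypothetical (offset, size) pairs of tensors kept in the mmap
    struct { size_t offs, size; } kept[] = { {4096, 1024}, {16384, 8192}, {65536, 512} };

    size_t mmap_first = (size_t) -1; // SIZE_MAX: identity element for min()
    size_t mmap_last  = 0;
    for (auto & t : kept) {
        mmap_first = std::min(mmap_first, t.offs);
        mmap_last  = std::max(mmap_last,  t.offs + t.size);
    }

    // everything outside [mmap_first, mmap_last) can be unmapped:
    // unmap(0, mmap_first) and unmap(mmap_last, file_size - mmap_last)
    printf("keep [%zu, %zu)\n", mmap_first, mmap_last); // keep [4096, 66048)
}
```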
@@ -3507,7 +3580,7 @@ static void llm_load_tensors(
 
     ml.done_getting_tensors();
 
-    ml.init_mapping(ctx);
+    ml.init_mapping();
 
     // allocate tensors
     size_t vram_weights = 0;