@@ -81,7 +81,17 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
81
81
var err error
82
82
var cpuRunner string
83
83
var estimate MemoryEstimate
84
- var systemMemory uint64
84
+ var systemTotalMemory uint64
85
+ var systemFreeMemory uint64
86
+
87
+ systemMemInfo , err := gpu .GetCPUMem ()
88
+ if err != nil {
89
+ slog .Error ("failed to lookup system memory" , "error" , err )
90
+ } else {
91
+ systemTotalMemory = systemMemInfo .TotalMemory
92
+ systemFreeMemory = systemMemInfo .FreeMemory
93
+ slog .Debug ("system memory" , "total" , format .HumanBytes2 (systemTotalMemory ), "free" , systemFreeMemory )
94
+ }
85
95
86
96
// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
87
97
if opts .NumGPU == 0 {
@@ -91,19 +101,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
91
101
cpuRunner = serverForCpu ()
92
102
estimate = EstimateGPULayers (gpus , ggml , projectors , opts )
93
103
} else {
94
- if gpus [0 ].Library == "metal" {
95
- memInfo , err := gpu .GetCPUMem ()
96
- if err != nil {
97
- slog .Error ("failed to lookup system memory" , "error" , err )
98
- } else {
99
- systemMemory = memInfo .TotalMemory
100
- slog .Debug ("system memory" , "total" , format .HumanBytes2 (systemMemory ))
101
- }
102
- }
103
104
estimate = EstimateGPULayers (gpus , ggml , projectors , opts )
104
105
105
106
switch {
106
- case gpus [0 ].Library == "metal" && estimate .VRAMSize > systemMemory :
107
+ case gpus [0 ].Library == "metal" && estimate .VRAMSize > systemTotalMemory :
107
108
// disable partial offloading when model is greater than total system memory as this
108
109
// can lead to locking up the system
109
110
opts .NumGPU = 0
@@ -211,7 +212,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
211
212
}
212
213
213
214
// Windows CUDA should not use mmap for best performance
214
- if (runtime .GOOS == "windows" && gpus [0 ].Library == "cuda" ) || opts .UseMMap == api .TriStateFalse {
215
+ // On Linux, when the model is larger than free system memory, mmap leads to thrashing, so disable it unless the user set it explicitly
216
+ if (runtime .GOOS == "windows" && gpus [0 ].Library == "cuda" && opts .UseMMap == api .TriStateUndefined ) ||
217
+ (runtime .GOOS == "linux" && systemFreeMemory < estimate .TotalSize && opts .UseMMap == api .TriStateUndefined ) ||
218
+ opts .UseMMap == api .TriStateFalse {
215
219
params = append (params , "--no-mmap" )
216
220
}
217
221
0 commit comments