Commit c7c2f3b

Merge pull request #5194 from dhiltgen/linux_mmap_auto

Refine mmap default logic on linux

2 parents: 54a79d6 + 5bf5aee


llm/server.go (1 file changed: 16 additions, 12 deletions)
```diff
@@ -81,7 +81,17 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	var err error
 	var cpuRunner string
 	var estimate MemoryEstimate
-	var systemMemory uint64
+	var systemTotalMemory uint64
+	var systemFreeMemory uint64
+
+	systemMemInfo, err := gpu.GetCPUMem()
+	if err != nil {
+		slog.Error("failed to lookup system memory", "error", err)
+	} else {
+		systemTotalMemory = systemMemInfo.TotalMemory
+		systemFreeMemory = systemMemInfo.FreeMemory
+		slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", systemFreeMemory)
+	}
 
 	// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
 	if opts.NumGPU == 0 {
```
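The first hunk hoists the system-memory probe out of the metal-only branch (removed in the next hunk) so it runs once, up front, and it now records free memory alongside total. Below is a minimal, self-contained sketch of the same probe-and-log pattern; `memInfo` and `getCPUMem` are illustrative stand-ins for ollama's internal `gpu.GetCPUMem`, backed here by Linux's sysinfo(2), so the sketch is Linux-only. It logs at Info instead of Debug so the default logger actually prints the result.

```go
package main

import (
	"log/slog"
	"syscall"
)

// memInfo mirrors the two fields the server reads from gpu.GetCPUMem.
type memInfo struct {
	TotalMemory uint64
	FreeMemory  uint64
}

// getCPUMem is a hypothetical stand-in for ollama's gpu.GetCPUMem,
// implemented with the Linux sysinfo(2) syscall.
func getCPUMem() (memInfo, error) {
	var si syscall.Sysinfo_t
	if err := syscall.Sysinfo(&si); err != nil {
		return memInfo{}, err
	}
	unit := uint64(si.Unit) // memory fields are reported in units of si.Unit bytes
	return memInfo{
		TotalMemory: uint64(si.Totalram) * unit,
		FreeMemory:  uint64(si.Freeram) * unit,
	}, nil
}

func main() {
	var systemTotalMemory, systemFreeMemory uint64
	mi, err := getCPUMem()
	if err != nil {
		// Probe failure is logged rather than fatal; both totals stay zero.
		slog.Error("failed to lookup system memory", "error", err)
	} else {
		systemTotalMemory = mi.TotalMemory
		systemFreeMemory = mi.FreeMemory
		slog.Info("system memory", "total", systemTotalMemory, "free", systemFreeMemory)
	}
}
```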
```diff
@@ -91,19 +101,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		cpuRunner = serverForCpu()
 		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
 	} else {
-		if gpus[0].Library == "metal" {
-			memInfo, err := gpu.GetCPUMem()
-			if err != nil {
-				slog.Error("failed to lookup system memory", "error", err)
-			} else {
-				systemMemory = memInfo.TotalMemory
-				slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
-			}
-		}
 		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
 
 		switch {
-		case gpus[0].Library == "metal" && estimate.VRAMSize > systemMemory:
+		case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
 			// disable partial offloading when model is greater than total system memory as this
 			// can lead to locking up the system
 			opts.NumGPU = 0
```
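The second hunk deletes the now-redundant metal-only probe and points the offload guard at the new `systemTotalMemory` variable. The guard itself is unchanged in spirit: on the metal backend GPU memory comes out of system RAM, so a model whose estimated VRAM footprint exceeds total memory is forced to a CPU-only run rather than risking the lockup the code comment warns about. A compact sketch of that decision, with illustrative names rather than ollama's actual types:

```go
package main

import "fmt"

// decideNumGPU sketches the guard in the switch above: it returns 0
// (no GPU offload) when a metal model cannot fit in unified memory,
// and otherwise leaves the requested layer count alone.
func decideNumGPU(library string, vramSize, systemTotalMemory uint64, requested int) int {
	if library == "metal" && vramSize > systemTotalMemory {
		// Partially offloading a model bigger than total system memory
		// can lock up the machine, so fall back to CPU entirely.
		return 0
	}
	return requested
}

func main() {
	const gib = 1 << 30
	fmt.Println(decideNumGPU("metal", 40*gib, 32*gib, 33)) // 0: does not fit
	fmt.Println(decideNumGPU("metal", 20*gib, 32*gib, 33)) // 33: fits, keep request
}
```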
```diff
@@ -211,7 +212,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	}
 
 	// Windows CUDA should not use mmap for best performance
-	if (runtime.GOOS == "windows" && gpus[0].Library == "cuda") || opts.UseMMap == api.TriStateFalse {
+	// Linux with a model larger than free space, mmap leads to thrashing
+	if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) ||
+		(runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) ||
+		opts.UseMMap == api.TriStateFalse {
 		params = append(params, "--no-mmap")
 	}
```
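The last hunk is the change named in the PR title. Previously `--no-mmap` was passed for Windows CUDA or an explicit `UseMMap == TriStateFalse`; now there are two OS heuristics (Windows with CUDA, and Linux when the model's total size exceeds free memory, where mmap-backed loading would thrash the page cache), and both apply only while `UseMMap` is still `TriStateUndefined`, so an explicit user choice always wins. A sketch of the resulting decision as one function; the `TriState` constants imitate ollama's `api` package:

```go
package main

import "fmt"

// TriState imitates the three-valued option in ollama's api package.
type TriState int

const (
	TriStateUndefined TriState = iota
	TriStateFalse
	TriStateTrue
)

// disableMMap reports whether the runner should get --no-mmap. Explicit
// user choices win; the per-OS heuristics fire only when the option is
// left undefined, mirroring the condition in the hunk above.
func disableMMap(goos, gpuLibrary string, freeMemory, modelSize uint64, useMMap TriState) bool {
	switch {
	case useMMap == TriStateFalse:
		return true // user explicitly disabled mmap
	case useMMap == TriStateTrue:
		return false // user explicitly requested mmap, skip heuristics
	case goos == "windows" && gpuLibrary == "cuda":
		return true // Windows CUDA performs better without mmap
	case goos == "linux" && freeMemory < modelSize:
		return true // model larger than free RAM: mmap would thrash
	default:
		return false
	}
}

func main() {
	const gib = 1 << 30
	fmt.Println(disableMMap("linux", "cuda", 8*gib, 16*gib, TriStateUndefined)) // true
	fmt.Println(disableMMap("linux", "cuda", 8*gib, 16*gib, TriStateTrue))     // false
}
```

Written this way, each future platform heuristic is one more `case`, and the two explicit-user cases at the top guarantee a heuristic can never override a user setting.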
