Skip to content

Commit bae0c8d

Browse files
dhiltgen authored and rick-github committed
discovery: fix cudart driver version (ollama#11614)
We prefer the nvcuda library, which reports driver versions. When we dropped CUDA v11, we added a safety check for too-old drivers. What we missed was that the cudart fallback discovery logic didn't have the driver version wired up. This fixes cudart discovery to expose the driver version as well, so we no longer reject all GPUs if nvcuda didn't work.
1 parent 9d93927 commit bae0c8d

File tree

3 files changed

+7
-11
lines changed

3 files changed

+7
-11
lines changed

discover/gpu.go

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -263,6 +263,8 @@ func GetGPUInfo() GpuInfoList {
263263
var driverMinor int
264264
if cHandles.cudart != nil {
265265
C.cudart_bootstrap(*cHandles.cudart, C.int(i), &memInfo)
266+
driverMajor = int(cHandles.cudart.driver_major)
267+
driverMinor = int(cHandles.cudart.driver_minor)
266268
} else {
267269
C.nvcuda_bootstrap(*cHandles.nvcuda, C.int(i), &memInfo)
268270
driverMajor = int(cHandles.nvcuda.driver_major)

discover/gpu_info_cudart.c

Lines changed: 3 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -69,18 +69,15 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
6969
}
7070

7171
int version = 0;
72-
cudartDriverVersion_t driverVersion;
73-
driverVersion.major = 0;
74-
driverVersion.minor = 0;
7572

7673
// Report driver version if we're in verbose mode, ignore errors
7774
ret = (*resp->ch.cudaDriverGetVersion)(&version);
7875
if (ret != CUDART_SUCCESS) {
7976
LOG(resp->ch.verbose, "cudaDriverGetVersion failed: %d\n", ret);
8077
} else {
81-
driverVersion.major = version / 1000;
82-
driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
83-
LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
78+
resp->ch.driver_major = version / 1000;
79+
resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
80+
LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", resp->ch.driver_major, resp->ch.driver_minor);
8481
}
8582

8683
ret = (*resp->ch.cudaGetDeviceCount)(&resp->num_devices);

discover/gpu_info_cudart.h

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -29,11 +29,6 @@ typedef struct cudartMemory_st {
2929
size_t used;
3030
} cudartMemory_t;
3131

32-
typedef struct cudartDriverVersion {
33-
int major;
34-
int minor;
35-
} cudartDriverVersion_t;
36-
3732
typedef struct cudaUUID {
3833
unsigned char bytes[16];
3934
} cudaUUID_t;
@@ -123,6 +118,8 @@ typedef struct cudaDeviceProp {
123118
typedef struct cudart_handle {
124119
void *handle;
125120
uint16_t verbose;
121+
int driver_major;
122+
int driver_minor;
126123
cudartReturn_t (*cudaSetDevice)(int device);
127124
cudartReturn_t (*cudaDeviceSynchronize)(void);
128125
cudartReturn_t (*cudaDeviceReset)(void);

0 commit comments

Comments
 (0)