Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ func rootSteps() []pkg.Step {
pkg.CleanDisksStep,
pkg.SetupMultipathStep,
pkg.UpdateModprobeStep,
pkg.VerifyAmdgpuDriverStep,
pkg.PrepareLonghornDisksStep,
pkg.PrepareRKE2Step,
pkg.GenerateNodeLabelsStep,
Expand Down
164 changes: 152 additions & 12 deletions pkg/os-setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -399,22 +399,162 @@ func setupMultipath() error {
return nil
}

func updateModprobe() error {
cmd := exec.Command("sh", "-c", "sudo sed -i '/^blacklist amdgpu/s/^/# /' /etc/modprobe.d/*.conf")
output, err := cmd.Output()
// checkAmdgpuBlacklist reports whether the amdgpu kernel module is currently
// blacklisted, either on the running kernel's command line or by an active
// (uncommented) "blacklist amdgpu" entry under /etc/modprobe.d/.
//
// Every location that carries a blacklist is logged as a warning. The returned
// error is non-nil only when /proc/cmdline cannot be read or the modprobe.d
// scan cannot be run; in that case the boolean is false and carries no meaning.
func checkAmdgpuBlacklist() (bool, error) {
	// /proc/cmdline is the command line of the running kernel, so a blacklist
	// found here stays in effect until the next boot.
	cmdlineOutput, err := exec.Command("cat", "/proc/cmdline").CombinedOutput()
	if err != nil {
		return false, fmt.Errorf("failed to read /proc/cmdline: %w", err)
	}
	hasKernelBlacklist := strings.Contains(string(cmdlineOutput), "modprobe.blacklist=amdgpu")

	// "|| true" turns grep's "no match" exit status into success, so an empty
	// output means there is no active entry and a non-nil error means the shell
	// itself could not be run.
	modprobeCmd := exec.Command("sh", "-c", "grep -r '^[[:space:]]*blacklist[[:space:]]*amdgpu' /etc/modprobe.d/ 2>/dev/null || true")
	modprobeOutput, err := modprobeCmd.CombinedOutput()
	if err != nil {
		return false, fmt.Errorf("failed to scan /etc/modprobe.d: %w", err)
	}
	hasModprobeBlacklist := len(modprobeOutput) > 0

	if hasKernelBlacklist {
		LogMessage(Warn, "amdgpu is blacklisted in ACTIVE kernel command line (reboot required to fix)")
	}
	if hasModprobeBlacklist {
		LogMessage(Warn, fmt.Sprintf("amdgpu is blacklisted in modprobe.d:\n%s", string(modprobeOutput)))
	}

	return hasKernelBlacklist || hasModprobeBlacklist, nil
}

// removeAmdgpuBlacklist removes amdgpu blacklist from modprobe.d and GRUB configuration
// Returns true if reboot is required, and any error encountered
func removeAmdgpuBlacklist() (bool, error) {
LogMessage(Info, "Starting amdgpu blacklist removal process...")
needsReboot := false

// Step 1: Check current kernel command line for blacklist
checkCmdlineCmd := exec.Command("sh", "-c", "cat /proc/cmdline | grep -o 'modprobe.blacklist=amdgpu' || true")
cmdlineOutput, _ := checkCmdlineCmd.CombinedOutput()
if len(cmdlineOutput) > 0 {
LogMessage(Warn, "Found 'modprobe.blacklist=amdgpu' in ACTIVE kernel command line")
needsReboot = true
}

// Step 2: Remove amdgpu blacklist from modprobe.d
LogMessage(Info, "Removing amdgpu blacklist from /etc/modprobe.d/...")

sedCmd := exec.Command("sh", "-c", "sed -i '/^[[:space:]]*blacklist[[:space:]]*amdgpu/s/^/# /' /etc/modprobe.d/*.conf 2>/dev/null || true")
if output, err := sedCmd.CombinedOutput(); err != nil {
LogMessage(Warn, fmt.Sprintf("sed command output: %s", string(output)))
}

// Step 3: Update GRUB configuration
LogMessage(Info, "Updating GRUB configuration to remove amdgpu blacklist...")

// Backup original GRUB config
backupCmd := exec.Command("cp", "/etc/default/grub", "/etc/default/grub.backup")
if err := backupCmd.Run(); err != nil {
LogMessage(Warn, fmt.Sprintf("Failed to backup GRUB config: %v", err))
} else {
LogMessage(Info, "")
LogMessage(Info, "Backed up /etc/default/grub to /etc/default/grub.backup")
}
cmd = exec.Command("modprobe", "amdgpu")
output, err = cmd.Output()
if err != nil {
LogMessage(Warn, fmt.Sprintf("Modprobe amdgpu returned: %s", output))
return fmt.Errorf("failed to modprobe amdgpu: %w", err)

// Check if GRUB config has the blacklist
checkGrubCmd := exec.Command("sh", "-c", "grep 'modprobe.blacklist=amdgpu' /etc/default/grub || true")
checkGrubOutput, _ := checkGrubCmd.CombinedOutput()
if len(checkGrubOutput) > 0 {
needsReboot = true
LogMessage(Info, "Found amdgpu blacklist in GRUB config, removing...")
}

// Remove modprobe.blacklist=amdgpu from GRUB
grubSedCmd := exec.Command("sh", "-c", `sed -i 's/modprobe\.blacklist=amdgpu[[:space:]]*//g' /etc/default/grub`)
if output, err := grubSedCmd.CombinedOutput(); err != nil {
LogMessage(Error, fmt.Sprintf("Failed to update GRUB config: %s", string(output)))
return needsReboot, fmt.Errorf("failed to update GRUB configuration: %w", err)
}
LogMessage(Info, "Successfully updated /etc/default/grub")

// Step 4: Verify GRUB changes
verifyGrubCmd := exec.Command("sh", "-c", "grep -E 'GRUB_CMDLINE_LINUX' /etc/default/grub")
if verifyOutput, err := verifyGrubCmd.CombinedOutput(); err == nil {
LogMessage(Info, fmt.Sprintf("Updated GRUB config:\n%s", string(verifyOutput)))
}

// Step 5: Update GRUB
LogMessage(Info, "Running update-grub...")
updateGrubCmd := exec.Command("update-grub")
if output, err := updateGrubCmd.CombinedOutput(); err != nil {
LogMessage(Error, fmt.Sprintf("update-grub failed: %s", string(output)))
return needsReboot, fmt.Errorf("failed to run update-grub: %w", err)
}
LogMessage(Info, "Successfully ran update-grub")

// Step 6: Verify no active blacklist in config files
verifyCmd := exec.Command("sh", "-c", "grep -r '^[[:space:]]*blacklist[[:space:]]*amdgpu' /etc/modprobe.d/ 2>/dev/null || true")
verifyOutput, _ := verifyCmd.CombinedOutput()
if len(verifyOutput) > 0 {
LogMessage(Warn, fmt.Sprintf("WARNING: Still found uncommented blacklist entries:\n%s", string(verifyOutput)))
return needsReboot, fmt.Errorf("blacklist entries still present after cleanup")
} else {
LogMessage(Info, "")
LogMessage(Info, "Verified: No active amdgpu blacklist entries in /etc/modprobe.d/")
}

LogMessage(Info, "Successfully removed amdgpu blacklist from modprobe.d and GRUB")
// Step 7: Try to load the amdgpu module if not already loaded
if !needsReboot {
LogMessage(Info, "Attempting to load amdgpu kernel module...")
lsmodCmd := exec.Command("sh", "-c", "lsmod | grep '^amdgpu' || true")
lsmodOutput, _ := lsmodCmd.CombinedOutput()
if len(lsmodOutput) == 0 {
modprobeCmd := exec.Command("modprobe", "amdgpu")
if output, err := modprobeCmd.CombinedOutput(); err != nil {
LogMessage(Warn, fmt.Sprintf("Failed to load amdgpu module: %s", string(output)))
LogMessage(Warn, "A system reboot may be required")
needsReboot = true
} else {
LogMessage(Info, "Successfully loaded amdgpu kernel module")
}
} else {
LogMessage(Info, "amdgpu kernel module already loaded")
}
}

return needsReboot, nil
}

// verifyAmdgpuDriverBinding checks that the amdgpu driver is usable on this
// node. It verifies, in order, that
//
//   - the amdgpu kernel module is loaded (lsmod),
//   - every "Processing accelerators" device with PCI ID 1002:75a3 reported by
//     lspci has amdgpu as its kernel driver in use, and
//   - at least one /dev/dri/renderD* render node exists.
//
// It returns nil when all checks pass and an error describing the first failed
// check otherwise.
func verifyAmdgpuDriverBinding() error {
	LogMessage(Info, "Verifying amdgpu driver binding...")

	// Output (stdout only) is used so that an lsmod error message on stderr is
	// not mistaken for a loaded module. Error ignored: "|| true" forces exit 0.
	lsmodOutput, _ := exec.Command("sh", "-c", "lsmod | grep '^amdgpu' || true").Output()
	if len(lsmodOutput) == 0 {
		LogMessage(Error, "amdgpu module is NOT loaded")
		return fmt.Errorf("amdgpu module not loaded")
	}
	LogMessage(Info, "amdgpu module is loaded")

	// Error ignored: grep exits non-zero when no device matches, which the
	// counts below already treat as a failure. stderr is kept for the log.
	lspciOutput, _ := exec.Command("sh", "-c", "lspci -nnk -d 1002:75a3 | grep -A 3 'Processing accelerators'").CombinedOutput()
	report := string(lspciOutput)

	// Compare the number of detected devices with the number of devices bound
	// to amdgpu, so that a partially bound node does not pass verification.
	gpuCount := strings.Count(report, "Processing accelerators")
	boundCount := strings.Count(report, "Kernel driver in use: amdgpu")
	if boundCount == 0 || boundCount < gpuCount {
		LogMessage(Error, "MI355X GPUs are NOT bound to amdgpu driver")
		LogMessage(Info, fmt.Sprintf("lspci output:\n%s", report))
		return fmt.Errorf("GPUs not bound to amdgpu driver (%d of %d bound)", boundCount, gpuCount)
	}
	LogMessage(Info, "MI355X GPUs are bound to amdgpu driver")

	// Error ignored: "|| true" forces exit 0; an empty output means no node.
	renderNodesOutput, _ := exec.Command("sh", "-c", "ls -la /dev/dri/renderD* 2>/dev/null || true").CombinedOutput()
	if len(renderNodesOutput) == 0 {
		LogMessage(Warn, "No render nodes found in /dev/dri/")
		return fmt.Errorf("no render nodes found")
	}
	LogMessage(Info, fmt.Sprintf("Render nodes found:\n%s", string(renderNodesOutput)))

	LogMessage(Info, "Successfully verified amdgpu driver binding")
	return nil
}
58 changes: 48 additions & 10 deletions pkg/steps.go
Original file line number Diff line number Diff line change
Expand Up @@ -302,23 +302,61 @@ var SetupMultipathStep = Step{

// UpdateModprobeStep removes the amdgpu blacklist from modprobe.d and the GRUB
// configuration by calling removeAmdgpuBlacklist. It is skipped when GPU_NODE
// is not set, and it reports through StepResult.RebootRequired whether the
// change needs a reboot.
var UpdateModprobeStep = Step{
	Id:          "UpdateModprobeStep",
	Name:        "Remove AMD GPU Blacklist",
	Description: "Remove amdgpu blacklist from modprobe.d and GRUB configuration",
	Skip: func() bool {
		if !viper.GetBool("GPU_NODE") {
			LogMessage(Info, "Skipping amdgpu blacklist removal for non-GPU node")
			return true
		}
		return false
	},
	Action: func() StepResult {
		needsReboot, err := removeAmdgpuBlacklist()
		if err != nil {
			return StepResult{
				Error: fmt.Errorf("amdgpu blacklist removal failed: %w", err),
			}
		}
		return StepResult{
			Error:          nil,
			RebootRequired: needsReboot,
			Message:        "AMD GPU blacklist removed from modprobe.d and GRUB",
		}
	},
}

// VerifyAmdgpuDriverStep runs verifyAmdgpuDriverBinding on GPU nodes and turns
// its outcome into a StepResult. Failures whose error text contains
// "not loaded" or "not bound" are flagged as likely needing a reboot.
var VerifyAmdgpuDriverStep = Step{
	Id:          "VerifyAmdgpuDriverStep",
	Name:        "Verify AMD GPU Driver",
	Description: "Verify amdgpu driver is loaded and GPUs are properly bound",
	Skip: func() bool {
		if viper.GetBool("GPU_NODE") {
			return false
		}
		LogMessage(Info, "Skipping AMD GPU driver verification for non-GPU node")
		return true
	},
	Action: func() StepResult {
		verifyErr := verifyAmdgpuDriverBinding()
		if verifyErr == nil {
			return StepResult{
				Error:   nil,
				Message: "AMD GPU driver verified and operational",
			}
		}
		LogMessage(Error, fmt.Sprintf("AMD GPU driver verification failed: %v", verifyErr))

		// A module that is not loaded or devices that are not bound are
		// treated as a likely reboot requirement.
		reason := verifyErr.Error()
		likelyNeedsReboot := strings.Contains(reason, "not loaded") || strings.Contains(reason, "not bound")
		if !likelyNeedsReboot {
			return StepResult{
				Error:   fmt.Errorf("AMD GPU driver verification failed: %w", verifyErr),
				Message: "GPU driver verification failed",
			}
		}
		return StepResult{
			Error:          fmt.Errorf("AMD GPU driver not accessible - %w", verifyErr),
			RebootRequired: true,
			Message:        "GPU driver verification failed - reboot recommended",
		}
	},
}

Expand Down Expand Up @@ -839,12 +877,12 @@ data:
// Get certificate paths from PrepareRKE2Step
tlsCertPath := viper.GetString("RUNTIME_TLS_CERT")
tlsKeyPath := viper.GetString("RUNTIME_TLS_KEY")

if tlsCertPath == "" || tlsKeyPath == "" {
LogMessage(Error, "Certificate paths not found - PrepareRKE2 may have failed")
return StepResult{Error: fmt.Errorf("certificate paths not found - PrepareRKE2 may have failed")}
}

// Verify files still exist
if _, err := os.Stat(tlsCertPath); os.IsNotExist(err) {
LogMessage(Error, fmt.Sprintf("Certificate file missing: %s", tlsCertPath))
Expand All @@ -857,7 +895,7 @@ data:

// Create ClusterRoleBindings for OIDC authorization
LogMessage(Info, "Creating OIDC ClusterRoleBindings")

clusterRoleBindingFile, err := os.CreateTemp("", "cluster-role-binding-*.yaml")
if err != nil {
LogMessage(Error, fmt.Sprintf("Failed to create temporary ClusterRoleBinding file: %v", err))
Expand Down Expand Up @@ -948,7 +986,7 @@ metadata:
LogMessage(Info, "Successfully created TLS secret")
return StepResult{Message: "Domain ConfigMap and TLS secret created successfully"}
}

return StepResult{Message: "Domain ConfigMap created successfully"}
},
}
Expand Down
20 changes: 18 additions & 2 deletions pkg/view.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,21 @@ func RunStepsWithUI(steps []Step) error {
if result.Message != "" {
monitor.AddLog("INFO", fmt.Sprintf("Message: %s", result.Message), step.Id)
}
if result.RebootRequired {
monitor.AddLog("WARNING", "═══════════════════════════════════════════════════════", step.Id)
monitor.AddLog("WARNING", "⚠️ SYSTEM REBOOT REQUIRED", step.Id)
monitor.AddLog("WARNING", "═══════════════════════════════════════════════════════", step.Id)
monitor.AddLog("WARNING", "The system configuration has been updated, but a reboot", step.Id)
monitor.AddLog("WARNING", "is required for changes to take effect.", step.Id)
monitor.AddLog("WARNING", "", step.Id)
monitor.AddLog("WARNING", "Please run: sudo reboot", step.Id)
monitor.AddLog("WARNING", "", step.Id)
monitor.AddLog("WARNING", "Then re-run cluster-bloom to continue setup.", step.Id)
monitor.AddLog("WARNING", "═══════════════════════════════════════════════════════", step.Id)
finalErr = fmt.Errorf("system reboot required - please reboot and re-run")
monitor.CompleteStep(step.Id, finalErr)
break
}
monitor.AddLog("INFO", fmt.Sprintf("Completed in %v", duration.Round(time.Millisecond)), step.Id)
monitor.CompleteStep(step.Id, nil)
}
Expand Down Expand Up @@ -696,6 +711,7 @@ const (
)

// StepResult is the outcome a step's Action returns to the step runner.
type StepResult struct {
	// Error is nil when the step succeeded.
	Error error
	// Message is an optional summary; RunStepsWithUI logs it when non-empty.
	Message string
	// RebootRequired reports that the step changed configuration that only
	// takes effect after a reboot; RunStepsWithUI stops the run and asks the
	// operator to reboot and re-run when it is set.
	RebootRequired bool
}
Loading