105 changes: 96 additions & 9 deletions lpm_frontend/src/components/ModelStatus/index.tsx
@@ -1,14 +1,16 @@
import { Status, statusRankMap, useTrainingStore } from '@/store/useTrainingStore';
import { startService, stopService } from '@/service/train';
import { startService, stopService, checkCudaAvailability } from '@/service/train';
import { StatusBar } from '../StatusBar';
import { useRef, useEffect, useState, useMemo } from 'react';
import { message } from 'antd';
import { message, Modal, Switch, Tooltip } from 'antd';
import {
CloudUploadOutlined,
CheckCircleOutlined,
PlayCircleOutlined,
PauseCircleOutlined,
LoadingOutlined
LoadingOutlined,
ThunderboltOutlined,
RocketOutlined
} from '@ant-design/icons';
import RegisterUploadModal from '../upload/RegisterUploadModal';

@@ -34,6 +36,9 @@ export function ModelStatus() {
const isTraining = useTrainingStore((state) => state.isTraining);

const [messageApi, contextHolder] = message.useMessage();
const [useGpu, setUseGpu] = useState(true);
const [cudaAvailable, setCudaAvailable] = useState(false);
const [showStartModal, setShowStartModal] = useState(false);

const loadInfo = useLoadInfoStore((state) => state.loadInfo);
const isRegistered = useMemo(() => {
@@ -43,6 +48,25 @@
const [showRegisterModal, setShowRegisterModal] = useState(false);
const [showtrainingModal, setShowtrainingModal] = useState(false);

useEffect(() => {
// Check if CUDA is available
checkCudaAvailability().then(res => {
if (res.data.code === 0) {
const isCudaAvailable = res.data.data.cuda_available;
setCudaAvailable(isCudaAvailable);

// If CUDA is not available, default to CPU
if (!isCudaAvailable && useGpu) {
setUseGpu(false);
}
}
}).catch(error => {
console.error('Error checking CUDA availability:', error);
// Default to CPU if error checking CUDA
setUseGpu(false);
});
}, []);

const handleRegistryClick = () => {
if (!serviceStarted) {
messageApi.info({
@@ -137,15 +161,18 @@

if (!config.model_name) {
message.error('Please train a base model first');

return;
}

setServiceStarting(true);
startService({ model_name: config.model_name })
startService({
model_name: config.model_name,
use_gpu: useGpu
})
.then((res) => {
if (res.data.code === 0) {
messageApi.success({ content: 'Service starting...', duration: 1 });
const modeText = useGpu ? 'GPU acceleration' : 'CPU-only mode';
messageApi.success({ content: `Service starting with ${modeText}...`, duration: 2 });
startPolling();
} else {
setServiceStarting(false);
@@ -190,11 +217,11 @@
} else {
if (isTraining) {
setShowtrainingModal(true);

return;
}

handleStartService();
// Show the start modal with GPU/CPU selection
setShowStartModal(true);
}
};

@@ -260,11 +287,71 @@
</div>
</div>

{/* Modal for selecting GPU/CPU mode */}
<Modal
title="Start Service"
open={showStartModal}
onOk={() => {
setShowStartModal(false);
handleStartService();
}}
onCancel={() => setShowStartModal(false)}
okText="Start"
cancelText="Cancel"
>
<div className="py-4">
<div className="mb-4">
<p>Choose the inference mode for your model:</p>
</div>

<div className="flex items-center justify-between mb-6 bg-gray-50 p-4 rounded-lg">
<div className="flex items-center">
{useGpu ? (
<ThunderboltOutlined style={{ fontSize: '24px', color: '#1890ff', marginRight: '12px' }} />
) : (
<RocketOutlined style={{ fontSize: '24px', color: '#52c41a', marginRight: '12px' }} />
)}
<div>
<div className="font-medium">{useGpu ? 'GPU Acceleration' : 'CPU Mode'}</div>
<div className="text-sm text-gray-500">
{useGpu
? 'Faster inference but requires compatible NVIDIA GPU'
: 'Compatible with all systems, but slower inference'}
</div>
</div>
</div>

<Tooltip title={!cudaAvailable && useGpu ? "CUDA GPU not available on this system" : ""}>
<Switch
checked={useGpu}
onChange={setUseGpu}
disabled={!cudaAvailable && useGpu}
checkedChildren="GPU"
unCheckedChildren="CPU"
/>
</Tooltip>
</div>

{!cudaAvailable && (
<div className="text-amber-500 text-sm mb-2">
<p>
No CUDA-compatible GPU detected. Running in CPU-only mode.
</p>
</div>
)}

<p className="text-gray-500 text-sm">
GPU acceleration requires a compatible NVIDIA graphics card with CUDA support.
CPU mode works on all systems but may be slower.
</p>
</div>
</Modal>

<RegisterUploadModal onClose={() => setShowRegisterModal(false)} open={showRegisterModal} />
<TrainingTipModal
confirm={() => {
handleStartService();
setShowtrainingModal(false);
setShowStartModal(true);
}}
onClose={() => setShowtrainingModal(false)}
open={showtrainingModal}
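
The component above imports `checkCudaAvailability` from `@/service/train` and reads `res.data.data.cuda_available`, but the backend endpoint that call hits is not part of this diff. Below is a minimal Flask-style sketch of what such an endpoint could look like; the blueprint name and route path are assumptions, and only the response envelope mirrors what the frontend expects.

# Hypothetical CUDA-availability endpoint (not from this PR).
# The blueprint and route path are assumptions; the response shape
# {"code": 0, "data": {"cuda_available": ...}} matches what the
# frontend reads as res.data.data.cuda_available.
from flask import Blueprint, jsonify
import torch

cuda_bp = Blueprint("cuda", __name__)


@cuda_bp.route("/api/kernel/cuda/available", methods=["GET"])
def cuda_available():
    try:
        available = torch.cuda.is_available()
    except Exception:
        # Any torch/driver error is treated as "no CUDA", which matches the
        # frontend falling back to CPU mode when the check fails.
        available = False

    return jsonify({"code": 0, "data": {"cuda_available": available}})
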
3 changes: 1 addition & 2 deletions lpm_kernel/api/domains/trainprocess/routes.py
@@ -288,7 +288,6 @@ def retrain():
data_synthesis_mode: Mode for data synthesis (optional)
use_cuda: Whether to use CUDA for training (optional)
is_cot: Whether to use Chain of Thought (optional)
use_previous_params: Whether to use previous training parameters (optional, default True)

Returns:
Response: JSON response
@@ -318,7 +317,7 @@ def retrain():
is_cot = data.get("is_cot", None)

# Log the received parameters
logger.info(f"Retrain parameters: model_name={model_name}, learning_rate={learning_rate}, number_of_epochs={number_of_epochs}, concurrency_threads={concurrency_threads}, data_synthesis_mode={data_synthesis_mode}, use_cuda={use_cuda}, is_cot={is_cot}, use_previous_params={use_previous_params}")
logger.info(f"Retrain parameters: model_name={model_name}, learning_rate={learning_rate}, number_of_epochs={number_of_epochs}, concurrency_threads={concurrency_threads}, data_synthesis_mode={data_synthesis_mode}, use_cuda={use_cuda}, is_cot={is_cot}")

# Create training service instance
train_service = TrainProcessService(current_model_name=model_name)
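
The route above pulls its parameters from the request JSON; for reference, a minimal client-side sketch of a retrain call using the fields named in the docstring and log statement. The base URL, route path, and every value shown are placeholders, not taken from this PR.

# Hypothetical retrain request (illustrative values only).
# Only the field names come from the route's docstring and log line;
# the URL and values are assumptions.
import requests

payload = {
    "model_name": "Qwen2.5-0.5B-Instruct",  # placeholder model name
    "learning_rate": 1e-4,
    "number_of_epochs": 3,
    "concurrency_threads": 2,
    "data_synthesis_mode": "low",
    "use_cuda": True,
    "is_cot": False,
}

resp = requests.post(
    "http://localhost:8002/api/trainprocess/retrain",  # assumed mount point
    json=payload,
    timeout=30,
)
print(resp.status_code, resp.json())
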
80 changes: 26 additions & 54 deletions lpm_kernel/api/services/local_llm_service.py
@@ -59,7 +59,6 @@ def start_server(self, model_path: str, use_gpu: bool = True) -> bool:

# Check for CUDA availability if GPU was requested
cuda_available = torch.cuda.is_available() if use_gpu else False
cuda_available = False
gpu_info = ""

if use_gpu and cuda_available:
@@ -81,7 +80,6 @@ def start_server(self, model_path: str, use_gpu: bool = True) -> bool:
logger.info("Using CPU for inference (GPU not requested)")

# Check for GPU optimization marker
gpu_optimized = False
model_dir = os.path.dirname(model_path)
gpu_marker_path = os.path.join(model_dir, "gpu_optimized.json")
if os.path.exists(gpu_marker_path):
@@ -118,77 +116,51 @@ def start_server(self, model_path: str, use_gpu: bool = True) -> bool:
"--cont-batching" # Enable continuous batching
]

# Set up environment with CUDA variables to ensure GPU detection
env = os.environ.copy()
# Default: do not expose GPU
env["CUDA_VISIBLE_DEVICES"] = ""
# Add GPU-related parameters if CUDA is available
if cuda_available and use_gpu:
# Force GPU usage with optimal parameters for faster loads

if use_gpu and cuda_available:
# --- GPU/CUDA setup ---
# Add GPU-specific llama.cpp arguments
cmd.extend([
"--n-gpu-layers", "999", # Use all layers on GPU
"--tensor-split", "0", # Use the first GPU for all operations
"--main-gpu", "0", # Use GPU 0 as the primary device
"--mlock" # Lock memory to prevent swapping during inference
"--flash-attn"
])

# Set CUDA environment variables to help with GPU detection
env["CUDA_VISIBLE_DEVICES"] = "0" # Force using first GPU

# Ensure comprehensive library paths for CUDA
cuda_lib_paths = [
"/usr/local/cuda/lib64",
"/usr/lib/cuda/lib64",
"/usr/local/lib",
"/usr/lib/x86_64-linux-gnu",
"/usr/lib/wsl/lib" # For Windows WSL environments
]

# Build a comprehensive LD_LIBRARY_PATH
current_ld_path = env.get("LD_LIBRARY_PATH", "")
for path in cuda_lib_paths:
if os.path.exists(path) and path not in current_ld_path:
current_ld_path = f"{path}:{current_ld_path}" if current_ld_path else path

env["LD_LIBRARY_PATH"] = current_ld_path
logger.info(f"Setting LD_LIBRARY_PATH to: {current_ld_path}")

# If this is Windows, use different approach for CUDA libraries
if os.name == 'nt':
# Windows typically has CUDA in PATH already if installed
logger.info("Windows system detected, using system CUDA libraries")
else:
# On Linux, try to find CUDA libraries in common locations
for cuda_path in [
# Common CUDA paths
# Set CUDA environment variables
env["CUDA_VISIBLE_DEVICES"] = "0" # Use first GPU

# Set up LD_LIBRARY_PATH for CUDA (Linux/WSL only)
if os.name != 'nt':
cuda_lib_paths = [
"/usr/local/cuda/lib64",
"/usr/lib/cuda/lib64",
"/usr/local/lib/python3.12/site-packages/nvidia/cuda_runtime/lib",
"/usr/local/lib/python3.10/site-packages/nvidia/cuda_runtime/lib",
]:
if os.path.exists(cuda_path):
# Add CUDA path to library path
env["LD_LIBRARY_PATH"] = f"{cuda_path}:{env.get('LD_LIBRARY_PATH', '')}"
env["CUDA_HOME"] = os.path.dirname(cuda_path)
logger.info(f"Found CUDA at {cuda_path}, setting environment variables")
break

# NOTE: CUDA support and rebuild should be handled at build/setup time (e.g., Docker build or setup script).
# The runtime check and rebuild logic has been removed for efficiency and reliability.
# Ensure llama.cpp is built with CUDA support before running the server if GPU is required.
"/usr/local/lib",
"/usr/lib/x86_64-linux-gnu",
"/usr/lib/wsl/lib"
]
current_ld_path = env.get("LD_LIBRARY_PATH", "")
for path in cuda_lib_paths:
if os.path.exists(path) and path not in current_ld_path:
current_ld_path = f"{path}:{current_ld_path}" if current_ld_path else path
env["LD_LIBRARY_PATH"] = current_ld_path
logger.info(f"Setting LD_LIBRARY_PATH to: {current_ld_path}")
else:
logger.info("Windows system detected, using system CUDA libraries")

# Pre-heat GPU to ensure faster initial response
# Pre-heat GPU for faster initial response
if torch.cuda.is_available():
logger.info("Pre-warming GPU to reduce initial latency...")
dummy_tensor = torch.zeros(1, 1).cuda()
del dummy_tensor
torch.cuda.synchronize()
torch.cuda.empty_cache()
logger.info("GPU warm-up complete")

logger.info("Using GPU acceleration for inference with optimized settings")
else:
# If GPU isn't available or supported, optimize for CPU
# --- CPU setup ---
cmd.extend([
"--threads", str(max(1, os.cpu_count() - 1)), # Use all CPU cores except one
])
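
End to end, the `use_gpu` flag chosen in the frontend modal has to reach `start_server()` above. The route that does that wiring is not shown in this diff; a minimal sketch of such a handler follows, assuming a Flask route, a module-level `local_llm_service` singleton, and an illustrative GGUF path. None of these names are taken from the PR.

# Hypothetical start-service route wiring use_gpu into start_server().
# The route path, singleton import, and model_path resolution are assumptions;
# only the model_name/use_gpu request fields and the start_server(model_path,
# use_gpu=...) signature come from the diffs above.
from flask import Blueprint, jsonify, request

# Assumed module-level service instance exported by local_llm_service.py.
from lpm_kernel.api.services.local_llm_service import local_llm_service

service_bp = Blueprint("service", __name__)


@service_bp.route("/api/kernel/service/start", methods=["POST"])
def start_service():
    data = request.get_json() or {}
    model_name = data.get("model_name")
    use_gpu = bool(data.get("use_gpu", True))  # default mirrors the frontend's initial state

    if not model_name:
        return jsonify({"code": 1, "message": "model_name is required"}), 400

    # Resolve the served GGUF file for the trained model (location is illustrative).
    model_path = f"resources/model/output/gguf/{model_name}.gguf"

    ok = local_llm_service.start_server(model_path, use_gpu=use_gpu)

    return jsonify({"code": 0 if ok else 1, "data": {"use_gpu": use_gpu}})
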