deepjavalibrary · frankfliu · Dec 9, 2023 · Dec 9, 2023
@@ -235,7 +235,6 @@ public void load(Path modelPath, String prefix, Map<String, ?> options) throws I
             }
             mpiWorkers = Integer.parseInt(getProperty("gpu.maxWorkers"));
 
-            properties.put("mpi_mode", "true");
             properties.forEach((k, v) -> pyEnv.addParameter(k, v));
 
             createAllPyProcesses(mpiWorkers, partitions);

@@ -69,8 +69,9 @@ static String inferLmiEngine(ModelInfo<?, ?> modelInfo) throws ModelException {
         Properties prop = modelInfo.prop;
         HuggingFaceModelConfig modelConfig = getHuggingFaceModelConfig(modelInfo);
         if (modelConfig == null) {
-            logger.info("No config.json found, use Python engine.");
-            return "Python";
+            String engineName = isTrtLLM(modelInfo) ? "MPI" : "Python";
+            logger.info("No config.json found, use {} engine.", engineName);
+            return engineName;
         }
         String features = Utils.getenv("SERVING_FEATURES");
         String modelType = modelConfig.getModelType();
@@ -90,18 +91,19 @@ static String inferLmiEngine(ModelInfo<?, ?> modelInfo) throws ModelException {
         return engineName;
     }
 
-    static void convertIfNeed(ModelInfo<?, ?> info) throws IOException {
+    static boolean isTrtLLM(ModelInfo<?, ?> info) { // true when rolling batch resolves to trtllm
         String rollingBatch = info.prop.getProperty("option.rolling_batch");
         if (rollingBatch == null || "auto".equals(rollingBatch)) {
             // FIXME: find a better way to set default rolling batch for trtllm
             String features = Utils.getenv("SERVING_FEATURES");
-            if (features != null && features.contains("trtllm")) {
-                info.prop.put("option.rolling_batch", "trtllm");
-                rollingBatch = "trtllm";
-            }
+            return features != null && features.contains("trtllm"); // unset/auto: env decides
         }
+        return "trtllm".equals(rollingBatch); // explicit trtllm still counts, as before the split
+    }
 
-        if ("trtllm".equals(rollingBatch)) {
+    static void convertIfNeed(ModelInfo<?, ?> info) throws IOException {
+        if (isTrtLLM(info)) {
+            info.prop.put("option.rolling_batch", "trtllm");
             Path trtRepo;
             String modelId = null;
             if (info.downloadDir != null) {

@@ -733,16 +733,21 @@ private void configPerModelSettings() throws ModelException {
             }
         }
 
+        if ("DeepSpeed".equals(engineName) || "MPI".equals(engineName)) {
+            prop.put("option.mpi_mode", "true");
+        }
+
         logger.info(
                 "Apply per model settings:\n\tjob_queue_size: {}\n\tbatch_size: {}"
                         + "\n\tmax_batch_delay: {}\n\tmax_idle_time: {}\n\tload_on_devices: {}"
-                        + "\n\tengine: {}\n\toption.entryPoint: {}{}",
+                        + "\n\tengine: {}\n\tmpi_mode: {}\n\toption.entryPoint: {}{}",
                 queueSize,
                 batchSize,
                 maxBatchDelayMillis,
                 maxIdleSeconds,
                 loadOnDevices,
                 engineName,
+                prop.get("option.mpi_mode"),
                 prop.get("option.entryPoint"),
                 sb);
     }
@@ -841,7 +846,9 @@ public String[] getLoadOnDevices() {
             }
             if (gpuCount > 0) {
                 int gpuPerWorker = 1;
-                if ("Python".equals(engineName)) {
+                if (Boolean.parseBoolean(prop.getProperty("option.mpi_mode"))) {
+                    return new String[] {"0"};
+                } else if ("Python".equals(engineName)) {
                     if (tpDegree > 0) {
                         gpuPerWorker = tpDegree;
                         int procs = gpuCount / gpuPerWorker;
@@ -851,10 +858,12 @@ public String[] getLoadOnDevices() {
                                             + gpuPerWorker
                                             + " partitions.");
                         }
-                        gpuCount = procs;
+                        if (maxWorkers == null) {
+                            gpuCount = procs;
+                        } else {
+                            gpuCount = Math.min(procs, maxWorkers);
+                        }
                     }
-                } else if ("DeepSpeed".equals(engineName) || "MPI".equals(engineName)) {
-                    return new String[] {"0"};
                 }
 
                 String[] ret = new String[gpuCount];