mindspore-lab · HaoyangLee · Jun 19, 2023 · Jun 19, 2023
diff --git a/configs/cls/mobilenetv3/cls_mv3.yaml b/configs/cls/mobilenetv3/cls_mv3.yaml
@@ -1,7 +1,7 @@
 system:
   mode: 0 # 0 for graph mode, 1 for pynative mode in MindSpore
   distribute: True
-  device_id: 7  # only valid when distribute=True, i.e., standalone training
+  device_id: 7  # only valid when distribute=False (standalone training) and environment variable 'DEVICE_ID' is NOT set
   amp_level: 'O0'
   seed: 4
   ckpt_save_policy: top_k # top_k or latest_k

diff --git a/docs/cn/tutorials/yaml_configuration.md b/docs/cn/tutorials/yaml_configuration.md
@@ -23,6 +23,7 @@
 | ---- | ---- | ---- | ---- | ---- |
 | mode | MindSpore运行模式(静态图/动态图) | 0 | 0 / 1 | 0: 表示在GRAPH_MODE模式中运行; 1: PYNATIVE_MODE模式 |
 | distribute | 是否开启并行训练 | True | True / False | \ |
+| device_id | 指定单卡训练时的卡id | 7 | 机器可用的卡的id | 该参数仅在distribute=False（单卡训练）和环境变量DEVICE_ID未设置时生效。单卡训练时，如该参数和环境变量DEVICE_ID均未设置，则默认使用0卡。 |
 | amp_level | 混合精度模式 | O0 | O0/O1/O2/O3 | 'O0' - 不变化。<br> 'O1' - 将白名单内的Cell和运算转为float16精度，其余部分保持float32精度。<br> 'O2' - 将黑名单内的Cell和运算保持float32精度，其余部分转为float16精度。<br> 'O3' - 将网络全部转为float16精度。|
 | seed | 随机种子 | 42 | Integer | \ |
 | ckpt_save_policy | 模型权重保存策略 | top_k | "top_k" 或 "latest_k" | "top_k"表示保存前k个评估指标分数最高的checkpoint；"latest_k"表示保存最新的k个checkpoint。 `k`的数值通过`ckpt_max_keep`参数定义 |

diff --git a/docs/en/tutorials/yaml_configuration.md b/docs/en/tutorials/yaml_configuration.md
@@ -22,6 +22,7 @@ This document takes `configs/rec/crnn/crnn_icdar15.yaml` as an example to descri
 | ---- | ---- | ---- | ---- | ---- |
 | mode | Mindspore running mode (static graph/dynamic graph) | 0 | 0 / 1 | 0: means running in GRAPH_MODE mode; 1: PYNATIVE_MODE mode |
 | distribute | Whether to enable parallel training | True | True / False | \ |
+| device_id | Specify the device id for standalone training | 7 | The id of any device available on the server | Only valid when distribute=False (standalone training) and the environment variable 'DEVICE_ID' is NOT set. In standalone training, if neither this arg nor the environment variable 'DEVICE_ID' is set, device 0 is used by default. |
 | amp_level | Mixed precision mode | O0 | O0/O1/O2/O3 | 'O0' - no change. <br> 'O1' - convert the cells and operations in the whitelist to float16 precision, and keep the rest in float32 precision. <br> 'O2' - Keep the cells and operations in the blacklist with float32 precision, and convert the rest to float16 precision. <br> 'O3' - Convert all networks to float16 precision. |
 | seed | Random seed | 42 | Integer | \ |
 | ckpt_save_policy | The policy for saving model weights | top_k | "top_k" or "latest_k" | "top_k" means to keep the top k checkpoints according to the metric score; "latest_k" means to keep the last k checkpoints. The value of `k` is set via `ckpt_max_keep` |

diff --git a/tools/benchmarking/multi_dataset_eval.py b/tools/benchmarking/multi_dataset_eval.py
@@ -76,6 +76,18 @@ def main(cfg):
     else:
         device_num = None
         rank_id = None
+        if "DEVICE_ID" in os.environ:
+            print(
+                f"INFO: Standalone evaluation. Device id: {os.environ.get('DEVICE_ID')}, "
+                f"specified by environment variable 'DEVICE_ID'."
+            )
+        else:
+            device_id = cfg.system.get("device_id", 0)
+            ms.set_context(device_id=device_id)
+            print(
+                f"INFO: Standalone evaluation. Device id: {device_id}, "
+                f"specified by system.device_id in yaml config file or is default value 0."
+            )
 
     is_main_device = rank_id in [None, 0]
 

diff --git a/tools/eval.py b/tools/eval.py
@@ -36,7 +36,18 @@ def main(cfg):
     else:
         device_num = None
         rank_id = None
-        ms.set_context(device_id=cfg.system.get("device_id", 0))
+        if "DEVICE_ID" in os.environ:
+            print(
+                f"INFO: Standalone evaluation. Device id: {os.environ.get('DEVICE_ID')}, "
+                f"specified by environment variable 'DEVICE_ID'."
+            )
+        else:
+            device_id = cfg.system.get("device_id", 0)
+            ms.set_context(device_id=device_id)
+            print(
+                f"INFO: Standalone evaluation. Device id: {device_id}, "
+                f"specified by system.device_id in yaml config file or is default value 0."
+            )
 
     is_main_device = rank_id in [None, 0]
 

diff --git a/tools/train.py b/tools/train.py
@@ -51,7 +51,18 @@ def main(cfg):
     else:
         device_num = None
         rank_id = None
-        ms.set_context(device_id=cfg.system.get("device_id", 0))
+        if "DEVICE_ID" in os.environ:
+            print(
+                f"INFO: Standalone training. Device id: {os.environ.get('DEVICE_ID')}, "
+                f"specified by environment variable 'DEVICE_ID'."
+            )
+        else:
+            device_id = cfg.system.get("device_id", 0)
+            ms.set_context(device_id=device_id)
+            print(
+                f"INFO: Standalone training. Device id: {device_id}, "
+                f"specified by system.device_id in yaml config file or is default value 0."
+            )
 
     set_seed(cfg.system.seed)