Open
Description
Hardware: 13900A CPU with an NVIDIA RTX A4000 GPU per worker
https://www.tensorflow.org/guide/distributed_training#multiworkermirroredstrategy
# CommunicationImplementation.NCCL (used below) requires a GPU on every worker;
# CommunicationImplementation.RING is RPC-based and supports both CPUs and GPUs.
os.environ["TF_CONFIG"] = json.dumps({
"cluster": {
"worker": ["192.168.0.129:4001", "192.168.0.131:4002"]#,
#"ps": ["192.168.0.129:4003"]
},
"task": {"type": "worker", "index": 1}
})
communication_options = tf.distribute.experimental.CommunicationOptions(
implementation=tf.distribute.experimental.CommunicationImplementation.NCCL)
strategy = tf.distribute.MultiWorkerMirroredStrategy(
communication_options=communication_options)