-
Notifications
You must be signed in to change notification settings - Fork 969
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from volcano-sh/master
update
- Loading branch information
Showing
76 changed files
with
5,975 additions
and
3,295 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Git submodule: the volcano-sh/charts repository is checked out at installer/helm. | ||
[submodule "installer/helm"] | ||
path = installer/helm | ||
url = https://github.com/volcano-sh/charts.git |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# ReplicaSet "nginx-1": 8 replicas of the nginx image, scheduled by the | ||
# "volcano" scheduler; each container requests and is limited to 1000m CPU. | ||
apiVersion: apps/v1 | ||
kind: ReplicaSet | ||
metadata: | ||
name: nginx-1 | ||
labels: | ||
app: nginx-1 | ||
spec: | ||
# modify replicas according to your case | ||
replicas: 8 | ||
selector: | ||
matchLabels: | ||
app: nginx-1 | ||
template: | ||
metadata: | ||
labels: | ||
app: nginx-1 | ||
spec: | ||
schedulerName: volcano | ||
containers: | ||
- name: nginx-1 | ||
image: nginx | ||
resources: | ||
requests: | ||
cpu: "1000m" | ||
limits: | ||
cpu: "1000m" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# ReplicaSet "nginx-2": 8 replicas of the nginx image, scheduled by the | ||
# "volcano" scheduler; each container requests and is limited to 1000m CPU. | ||
apiVersion: apps/v1 | ||
kind: ReplicaSet | ||
metadata: | ||
name: nginx-2 | ||
labels: | ||
app: nginx-2 | ||
spec: | ||
# modify replicas according to your case | ||
replicas: 8 | ||
selector: | ||
matchLabels: | ||
app: nginx-2 | ||
template: | ||
metadata: | ||
labels: | ||
app: nginx-2 | ||
spec: | ||
schedulerName: volcano | ||
containers: | ||
- name: nginx-2 | ||
image: nginx | ||
resources: | ||
requests: | ||
cpu: "1000m" | ||
limits: | ||
cpu: "1000m" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# ReplicaSet "nginx": 8 replicas of the nginx image, each container requesting | ||
# and limited to 1000m CPU. No schedulerName is set in the pod spec, so the | ||
# cluster's default scheduler applies. | ||
apiVersion: apps/v1 | ||
kind: ReplicaSet | ||
metadata: | ||
name: nginx | ||
labels: | ||
app: nginx | ||
spec: | ||
# modify replicas according to your case | ||
replicas: 8 | ||
selector: | ||
matchLabels: | ||
app: nginx | ||
template: | ||
metadata: | ||
labels: | ||
app: nginx | ||
spec: | ||
containers: | ||
- name: nginx | ||
image: nginx | ||
resources: | ||
requests: | ||
cpu: "1000m" | ||
limits: | ||
cpu: "1000m" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
# Volcano Job "lm-mpi-job": one "mpimaster" task runs mpiexec (-np 3) against | ||
# the hosts listed in /etc/volcano/mpiworker.host, while three "mpiworker" | ||
# tasks run sshd. | ||
apiVersion: batch.volcano.sh/v1alpha1 | ||
kind: Job | ||
metadata: | ||
name: lm-mpi-job | ||
labels: | ||
# Set the job type according to your business needs | ||
"volcano.sh/job-type": "MPI" | ||
spec: | ||
# Minimum number of pods the job needs (original note: less than the total | ||
# replica count; here it equals the total, 1 master + 3 workers = 4) | ||
minAvailable: 4 | ||
schedulerName: volcano | ||
plugins: | ||
# Provides passwordless SSH authentication | ||
ssh: [] | ||
# Provides the network information needed to run the job: hosts file, headless service, etc. | ||
svc: [] | ||
# If any pod is killed (evicted), restart the whole job | ||
policies: | ||
- event: PodEvicted | ||
action: RestartJob | ||
tasks: | ||
- replicas: 1 | ||
name: mpimaster | ||
# When mpiexec finishes, consider the whole MPI job finished | ||
policies: | ||
- event: TaskCompleted | ||
action: CompleteJob | ||
template: | ||
spec: | ||
# Volcano places its information under the /etc/volcano directory | ||
containers: | ||
- command: | ||
- /bin/sh | ||
- -c | ||
- | | ||
MPI_HOST=`cat /etc/volcano/mpiworker.host | tr "\n" ","`; | ||
mkdir -p /var/run/sshd; /usr/sbin/sshd; | ||
mpiexec --allow-run-as-root --host ${MPI_HOST} -np 3 mpi_hello_world; | ||
image: swr.cn-north-1.myhuaweicloud.com/hwstaff_z00383385/example-mpi:0.0.1 | ||
name: mpimaster | ||
ports: | ||
- containerPort: 22 | ||
name: mpijob-port | ||
workingDir: /home | ||
resources: | ||
requests: | ||
cpu: "500m" | ||
limits: | ||
cpu: "500m" | ||
restartPolicy: OnFailure | ||
imagePullSecrets: | ||
- name: default-secret | ||
- replicas: 3 | ||
name: mpiworker | ||
# Workers only create /var/run/sshd and run sshd with -D; the -np 3 passed to | ||
# mpiexec in the master task matches this replica count. | ||
template: | ||
spec: | ||
containers: | ||
- command: | ||
- /bin/sh | ||
- -c | ||
- | | ||
mkdir -p /var/run/sshd; /usr/sbin/sshd -D; | ||
image: swr.cn-north-1.myhuaweicloud.com/hwstaff_z00383385/example-mpi:0.0.1 | ||
name: mpiworker | ||
ports: | ||
- containerPort: 22 | ||
name: mpijob-port | ||
workingDir: /home | ||
resources: | ||
requests: | ||
cpu: "1000m" | ||
limits: | ||
cpu: "1000m" | ||
restartPolicy: OnFailure | ||
imagePullSecrets: | ||
- name: default-secret | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# ReplicaSet "nginx": 6 replicas of the nginx image, each container requesting | ||
# and limited to 1000m CPU. No schedulerName is set in the pod spec, so the | ||
# cluster's default scheduler applies. | ||
apiVersion: apps/v1 | ||
kind: ReplicaSet | ||
metadata: | ||
name: nginx | ||
labels: | ||
app: nginx | ||
spec: | ||
# modify replicas according to your case | ||
replicas: 6 | ||
selector: | ||
matchLabels: | ||
app: nginx | ||
template: | ||
metadata: | ||
labels: | ||
app: nginx | ||
spec: | ||
containers: | ||
- name: nginx | ||
image: nginx | ||
resources: | ||
requests: | ||
cpu: "1000m" | ||
limits: | ||
cpu: "1000m" |
73 changes: 73 additions & 0 deletions
73
docs/samples/kubecon-2019-china/horovod-sample/lm-horovod-tf-mnist-v0.5.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
# Volcano Job "lm-horovod-job": one "master" task runs mpiexec (-np 3) to start | ||
# tensorflow_mnist_lm.py on the hosts listed in /etc/volcano/worker.host, while | ||
# three "worker" tasks run sshd. | ||
apiVersion: batch.volcano.sh/v1alpha1 | ||
kind: Job | ||
metadata: | ||
name: lm-horovod-job | ||
labels: | ||
"volcano.sh/job-type": Horovod | ||
spec: | ||
# Equals the total replica count of the tasks below (1 master + 3 workers) | ||
minAvailable: 4 | ||
schedulerName: volcano | ||
plugins: | ||
ssh: [] | ||
svc: [] | ||
# If any pod is killed (evicted), restart the whole job | ||
policies: | ||
- event: PodEvicted | ||
action: RestartJob | ||
tasks: | ||
- replicas: 1 | ||
name: master | ||
# Completion of the master task completes the whole job | ||
policies: | ||
- event: TaskCompleted | ||
action: CompleteJob | ||
template: | ||
spec: | ||
containers: | ||
- command: | ||
- /bin/sh | ||
- -c | ||
- | | ||
WORKER_HOST=`cat /etc/volcano/worker.host | tr "\n" ","`; | ||
mkdir -p /var/run/sshd; /usr/sbin/sshd; | ||
mpiexec --allow-run-as-root --host ${WORKER_HOST} -np 3 python tensorflow_mnist_lm.py; | ||
image: swr.cn-north-1.myhuaweicloud.com/hwstaff_z00383385/horovod-tf-mnist:0.5 | ||
name: master | ||
ports: | ||
- containerPort: 22 | ||
name: job-port | ||
resources: | ||
requests: | ||
cpu: "500m" | ||
memory: "1024Mi" | ||
limits: | ||
cpu: "500m" | ||
memory: "1024Mi" | ||
restartPolicy: OnFailure | ||
imagePullSecrets: | ||
- name: default-secret | ||
- replicas: 3 | ||
name: worker | ||
# Workers only create /var/run/sshd and run sshd with -D; the -np 3 passed to | ||
# mpiexec in the master task matches this replica count. | ||
template: | ||
spec: | ||
containers: | ||
- command: | ||
- /bin/sh | ||
- -c | ||
- | | ||
mkdir -p /var/run/sshd; /usr/sbin/sshd -D; | ||
image: swr.cn-north-1.myhuaweicloud.com/hwstaff_z00383385/horovod-tf-mnist:0.5 | ||
name: worker | ||
ports: | ||
- containerPort: 22 | ||
name: job-port | ||
resources: | ||
requests: | ||
cpu: "1000m" | ||
memory: "2048Mi" | ||
limits: | ||
cpu: "1000m" | ||
memory: "2048Mi" | ||
restartPolicy: OnFailure | ||
imagePullSecrets: | ||
- name: default-secret | ||
--- |
Oops, something went wrong.