# job-template.yaml
# Service, ConfigMap and Indexed Job template.
---
# Headless Service (clusterIP: None) that selects the pods of job_ID_ through
# their `job-name` label. Combined with the `subdomain` set on the Job's pod
# template, it lets the script in the ConfigMap address individual workers as
# http://job_ID_-<index>.svc_ID_:8080.
# NOTE(review): `_ID_` looks like a placeholder that is substituted before the
# manifest is applied - confirm against the tooling that renders this template.
apiVersion: v1
kind: Service
metadata:
  name: svc_ID_
spec:
  clusterIP: None
  selector:
    job-name: job_ID_
  ports:
    - name: http
      protocol: TCP
      port: 8080
---
# ConfigMap holding the Python script executed by every pod of job_ID_.
# The Job mounts this ConfigMap at /script-path and runs
# `python /script-path/main.py <INDEX_COUNT>`.
apiVersion: v1
kind: ConfigMap
metadata:
  name: script-code_ID_
data:
  main.py: |
    # Index 0 acts as the driver: it waits until every worker answers /ping,
    # sleeps, then asks each worker to stop through /exit. Every other index
    # runs a small HTTP server that answers those requests.
    from http.server import BaseHTTPRequestHandler, HTTPServer
    from urllib.request import urlopen
    import sys, os, time, logging

    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    serverPort = 8080
    # Total number of indexes, passed as the first command-line argument.
    INDEX_COUNT = int(sys.argv[1])
    # Completion index of this pod, read from the environment.
    index = int(os.environ.get('JOB_COMPLETION_INDEX'))
    logger = logging.getLogger('LOG' + str(index))


    class WorkerServer(BaseHTTPRequestHandler):
        """HTTP handler run by every non-zero index."""

        def do_GET(self):
            """Reply 200; terminate the process when the path contains "exit"."""
            self.send_response(200)
            self.end_headers()
            if "exit" in self.path:
                self.wfile.write(bytes("Exiting", "utf-8"))
                # Close the response stream before terminating the process.
                self.wfile.close()
                # NOTE(review): relies on SystemExit propagating out of
                # serve_forever() so the pod exits with code 0 - confirm.
                sys.exit(0)
            else:
                self.wfile.write(bytes("Running", "utf-8"))


    def call_until_success(url):
        """GET `url`, retrying once per second until a response is read."""
        while True:
            try:
                logger.info("Calling URL: " + url)
                with urlopen(url) as response:
                    response_content = response.read().decode('utf-8')
                    logger.info("Response content from %s: %s" % (url, response_content))
                    return
            except Exception as e:
                logger.warning("Got exception when calling %s: %s" % (url, e))
                time.sleep(1)


    if __name__ == "__main__":
        if index == 0:
            # Driver: wait for all workers, simulate work, then stop them.
            for i in range(1, INDEX_COUNT):
                call_until_success("http://job_ID_-%d.svc_ID_:8080/ping" % i)
            logger.info("All workers running")
            time.sleep(10)  # sleep 10s to simulate doing something
            for i in range(1, INDEX_COUNT):
                call_until_success("http://job_ID_-%d.svc_ID_:8080/exit" % i)
            logger.info("All workers stopped")
        else:
            # Worker: serve requests until the driver calls /exit.
            webServer = HTTPServer(("", serverPort), WorkerServer)
            logger.info("Server started at port %s" % serverPort)
            webServer.serve_forever()
---
# Indexed Job running 20 pods in parallel. Each pod executes main.py from the
# script-code_ID_ ConfigMap; the pod `subdomain` matches the svc_ID_ Service so
# the pods can reach each other by hostname.
# NOTE(review): the Job is created suspended and carries a Kueue queue-name
# label; presumably Kueue unsuspends it once the workload is admitted through
# `user-queue` - confirm against the cluster's Kueue setup.
apiVersion: batch/v1
kind: Job
metadata:
  name: job_ID_
  labels:
    kueue.x-k8s.io/queue-name: user-queue
spec:
  # Keep parallelism, completions and the "20" script argument below in sync:
  # the script uses that argument as the number of indexes to contact.
  parallelism: 20
  completions: 20
  completionMode: Indexed
  suspend: true
  template:
    spec:
      subdomain: svc_ID_
      volumes:
        - name: script-volume
          configMap:
            name: script-code_ID_
      containers:
        - name: main
          image: python:bullseye
          command: ["python"]
          args:
            - /script-path/main.py
            - "20"
          ports:
            - containerPort: 8080
          imagePullPolicy: IfNotPresent
          resources:
            requests:
              memory: "2Gi"  # choose the value as 75% * (total allocatable memory / 20)
          volumeMounts:
            - mountPath: /script-path
              name: script-volume
      restartPolicy: Never
  # Pod failures are not retried.
  backoffLimit: 0