-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdeploy.yaml
103 lines (97 loc) · 2.59 KB
/
deploy.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# Cluster-wide, read-only role for the GPU watchdog: it may get/list/watch
# the core resources listed below and nothing else.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: gpu-watchdog-cluster-role
rules:
  - apiGroups:
      # The empty string selects the core ("legacy") API group.
      - ""
    resources:
      - nodes
      - services
      - pods
      - endpoints
      - namespaces
    verbs:
      - get
      - list
      - watch
---
# Identity for the GPU watchdog pods; referenced as a subject by the
# gpu-watchdog ClusterRoleBinding and by the DaemonSet pod template.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: default
  name: gpu-watchdog-service-account
---
# Grants gpu-watchdog-cluster-role to the watchdog's ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: gpu-watchdog-cluster-role-binding
# Who receives the role: the ServiceAccount in the `default` namespace.
subjects:
  - kind: ServiceAccount
    namespace: default
    name: gpu-watchdog-service-account
# Which role is granted.
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: gpu-watchdog-cluster-role
---
# Holds the Datadog API key that the gpu-watchdog container reads through a
# secretKeyRef (env var DD_API_KEY).
apiVersion: v1
kind: Secret
metadata:
  name: gpu-watchdog-secrets
  # Secret references are namespace-local, so this must live in the same
  # namespace as the gpu-watchdog DaemonSet that consumes it.
  namespace: default
type: Opaque
data:
  # Values under `data` must be base64-encoded. Replace the placeholder at
  # deploy time; do not commit the real key to version control.
  DD_API_KEY: "<REDACTED>"
---
# Runs one gpu-watchdog pod on every node matching the nodeSelector below.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: gpu-watchdog-daemonset
  namespace: default
  labels:
    app: gpu-watchdog
spec:
  # Must match the pod template labels below (keyed on `name`, not `app`).
  selector:
    matchLabels:
      name: gpu-watchdog
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        name: gpu-watchdog
    spec:
      # `serviceAccount` is a deprecated alias; `serviceAccountName` is the
      # supported PodSpec field for the same setting.
      serviceAccountName: gpu-watchdog-service-account
      # Required to retrieve Pod metadata.
      hostPID: true
      # Only run on GPU nodes.
      nodeSelector:
        k8s.amazonaws.com/accelerator: vgpu
      tolerations:
        # This toleration is deprecated. Kept here for backward compatibility.
        # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
        - key: CriticalAddonsOnly
          operator: Exists
        # Allow scheduling onto nodes tainted k8s.amazonaws.com/vgpu:NoSchedule.
        - key: k8s.amazonaws.com/vgpu
          operator: Exists
          effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      containers:
        - name: gpu-watchdog
          # NOTE(review): `latest` is a mutable tag, so nodes may run different
          # builds over time; consider pinning a version or digest.
          image: "joehaaga/gpu-watchdog:latest"
          env:
            - name: DD_SITE
              value: "datadoghq.com"
            # API key is injected from the gpu-watchdog-secrets Secret.
            - name: DD_API_KEY
              valueFrom:
                secretKeyRef:
                  name: gpu-watchdog-secrets
                  key: DD_API_KEY
            # Env values must be strings, hence the quoted number.
            - name: STATSD_PORT
              value: "8125"
            - name: LOG_LEVEL
              value: "INFO"
          resources:
            requests:
              memory: "50Mi"
              cpu: "2m"
            limits:
              memory: "250Mi"
              cpu: "10m"