Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(eks): pods become CrashLoopBackOff when using INFERENTIA or TRAINIUM instance type #29651

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# source: https://awsdocs-neuron.readthedocs-hosted.com/en/latest/containers/tutorials/k8s-setup.html
#
# RBAC resources for the Neuron device plugin DaemonSet. This file is a
# multi-document YAML stream (ClusterRole, ServiceAccount, ClusterRoleBinding),
# so it must be loaded with a multi-document parser.
---
# Permissions granted to the device plugin.
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: neuron-device-plugin
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - events
    verbs:
      - create
      - patch
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - update
      - patch
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - nodes/status
    verbs:
      - patch
      - update
---
# Identity for the device plugin pods; bound to the ClusterRole below.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: neuron-device-plugin
  namespace: kube-system
---
# ClusterRoleBinding is a cluster-scoped kind, so it carries no
# metadata.namespace; only the ServiceAccount subject is namespaced.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: neuron-device-plugin
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: neuron-device-plugin
subjects:
  - kind: ServiceAccount
    name: neuron-device-plugin
    namespace: kube-system
74 changes: 48 additions & 26 deletions packages/aws-cdk-lib/aws-eks/lib/addons/neuron-device-plugin.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# source: https://github.com/aws/aws-neuron-sdk/blob/master/docs/neuron-container-tools/k8s-neuron-device-plugin.yml
# source: https://awsdocs-neuron.readthedocs-hosted.com/en/latest/containers/tutorials/k8s-setup.html
# https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/
apiVersion: apps/v1
kind: DaemonSet
Expand All @@ -13,11 +13,13 @@ spec:
type: RollingUpdate
template:
metadata:
annotations:
scheduler.alpha.kubernetes.io/critical-pod: ""
# Uncomment the annotation below if k8s version is 1.13 or lower
# annotations:
# scheduler.alpha.kubernetes.io/critical-pod: ""
labels:
name: neuron-device-plugin-ds
spec:
serviceAccount: neuron-device-plugin
tolerations:
- key: CriticalAddonsOnly
operator: Exists
Expand All @@ -33,18 +35,22 @@ spec:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: "beta.kubernetes.io/instance-type"
operator: In
values:
- inf1.xlarge
- inf1.2xlarge
- inf1.6xlarge
- inf1.24xlarge
- inf2.xlarge
- inf2.8xlarge
- inf2.24xlarge
- inf2.48xlarge
# Uncomment following matchExpressions if using k8s 1.16 or lower
#- matchExpressions:
# - key: "beta.kubernetes.io/instance-type"
# operator: In
# values:
# - inf1.xlarge
# - inf1.2xlarge
# - inf1.6xlarge
# - inf1.24xlarge
# - inf2.xlarge
# - inf2.8xlarge
# - inf2.24xlarge
# - inf2.48xlarge
# - trn1.2xlarge
# - trn1.32xlarge
# - trn1n.32xlarge
- matchExpressions:
- key: "node.kubernetes.io/instance-type"
operator: In
Expand All @@ -57,18 +63,34 @@ spec:
- inf2.8xlarge
- inf2.24xlarge
- inf2.48xlarge
- trn1.2xlarge
- trn1.32xlarge
- trn1n.32xlarge
containers:
- image: 790709498068.dkr.ecr.us-west-2.amazonaws.com/neuron-device-plugin:1.0.9043.0
imagePullPolicy: Always
name: k8s-neuron-device-plugin-ctr
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
# Find all neuron-device-plugin images at https://gallery.ecr.aws/neuron/neuron-device-plugin
- image: public.ecr.aws/neuron/neuron-device-plugin:2.19.16.0
imagePullPolicy: Always
name: neuron-device-plugin
env:
- name: KUBECONFIG
value: /etc/kubernetes/kubelet.conf
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: infa-map
mountPath: /run
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
path: /var/lib/kubelet/device-plugins
- name: infa-map
hostPath:
path: /run
18 changes: 18 additions & 0 deletions packages/aws-cdk-lib/aws-eks/lib/cluster.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1445,6 +1445,8 @@ export class Cluster extends ClusterBase {

private _neuronDevicePlugin?: KubernetesManifest;

private _neuronDevicePluginRbac?: KubernetesManifest;

private readonly endpointAccess: EndpointAccess;

private readonly vpcSubnets: ec2.SubnetSelection[];
Expand Down Expand Up @@ -1803,6 +1805,7 @@ export class Cluster extends ClusterBase {
if (nodeTypeForInstanceType(options.instanceType) === NodeType.INFERENTIA ||
nodeTypeForInstanceType(options.instanceType) === NodeType.TRAINIUM ) {
this.addNeuronDevicePlugin();
this.addNeuronDevicePluginRbac();
}

return asg;
Expand All @@ -1826,6 +1829,7 @@ export class Cluster extends ClusterBase {

if (hasInferentiaOrTrainiumInstanceType) {
this.addNeuronDevicePlugin();
this.addNeuronDevicePluginRbac();
}
return new Nodegroup(this, `Nodegroup${id}`, {
cluster: this,
Expand Down Expand Up @@ -1987,6 +1991,20 @@ export class Cluster extends ClusterBase {
return this._neuronDevicePlugin;
}

/**
 * Installs the Neuron device plugin RBAC resources on the cluster if they
 * have not already been added.
 *
 * The resources are read from `addons/neuron-device-plugin-rbac.yaml`, which
 * is a multi-document YAML stream (documents separated by `---`).
 *
 * @returns the manifest holding the RBAC resources; repeated calls return
 * the same instance.
 */
private addNeuronDevicePluginRbac() {
  if (!this._neuronDevicePluginRbac) {
    const fileContents = fs.readFileSync(path.join(__dirname, 'addons', 'neuron-device-plugin-rbac.yaml'), 'utf8');
    // `YAML.parse` only accepts a single document and throws on a
    // multi-document stream, so parse every document and hand each one to
    // `addManifest` as a separate resource.
    const resources = YAML.parseAllDocuments(fileContents)
      .map((doc) => doc.toJSON())
      // Drop empty documents (e.g. a stray trailing `---`).
      .filter((resource) => resource != null);
    this._neuronDevicePluginRbac = this.addManifest('NeuronDevicePluginRbac', ...resources);
  }

  return this._neuronDevicePluginRbac;
}

/**
* Opportunistically tag subnets with the required tags.
*
Expand Down
Loading