Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

[Rest Server] Add pod GPU number for default scheduler #3642

Merged
merged 4 commits into from
Sep 18, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 15 additions & 9 deletions src/rest-server/src/models/v2/job/k8s.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ const launcherConfig = require('@pai/config/launcher');
const createError = require('@pai/utils/error');
const userModel = require('@pai/models/v2/user');
const env = require('@pai/utils/env');
const k8s = require('@pai/utils/k8sUtils');
const path = require('path');
const fs = require('fs');
const _ = require('lodash');
Expand Down Expand Up @@ -144,17 +145,22 @@ const convertTaskDetail = async (taskStatus, ports, userName, jobName, taskRoleN
}
}
// get container GPUs, encoded as a bitmask of GPU ids (bit i set => GPU i)
let containerGpus = 0;
if (launcherConfig.enabledHived) {
try {
const isolation = (await axios({
method: 'get',
url: launcherConfig.podPath(taskStatus.attemptStatus.podName),
})).data.metadata.annotations['hivedscheduler.microsoft.com/pod-gpu-isolation'];
let containerGpus = null;
try {
const pod = (await axios({
method: 'get',
url: launcherConfig.podPath(taskStatus.attemptStatus.podName),
})).data;
if (launcherConfig.enabledHived) {
const isolation = pod.metadata.annotations['hivedscheduler.microsoft.com/pod-gpu-isolation'];
containerGpus = isolation.split(',').reduce((attr, id) => attr + Math.pow(2, id), 0);
} catch (e) {
containerGpus = 0;
} else {
const gpuNumber = k8s.atoi(pod.spec.containers[0].resources.limits['nvidia.com/gpu']);
// mock GPU ids from 0 to (gpuNumber - 1): set the lowest gpuNumber bits of the bitmask
containerGpus = Math.pow(2, gpuNumber) - 1;
sunqinzheng marked this conversation as resolved.
Show resolved Hide resolved
}
} catch (err) {
containerGpus = null;
}
const completionStatus = taskStatus.attemptStatus.completionStatus;
return {
Expand Down