From d7a62f66b7fa13898ad5d7e48f42b9dd8602bb14 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Thu, 31 Oct 2019 17:03:03 +0800 Subject: [PATCH 01/18] check pylint for nni_cmd --- tools/nni_cmd/command_utils.py | 2 +- tools/nni_cmd/common_utils.py | 10 +-- tools/nni_cmd/config_schema.py | 104 ++++++++++++++-------------- tools/nni_cmd/config_utils.py | 29 ++++---- tools/nni_cmd/constants.py | 5 +- tools/nni_cmd/launcher.py | 46 ++++++------ tools/nni_cmd/launcher_utils.py | 20 +++--- tools/nni_cmd/nnictl.py | 16 +++-- tools/nni_cmd/nnictl_utils.py | 66 +++++++++--------- tools/nni_cmd/package_management.py | 6 +- tools/nni_cmd/ssh_utils.py | 6 +- tools/nni_cmd/tensorboard_utils.py | 25 +++---- tools/nni_cmd/updater.py | 2 +- tools/nni_cmd/url_utils.py | 6 +- 14 files changed, 171 insertions(+), 172 deletions(-) diff --git a/tools/nni_cmd/command_utils.py b/tools/nni_cmd/command_utils.py index a3bcb81965..cf13f63eae 100644 --- a/tools/nni_cmd/command_utils.py +++ b/tools/nni_cmd/command_utils.py @@ -3,7 +3,7 @@ import os import signal import psutil -from .common_utils import print_error, print_normal, print_warning +from .common_utils import print_error def check_output_command(file_path, head=None, tail=None): diff --git a/tools/nni_cmd/common_utils.py b/tools/nni_cmd/common_utils.py index 3a5e909ca2..af0fe3efa6 100644 --- a/tools/nni_cmd/common_utils.py +++ b/tools/nni_cmd/common_utils.py @@ -21,10 +21,10 @@ import os import sys import json -import ruamel.yaml as yaml -import psutil import socket from pathlib import Path +import ruamel.yaml as yaml +import psutil from .constants import ERROR_INFO, NORMAL_INFO, WARNING_INFO, COLOR_RED_FORMAT, COLOR_YELLOW_FORMAT def get_yml_content(file_path): @@ -34,6 +34,7 @@ def get_yml_content(file_path): return yaml.load(file, Loader=yaml.Loader) except yaml.scanner.ScannerError as err: print_error('yaml file format error!') + print_error(err) exit(1) except Exception as exception: print_error(exception) @@ -46,6 +47,7 @@ def get_json_content(file_path): return json.load(file) except TypeError as err: print_error('json file format error!') + print_error(err) return None def print_error(content): @@ -70,7 +72,7 @@ def detect_process(pid): def detect_port(port): '''Detect if the port is used''' - socket_test = socket.socket(socket.AF_INET,socket.SOCK_STREAM) + socket_test = socket.socket(socket.AF_INET, socket.SOCK_STREAM) try: socket_test.connect(('127.0.0.1', int(port))) socket_test.close() @@ -79,7 +81,7 @@ def detect_port(port): return False def get_user(): - if sys.platform =='win32': + if sys.platform == 'win32': return os.environ['USERNAME'] else: return os.environ['USER'] diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index da943564fb..dded8d1e95 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -19,13 +19,13 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. import os -from schema import Schema, And, Use, Optional, Regex, Or +from schema import Schema, And, Optional, Regex, Or from .constants import SCHEMA_TYPE_ERROR, SCHEMA_RANGE_ERROR, SCHEMA_PATH_ERROR -def setType(key, type): +def setType(key, valueType): '''check key type''' - return And(type, error=SCHEMA_TYPE_ERROR % (key, type.__name__)) + return And(valueType, error=SCHEMA_TYPE_ERROR % (key, valueType.__name__)) def setChoice(key, *args): '''check choice''' @@ -47,7 +47,7 @@ def setPathCheck(key): 'experimentName': setType('experimentName', str), Optional('description'): setType('description', str), 'trialConcurrency': setNumberRange('trialConcurrency', int, 1, 99999), - Optional('maxExecDuration'): And(Regex(r'^[1-9][0-9]*[s|m|h|d]$',error='ERROR: maxExecDuration format is [digit]{s,m,h,d}')), + Optional('maxExecDuration'): And(Regex(r'^[1-9][0-9]*[s|m|h|d]$', error='ERROR: maxExecDuration format is [digit]{s,m,h,d}')), Optional('maxTrialNum'): setNumberRange('maxTrialNum', int, 1, 99999), 'trainingServicePlatform': setChoice('trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller'), Optional('searchSpacePath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'searchSpacePath'), @@ -106,7 +106,7 @@ def setPathCheck(key): 'builtinTunerName': 'NetworkMorphism', Optional('classArgs'): { Optional('optimize_mode'): setChoice('optimize_mode', 'maximize', 'minimize'), - Optional('task'): setChoice('task', 'cv','nlp','common'), + Optional('task'): setChoice('task', 'cv', 'nlp', 'common'), Optional('input_width'): setType('input_width', int), Optional('input_channel'): setType('input_channel', int), Optional('n_output_node'): setType('n_output_node', int), @@ -139,7 +139,7 @@ def setPathCheck(key): Optional('selection_num_warm_up'): setType('selection_num_warm_up', int), Optional('selection_num_starting_points'): setType('selection_num_starting_points', int), }, - Optional('includeIntermediateResults'): setType('includeIntermediateResults', bool), + Optional('includeIntermediateResults'): setType('includeIntermediateResults', bool), Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'), }, 'PPOTuner': { @@ -232,35 +232,35 @@ def setPathCheck(key): } common_trial_schema = { -'trial':{ - 'command': setType('command', str), - 'codeDir': setPathCheck('codeDir'), - Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999), - Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode') + 'trial':{ + 'command': setType('command', str), + 'codeDir': setPathCheck('codeDir'), + Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999), + Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode') } } pai_trial_schema = { -'trial':{ - 'command': setType('command', str), - 'codeDir': setPathCheck('codeDir'), - 'gpuNum': setNumberRange('gpuNum', int, 0, 99999), - 'cpuNum': setNumberRange('cpuNum', int, 0, 99999), - 'memoryMB': setType('memoryMB', int), - 'image': setType('image', str), - Optional('authFile'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'authFile'), - Optional('shmMB'): setType('shmMB', int), - Optional('dataDir'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\ - error='ERROR: dataDir format error, dataDir format is hdfs://xxx.xxx.xxx.xxx:xxx'), - Optional('outputDir'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\ - error='ERROR: outputDir format error, outputDir format is hdfs://xxx.xxx.xxx.xxx:xxx'), - Optional('virtualCluster'): setType('virtualCluster', str), - Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), - Optional('portList'): [{ - "label": setType('label', str), - "beginAt": setType('beginAt', int), - "portNumber": setType('portNumber', int) - }] + 'trial':{ + 'command': setType('command', str), + 'codeDir': setPathCheck('codeDir'), + 'gpuNum': setNumberRange('gpuNum', int, 0, 99999), + 'cpuNum': setNumberRange('cpuNum', int, 0, 99999), + 'memoryMB': setType('memoryMB', int), + 'image': setType('image', str), + Optional('authFile'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'authFile'), + Optional('shmMB'): setType('shmMB', int), + Optional('dataDir'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\ + error='ERROR: dataDir format error, dataDir format is hdfs://xxx.xxx.xxx.xxx:xxx'), + Optional('outputDir'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\ + error='ERROR: outputDir format error, outputDir format is hdfs://xxx.xxx.xxx.xxx:xxx'), + Optional('virtualCluster'): setType('virtualCluster', str), + Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), + Optional('portList'): [{ + "label": setType('label', str), + "beginAt": setType('beginAt', int), + "portNumber": setType('portNumber', int) + }] } } @@ -273,7 +273,7 @@ def setPathCheck(key): } kubeflow_trial_schema = { -'trial':{ + 'trial':{ 'codeDir': setPathCheck('codeDir'), Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), Optional('ps'): { @@ -315,7 +315,7 @@ def setPathCheck(key): 'server': setType('server', str), 'path': setType('path', str) } - },{ + }, { 'operator': setChoice('operator', 'tf-operator', 'pytorch-operator'), 'apiVersion': setType('apiVersion', str), Optional('storage'): setChoice('storage', 'nfs', 'azureStorage'), @@ -363,7 +363,7 @@ def setPathCheck(key): 'server': setType('server', str), 'path': setType('path', str) } - },{ + }, { Optional('storage'): setChoice('storage', 'nfs', 'azureStorage'), Optional('serviceAccountName'): setType('serviceAccountName', str), 'keyVault': { @@ -383,24 +383,24 @@ def setPathCheck(key): } machine_list_schema = { -Optional('machineList'):[Or({ - 'ip': setType('ip', str), - Optional('port'): setNumberRange('port', int, 1, 65535), - 'username': setType('username', str), - 'passwd': setType('passwd', str), - Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'), - Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int), - Optional('useActiveGpu'): setType('useActiveGpu', bool) - },{ - 'ip': setType('ip', str), - Optional('port'): setNumberRange('port', int, 1, 65535), - 'username': setType('username', str), - 'sshKeyPath': setPathCheck('sshKeyPath'), - Optional('passphrase'): setType('passphrase', str), - Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'), - Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int), - Optional('useActiveGpu'): setType('useActiveGpu', bool) -})] + Optional('machineList'):[Or({ + 'ip': setType('ip', str), + Optional('port'): setNumberRange('port', int, 1, 65535), + 'username': setType('username', str), + 'passwd': setType('passwd', str), + Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'), + Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int), + Optional('useActiveGpu'): setType('useActiveGpu', bool) + }, { + 'ip': setType('ip', str), + Optional('port'): setNumberRange('port', int, 1, 65535), + 'username': setType('username', str), + 'sshKeyPath': setPathCheck('sshKeyPath'), + Optional('passphrase'): setType('passphrase', str), + Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'), + Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int), + Optional('useActiveGpu'): setType('useActiveGpu', bool) + })] } LOCAL_CONFIG_SCHEMA = Schema({**common_schema, **common_trial_schema}) diff --git a/tools/nni_cmd/config_utils.py b/tools/nni_cmd/config_utils.py index 6b2b8a0cc0..c5a36b374d 100644 --- a/tools/nni_cmd/config_utils.py +++ b/tools/nni_cmd/config_utils.py @@ -21,7 +21,6 @@ import os import json -import shutil from .constants import NNICTL_HOME_DIR class Config: @@ -73,29 +72,29 @@ def __init__(self): self.experiment_file = os.path.join(NNICTL_HOME_DIR, '.experiment') self.experiments = self.read_file() - def add_experiment(self, id, port, time, file_name, platform): + def add_experiment(self, expId, port, time, file_name, platform): '''set {key:value} paris to self.experiment''' - self.experiments[id] = {} - self.experiments[id]['port'] = port - self.experiments[id]['startTime'] = time - self.experiments[id]['endTime'] = 'N/A' - self.experiments[id]['status'] = 'INITIALIZED' - self.experiments[id]['fileName'] = file_name - self.experiments[id]['platform'] = platform + self.experiments[expId] = {} + self.experiments[expId]['port'] = port + self.experiments[expId]['startTime'] = time + self.experiments[expId]['endTime'] = 'N/A' + self.experiments[expId]['status'] = 'INITIALIZED' + self.experiments[expId]['fileName'] = file_name + self.experiments[expId]['platform'] = platform self.write_file() - def update_experiment(self, id, key, value): + def update_experiment(self, expId, key, value): '''Update experiment''' if id not in self.experiments: return False - self.experiments[id][key] = value + self.experiments[expId][key] = value self.write_file() return True - def remove_experiment(self, id): + def remove_experiment(self, expId): '''remove an experiment by id''' if id in self.experiments: - self.experiments.pop(id) + self.experiments.pop(expId) self.write_file() def get_all_experiments(self): @@ -109,7 +108,7 @@ def write_file(self): json.dump(self.experiments, file) except IOError as error: print('Error:', error) - return + return '' def read_file(self): '''load config from local file''' @@ -119,4 +118,4 @@ def read_file(self): return json.load(file) except ValueError: return {} - return {} + return {} diff --git a/tools/nni_cmd/constants.py b/tools/nni_cmd/constants.py index d22a509c46..0777d2db98 100644 --- a/tools/nni_cmd/constants.py +++ b/tools/nni_cmd/constants.py @@ -21,7 +21,7 @@ import os from colorama import Fore -NNICTL_HOME_DIR = os.path.join(os.path.expanduser('~'), '.local', 'nnictl') +NNICTL_HOME_DIR = os.path.join(os.path.expanduser('~'), '.local', 'nnictl') ERROR_INFO = 'ERROR: %s' @@ -58,7 +58,8 @@ '-----------------------------------------------------------------------\n' EXPERIMENT_START_FAILED_INFO = 'There is an experiment running in the port %d, please stop it first or set another port!\n' \ - 'You could use \'nnictl stop --port [PORT]\' command to stop an experiment!\nOr you could use \'nnictl create --config [CONFIG_PATH] --port [PORT]\' to set port!\n' + 'You could use \'nnictl stop --port [PORT]\' command to stop an experiment!\nOr you could ' \ + 'use \'nnictl create --config [CONFIG_PATH] --port [PORT]\' to set port!\n' EXPERIMENT_INFORMATION_FORMAT = '----------------------------------------------------------------------------------------\n' \ ' Experiment information\n' \ diff --git a/tools/nni_cmd/launcher.py b/tools/nni_cmd/launcher.py index e2fac2cb42..f99f8dfe43 100644 --- a/tools/nni_cmd/launcher.py +++ b/tools/nni_cmd/launcher.py @@ -22,22 +22,21 @@ import json import os import sys -import shutil import string -from subprocess import Popen, PIPE, call, check_output, check_call, CalledProcessError +import random +import site +import time import tempfile +from subprocess import Popen, check_call, CalledProcessError +from nni_annotation import expand_annotations, generate_search_space from nni.constants import ModuleName, AdvisorModuleName -from nni_annotation import * from .launcher_utils import validate_all_content -from .rest_utils import rest_put, rest_post, check_rest_server, check_rest_server_quick, check_response +from .rest_utils import rest_put, rest_post, check_rest_server, check_response from .url_utils import cluster_metadata_url, experiment_url, get_local_urls from .config_utils import Config, Experiments -from .common_utils import get_yml_content, get_json_content, print_error, print_normal, print_warning, detect_process, detect_port, get_user, get_python_dir -from .constants import * -import random -import site -import time -from pathlib import Path +from .common_utils import get_yml_content, get_json_content, print_error, print_normal, \ + detect_port, get_user, get_python_dir +from .constants import NNICTL_HOME_DIR, ERROR_INFO, REST_TIME_OUT, EXPERIMENT_SUCCESS_INFO, LOG_HEADER, PACKAGE_REQUIREMENTS from .command_utils import check_output_command, kill_command from .nnictl_utils import update_experiment @@ -83,7 +82,8 @@ def _generate_installation_path(sitepackages_path): python_dir = os.getenv('VIRTUAL_ENV') else: python_sitepackage = site.getsitepackages()[0] - # If system-wide python is used, we will give priority to using `local sitepackage`--"usersitepackages()" given that nni exists there + # If system-wide python is used, we will give priority to using `local sitepackage`--"usersitepackages()" given + # that nni exists there if python_sitepackage.startswith('/usr') or python_sitepackage.startswith('/Library'): python_dir = try_installation_path_sequentially(site.getusersitepackages(), site.getsitepackages()[0]) else: @@ -98,7 +98,6 @@ def _generate_installation_path(sitepackages_path): def start_rest_server(port, platform, mode, config_file_name, experiment_id=None, log_dir=None, log_level=None): '''Run nni manager process''' - nni_config = Config(config_file_name) if detect_port(port): print_error('Port %s is used by another process, please reset the port!\n' \ 'You could use \'nnictl create --help\' to get help information' % port) @@ -114,7 +113,7 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None entry_dir = get_nni_installation_path() entry_file = os.path.join(entry_dir, 'main.js') - + node_command = 'node' if sys.platform == 'win32': node_command = os.path.join(entry_dir[:-3], 'Scripts', 'node.exe') @@ -132,7 +131,7 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None cmds += ['--experiment_id', experiment_id] stdout_full_path, stderr_full_path = get_log_path(config_file_name) with open(stdout_full_path, 'a+') as stdout_file, open(stderr_full_path, 'a+') as stderr_file: - time_now = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) + time_now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) #add time information in the header of log files log_header = LOG_HEADER % str(time_now) stdout_file.write(log_header) @@ -212,7 +211,7 @@ def setNNIManagerIp(experiment_config, port, config_file_name): if experiment_config.get('nniManagerIp') is None: return True, None ip_config_dict = dict() - ip_config_dict['nni_manager_ip'] = { 'nniManagerIp' : experiment_config['nniManagerIp'] } + ip_config_dict['nni_manager_ip'] = {'nniManagerIp': experiment_config['nniManagerIp']} response = rest_put(cluster_metadata_url(port), json.dumps(ip_config_dict), REST_TIME_OUT) err_message = None if not response or not response.status_code == 200: @@ -403,11 +402,12 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen stdout_full_path, stderr_full_path = get_log_path(config_file_name) with open(stdout_full_path, 'a+') as stdout_file, open(stderr_full_path, 'a+') as stderr_file: check_call([sys.executable, '-c', 'import %s'%(module_name)], stdout=stdout_file, stderr=stderr_file) - except CalledProcessError as e: + except CalledProcessError: print_error('some errors happen when import package %s.' %(package_name)) print_log_content(config_file_name) if package_name in PACKAGE_REQUIREMENTS: - print_error('If %s is not installed, it should be installed through \'nnictl package install --name %s\''%(package_name, package_name)) + print_error('If %s is not installed, it should be installed through '\ + '\'nnictl package install --name %s\''%(package_name, package_name)) exit(1) log_dir = experiment_config['logDir'] if experiment_config.get('logDir') else None log_level = experiment_config['logLevel'] if experiment_config.get('logLevel') else None @@ -416,7 +416,8 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen if log_level not in ['trace', 'debug'] and (args.debug or experiment_config.get('debug') is True): log_level = 'debug' # start rest server - rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], mode, config_file_name, experiment_id, log_dir, log_level) + rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], \ + mode, config_file_name, experiment_id, log_dir, log_level) nni_config.set_config('restServerPid', rest_process.pid) # Deal with annotation if experiment_config.get('useAnnotation'): @@ -450,8 +451,9 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen exit(1) if mode != 'view': # set platform configuration - set_platform_config(experiment_config['trainingServicePlatform'], experiment_config, args.port, config_file_name, rest_process) - + set_platform_config(experiment_config['trainingServicePlatform'], experiment_config, args.port,\ + config_file_name, rest_process) + # start a new experiment print_normal('Starting experiment...') # set debug configuration @@ -478,7 +480,8 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen #save experiment information nnictl_experiment_config = Experiments() - nnictl_experiment_config.add_experiment(experiment_id, args.port, start_time, config_file_name, experiment_config['trainingServicePlatform']) + nnictl_experiment_config.add_experiment(experiment_id, args.port, start_time, config_file_name,\ + experiment_config['trainingServicePlatform']) print_normal(EXPERIMENT_SUCCESS_INFO % (experiment_id, ' '.join(web_ui_url_list))) @@ -503,7 +506,6 @@ def manage_stopped_experiment(args, mode): experiment_config = Experiments() experiment_dict = experiment_config.get_all_experiments() experiment_id = None - experiment_endTime = None #find the latest stopped experiment if not args.id: print_error('Please set experiment id! \nYou could use \'nnictl {0} {id}\' to {0} a stopped experiment!\n' \ diff --git a/tools/nni_cmd/launcher_utils.py b/tools/nni_cmd/launcher_utils.py index da6a668064..f6c849abab 100644 --- a/tools/nni_cmd/launcher_utils.py +++ b/tools/nni_cmd/launcher_utils.py @@ -20,11 +20,11 @@ import os import json -from .config_schema import LOCAL_CONFIG_SCHEMA, REMOTE_CONFIG_SCHEMA, PAI_CONFIG_SCHEMA, KUBEFLOW_CONFIG_SCHEMA, FRAMEWORKCONTROLLER_CONFIG_SCHEMA, \ -tuner_schema_dict, advisor_schema_dict, assessor_schema_dict -from schema import SchemaMissingKeyError, SchemaForbiddenKeyError, SchemaUnexpectedTypeError, SchemaWrongKeyError, SchemaError -from .common_utils import get_json_content, print_error, print_warning, print_normal -from schema import Schema, And, Use, Optional, Regex, Or +from schema import SchemaError +from schema import Schema +from .config_schema import LOCAL_CONFIG_SCHEMA, REMOTE_CONFIG_SCHEMA, PAI_CONFIG_SCHEMA, KUBEFLOW_CONFIG_SCHEMA,\ + FRAMEWORKCONTROLLER_CONFIG_SCHEMA, tuner_schema_dict, advisor_schema_dict, assessor_schema_dict +from .common_utils import print_error, print_warning, print_normal def expand_path(experiment_config, key): '''Change '~' to user home directory''' @@ -164,11 +164,11 @@ def validate_common_content(experiment_config): print_error('Please set correct trainingServicePlatform!') exit(1) schema_dict = { - 'local': LOCAL_CONFIG_SCHEMA, - 'remote': REMOTE_CONFIG_SCHEMA, - 'pai': PAI_CONFIG_SCHEMA, - 'kubeflow': KUBEFLOW_CONFIG_SCHEMA, - 'frameworkcontroller': FRAMEWORKCONTROLLER_CONFIG_SCHEMA + 'local': LOCAL_CONFIG_SCHEMA, + 'remote': REMOTE_CONFIG_SCHEMA, + 'pai': PAI_CONFIG_SCHEMA, + 'kubeflow': KUBEFLOW_CONFIG_SCHEMA, + 'frameworkcontroller': FRAMEWORKCONTROLLER_CONFIG_SCHEMA } separate_schema_dict = { 'tuner': tuner_schema_dict, diff --git a/tools/nni_cmd/nnictl.py b/tools/nni_cmd/nnictl.py index 8da30fdfb7..88ee311423 100644 --- a/tools/nni_cmd/nnictl.py +++ b/tools/nni_cmd/nnictl.py @@ -20,14 +20,18 @@ import argparse +import os import pkg_resources +from colorama import init +from .common_utils import print_error from .launcher import create_experiment, resume_experiment, view_experiment from .updater import update_searchspace, update_concurrency, update_duration, update_trialnum, import_data -from .nnictl_utils import * -from .package_management import * -from .constants import * -from .tensorboard_utils import * -from colorama import init +from .nnictl_utils import stop_experiment, trial_ls, trial_kill, list_experiment, experiment_status,\ + log_trial, experiment_clean, platform_clean, experiment_list, \ + monitor_experiment, export_trials_data, trial_codegen, webui_url, get_config, log_stdout, log_stderr +from .package_management import package_install, package_show +from .constants import DEFAULT_REST_PORT +from .tensorboard_utils import start_tensorboard, stop_tensorboard init(autoreset=True) if os.environ.get('COVERAGE_PROCESS_START'): @@ -38,7 +42,7 @@ def nni_info(*args): if args[0].version: try: print(pkg_resources.get_distribution('nni').version) - except pkg_resources.ResolutionError as err: + except pkg_resources.ResolutionError: print_error('Get version failed, please use `pip3 list | grep nni` to check nni version!') else: print('please run "nnictl {positional argument} --help" to see nnictl guidance') diff --git a/tools/nni_cmd/nnictl_utils.py b/tools/nni_cmd/nnictl_utils.py index b6fada56e8..4cadce182d 100644 --- a/tools/nni_cmd/nnictl_utils.py +++ b/tools/nni_cmd/nnictl_utils.py @@ -20,15 +20,13 @@ import csv import os -import psutil import json -from datetime import datetime, timezone import time import re -from pathlib import Path -from pyhdfs import HdfsClient, HdfsFileNotFoundException import shutil -from subprocess import call, check_output +from datetime import datetime, timezone +from pathlib import Path +from pyhdfs import HdfsClient from nni_annotation import expand_annotations from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url, export_data_url @@ -102,7 +100,8 @@ def check_experiment_id(args, update=True): experiment_information = "" for key in running_experiment_list: experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \ - experiment_dict[key]['port'], experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])) + experiment_dict[key]['port'], experiment_dict[key].get('platform'), experiment_dict[key]['startTime'],\ + experiment_dict[key]['endTime'])) print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) exit(1) elif not running_experiment_list: @@ -157,23 +156,24 @@ def parse_ids(args): experiment_information = "" for key in running_experiment_list: experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \ - experiment_dict[key]['port'], experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])) + experiment_dict[key]['port'], experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], \ + experiment_dict[key]['endTime'])) print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) exit(1) else: result_list = running_experiment_list elif args.id.endswith('*'): - for id in running_experiment_list: - if id.startswith(args.id[:-1]): - result_list.append(id) + for expId in running_experiment_list: + if expId.startswith(args.id[:-1]): + result_list.append(expId) elif args.id in running_experiment_list: result_list.append(args.id) else: - for id in running_experiment_list: - if id.startswith(args.id): - result_list.append(id) + for expId in running_experiment_list: + if expId.startswith(args.id): + result_list.append(expId) if len(result_list) > 1: - print_error(args.id + ' is ambiguous, please choose ' + ' '.join(result_list) ) + print_error(args.id + ' is ambiguous, please choose ' + ' '.join(result_list)) return None if not result_list and (args.id or args.port): print_error('There are no experiments matched, please set correct experiment id or restful server port') @@ -235,7 +235,6 @@ def stop_experiment(args): for experiment_id in experiment_id_list: print_normal('Stoping experiment %s' % experiment_id) nni_config = Config(experiment_dict[experiment_id]['fileName']) - rest_port = nni_config.get_config('restServerPort') rest_pid = nni_config.get_config('restServerPid') if rest_pid: kill_command(rest_pid) @@ -249,7 +248,7 @@ def stop_experiment(args): nni_config.set_config('tensorboardPidList', []) print_normal('Stop experiment success.') experiment_config.update_experiment(experiment_id, 'status', 'STOPPED') - time_now = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) + time_now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) experiment_config.update_experiment(experiment_id, 'endTime', str(time_now)) def trial_ls(args): @@ -401,9 +400,9 @@ def local_clean(directory): print_normal('removing folder {0}'.format(directory)) try: shutil.rmtree(directory) - except FileNotFoundError as err: + except FileNotFoundError: print_error('{0} does not exist.'.format(directory)) - + def remote_clean(machine_list, experiment_id=None): '''clean up remote data''' for machine in machine_list: @@ -418,7 +417,7 @@ def remote_clean(machine_list, experiment_id=None): sftp = create_ssh_sftp_client(host, port, userName, passwd) print_normal('removing folder {0}'.format(host + ':' + str(port) + remote_dir)) remove_remote_directory(sftp, remote_dir) - + def hdfs_clean(host, user_name, output_dir, experiment_id=None): '''clean up hdfs data''' hdfs_client = HdfsClient(hosts='{0}:80'.format(host), user_name=user_name, webhdfs_path='/webhdfs/api/v1', timeout=5) @@ -475,7 +474,7 @@ def experiment_clean(args): machine_list = nni_config.get_config('experimentConfig').get('machineList') remote_clean(machine_list, experiment_id) elif platform == 'pai': - host = nni_config.get_config('experimentConfig').get('paiConfig').get('host') + host = nni_config.get_config('experimentConfig').get('paiConfig').get('host') user_name = nni_config.get_config('experimentConfig').get('paiConfig').get('userName') output_dir = nni_config.get_config('experimentConfig').get('trial').get('outputDir') hdfs_clean(host, user_name, output_dir, experiment_id) @@ -492,7 +491,7 @@ def experiment_clean(args): experiment_config = Experiments() print_normal('removing metadata of experiment {0}'.format(experiment_id)) experiment_config.remove_experiment(experiment_id) - print_normal('Done.') + print_normal('Done.') def get_platform_dir(config_content): '''get the dir list to be deleted''' @@ -505,8 +504,7 @@ def get_platform_dir(config_content): port = machine.get('port') dir_list.append(host + ':' + str(port) + '/tmp/nni') elif platform == 'pai': - pai_config = config_content.get('paiConfig') - host = config_content.get('paiConfig').get('host') + host = config_content.get('paiConfig').get('host') user_name = config_content.get('paiConfig').get('userName') output_dir = config_content.get('trial').get('outputDir') dir_list.append('server: {0}, path: {1}/nni'.format(host, user_name)) @@ -529,17 +527,15 @@ def platform_clean(args): print_normal('platform {0} not supported.'.format(platform)) exit(0) update_experiment() - experiment_config = Experiments() - experiment_dict = experiment_config.get_all_experiments() - id_list = list(experiment_dict.keys()) dir_list = get_platform_dir(config_content) if not dir_list: print_normal('No folder of NNI caches is found.') exit(1) while True: - print_normal('This command will remove below folders of NNI caches. If other users are using experiments on below hosts, it will be broken.') - for dir in dir_list: - print(' ' + dir) + print_normal('This command will remove below folders of NNI caches. If other users are using experiments' \ + ' on below hosts, it will be broken.') + for value in dir_list: + print(' ' + value) inputs = input('INFO: do you want to continue?[y/N]:') if not inputs.lower() or inputs.lower() in ['n', 'no']: exit(0) @@ -549,11 +545,9 @@ def platform_clean(args): break if platform == 'remote': machine_list = config_content.get('machineList') - for machine in machine_list: - remote_clean(machine_list, None) + remote_clean(machine_list, None) elif platform == 'pai': - pai_config = config_content.get('paiConfig') - host = config_content.get('paiConfig').get('host') + host = config_content.get('paiConfig').get('host') user_name = config_content.get('paiConfig').get('userName') output_dir = config_content.get('trial').get('outputDir') hdfs_clean(host, user_name, output_dir, None) @@ -618,7 +612,8 @@ def show_experiment_info(): return for key in experiment_id_list: print(EXPERIMENT_MONITOR_INFO % (key, experiment_dict[key]['status'], experiment_dict[key]['port'], \ - experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], get_time_interval(experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))) + experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], \ + get_time_interval(experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))) print(TRIAL_MONITOR_HEAD) running, response = check_rest_server_quick(experiment_dict[key]['port']) if running: @@ -627,7 +622,8 @@ def show_experiment_info(): content = json.loads(response.text) for index, value in enumerate(content): content[index] = convert_time_stamp_to_date(value) - print(TRIAL_MONITOR_CONTENT % (content[index].get('id'), content[index].get('startTime'), content[index].get('endTime'), content[index].get('status'))) + print(TRIAL_MONITOR_CONTENT % (content[index].get('id'), content[index].get('startTime'), \ + content[index].get('endTime'), content[index].get('status'))) print(TRIAL_MONITOR_TAIL) def monitor_experiment(args): diff --git a/tools/nni_cmd/package_management.py b/tools/nni_cmd/package_management.py index de8dbe62ec..32ed79496d 100644 --- a/tools/nni_cmd/package_management.py +++ b/tools/nni_cmd/package_management.py @@ -18,12 +18,10 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -import nni import os -import sys -from subprocess import call +import nni from .constants import PACKAGE_REQUIREMENTS -from .common_utils import print_normal, print_error +from .common_utils import print_error from .command_utils import install_requirements_command def process_install(package_name): diff --git a/tools/nni_cmd/ssh_utils.py b/tools/nni_cmd/ssh_utils.py index da707dac48..7453830323 100644 --- a/tools/nni_cmd/ssh_utils.py +++ b/tools/nni_cmd/ssh_utils.py @@ -20,7 +20,6 @@ import os from .common_utils import print_error -from subprocess import call from .command_utils import install_package_command def check_environment(): @@ -29,6 +28,8 @@ def check_environment(): import paramiko except: install_package_command('paramiko') + import paramiko + return paramiko def copy_remote_directory_to_local(sftp, remote_path, local_path): '''copy remote directory to local machine''' @@ -49,8 +50,7 @@ def copy_remote_directory_to_local(sftp, remote_path, local_path): def create_ssh_sftp_client(host_ip, port, username, password): '''create ssh client''' try: - check_environment() - import paramiko + paramiko = check_environment() conn = paramiko.Transport(host_ip, port) conn.connect(username=username, password=password) sftp = paramiko.SFTPClient.from_transport(conn) diff --git a/tools/nni_cmd/tensorboard_utils.py b/tools/nni_cmd/tensorboard_utils.py index b4578c34b0..9646b4de0e 100644 --- a/tools/nni_cmd/tensorboard_utils.py +++ b/tools/nni_cmd/tensorboard_utils.py @@ -19,21 +19,17 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. import os -import psutil import json -import datetime -import time -from subprocess import call, check_output, Popen, PIPE -from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response -from .config_utils import Config, Experiments -from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url, get_local_urls -from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT, COLOR_GREEN_FORMAT -import time -from .common_utils import print_normal, print_error, print_warning, detect_process, detect_port -from .nnictl_utils import * import re -from .ssh_utils import create_ssh_sftp_client, copy_remote_directory_to_local import tempfile +from subprocess import call, Popen +from .rest_utils import rest_get, check_rest_server_quick, check_response +from .config_utils import Config, Experiments +from .url_utils import trial_jobs_url, get_local_urls +from .constants import COLOR_GREEN_FORMAT, REST_TIME_OUT +from .common_utils import print_normal, print_error, detect_process, detect_port +from .nnictl_utils import check_experiment_id, check_experiment_id +from .ssh_utils import create_ssh_sftp_client, copy_remote_directory_to_local def parse_log_path(args, trial_content): '''parse log path''' @@ -43,7 +39,7 @@ def parse_log_path(args, trial_content): if args.trial_id and args.trial_id != 'all' and trial.get('id') != args.trial_id: continue pattern = r'(?P.+)://(?P.+):(?P.*)' - match = re.search(pattern,trial['logPath']) + match = re.search(pattern, trial['logPath']) if match: path_list.append(match.group('path')) host_list.append(match.group('host')) @@ -94,7 +90,8 @@ def start_tensorboard_process(args, nni_config, path_list, temp_nni_path): if detect_port(args.port): print_error('Port %s is used by another process, please reset port!' % str(args.port)) exit(1) - with open(os.path.join(temp_nni_path, 'tensorboard_stdout'), 'a+') as stdout_file, open(os.path.join(temp_nni_path, 'tensorboard_stderr'), 'a+') as stderr_file: + with open(os.path.join(temp_nni_path, 'tensorboard_stdout'), 'a+') as stdout_file, \ + open(os.path.join(temp_nni_path, 'tensorboard_stderr'), 'a+') as stderr_file: cmds = ['tensorboard', '--logdir', format_tensorboard_log_path(path_list), '--port', str(args.port)] tensorboard_process = Popen(cmds, stdout=stdout_file, stderr=stderr_file) url_list = get_local_urls(args.port) diff --git a/tools/nni_cmd/updater.py b/tools/nni_cmd/updater.py index 9258d73f0a..07ae6123cb 100644 --- a/tools/nni_cmd/updater.py +++ b/tools/nni_cmd/updater.py @@ -25,7 +25,7 @@ from .url_utils import experiment_url, import_data_url from .config_utils import Config from .common_utils import get_json_content, print_normal, print_error, print_warning -from .nnictl_utils import check_experiment_id, get_experiment_port, get_config_filename +from .nnictl_utils import get_experiment_port, get_config_filename from .launcher_utils import parse_time from .constants import REST_TIME_OUT, TUNERS_SUPPORTING_IMPORT_DATA, TUNERS_NO_NEED_TO_IMPORT_DATA diff --git a/tools/nni_cmd/url_utils.py b/tools/nni_cmd/url_utils.py index c50b2551d2..05cfa8e66f 100644 --- a/tools/nni_cmd/url_utils.py +++ b/tools/nni_cmd/url_utils.py @@ -18,8 +18,8 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +import socket import psutil -from socket import AddressFamily BASE_URL = 'http://localhost' @@ -83,8 +83,8 @@ def tensorboard_url(port): def get_local_urls(port): '''get urls of local machine''' url_list = [] - for name, info in psutil.net_if_addrs().items(): + for _, info in psutil.net_if_addrs().items(): for addr in info: - if AddressFamily.AF_INET == addr.family: + if socket.AddressFamily.AF_INET == addr.family: url_list.append('http://{}:{}'.format(addr.address, port)) return url_list From e259d109fea97fbce6f81b3081390fcb99d594fa Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Thu, 31 Oct 2019 17:55:41 +0800 Subject: [PATCH 02/18] fix id error --- tools/nni_cmd/config_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/nni_cmd/config_utils.py b/tools/nni_cmd/config_utils.py index c5a36b374d..c7c88bcf3e 100644 --- a/tools/nni_cmd/config_utils.py +++ b/tools/nni_cmd/config_utils.py @@ -85,7 +85,7 @@ def add_experiment(self, expId, port, time, file_name, platform): def update_experiment(self, expId, key, value): '''Update experiment''' - if id not in self.experiments: + if expId not in self.experiments: return False self.experiments[expId][key] = value self.write_file() From f9d33127b659cd9605de29d98a71102186d2c60b Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Thu, 14 Nov 2019 17:23:47 +0800 Subject: [PATCH 03/18] init --- src/nni_manager/main.ts | 9 +- .../rest_server/restValidationSchemas.ts | 8 ++ .../common/trialConfigMetadataKey.ts | 1 + .../pai/paiTrainingService.ts | 63 +++++----- .../training_service/paiLite/paiLiteConfig.ts | 43 +++++++ .../training_service/paiLite/paiLiteData.ts | 71 +++++++++++ .../paiLite/paiLiteTrainingService.ts | 114 ++++++++++++++++++ tools/nni_cmd/config_schema.py | 20 ++- tools/nni_cmd/launcher.py | 28 ++++- tools/nni_cmd/launcher_utils.py | 7 +- 10 files changed, 326 insertions(+), 38 deletions(-) create mode 100644 src/nni_manager/training_service/paiLite/paiLiteConfig.ts create mode 100644 src/nni_manager/training_service/paiLite/paiLiteData.ts create mode 100644 src/nni_manager/training_service/paiLite/paiLiteTrainingService.ts diff --git a/src/nni_manager/main.ts b/src/nni_manager/main.ts index fec5a8819e..9e913a35f2 100644 --- a/src/nni_manager/main.ts +++ b/src/nni_manager/main.ts @@ -37,6 +37,7 @@ import { FrameworkControllerTrainingService } from './training_service/kubernete import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService'; import { LocalTrainingService } from './training_service/local/localTrainingService'; import { PAITrainingService } from './training_service/pai/paiTrainingService'; +import { PAILiteTrainingService } from './training_service/paiLite/paiLiteTrainingService'; import { RemoteMachineTrainingService } from './training_service/remote_machine/remoteMachineTrainingService'; @@ -62,6 +63,10 @@ async function initContainer(platformMode: string): Promise { Container.bind(TrainingService) .to(PAITrainingService) .scope(Scope.Singleton); + } else if (platformMode === 'paiLite') { + Container.bind(TrainingService) + .to(PAILiteTrainingService) + .scope(Scope.Singleton); } else if (platformMode === 'kubeflow') { Container.bind(TrainingService) .to(KubeflowTrainingService) @@ -89,7 +94,7 @@ async function initContainer(platformMode: string): Promise { function usage(): void { console.info('usage: node main.js --port --mode \ - --start_mode --experiment_id '); + --start_mode --experiment_id '); } const strPort: string = parseArg(['--port', '-p']); @@ -101,7 +106,7 @@ if (!strPort || strPort.length === 0) { const port: number = parseInt(strPort, 10); const mode: string = parseArg(['--mode', '-m']); -if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller'].includes(mode)) { +if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiLite'].includes(mode)) { console.log(`FATAL: unknown mode: ${mode}`); usage(); process.exit(1); diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts index 69a7ec1d90..ba5a26b341 100644 --- a/src/nni_manager/rest_server/restValidationSchemas.ts +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -111,6 +111,14 @@ export namespace ValidationSchemas { token: joi.string().min(1), host: joi.string().min(1).required() }), + pai_lite_config: joi.object({ + userName: joi.string().min(1).required(), + passWord: joi.string().min(1), + token: joi.string().min(1), + host: joi.string().min(1).required(), + nniManagerNFSMountPath: joi.string().min(1).required(), + containerNFSMountPath: joi.string().min(1).required() + }), kubeflow_config: joi.object({ operator: joi.string().min(1).required(), storage: joi.string().min(1), diff --git a/src/nni_manager/training_service/common/trialConfigMetadataKey.ts b/src/nni_manager/training_service/common/trialConfigMetadataKey.ts index 928c0587b2..66a89d338d 100644 --- a/src/nni_manager/training_service/common/trialConfigMetadataKey.ts +++ b/src/nni_manager/training_service/common/trialConfigMetadataKey.ts @@ -30,6 +30,7 @@ export enum TrialConfigMetadataKey { MULTI_PHASE = 'multiPhase', RANDOM_SCHEDULER = 'random_scheduler', PAI_CLUSTER_CONFIG = 'pai_config', + PAI_LITE_CLUSTER_CONFIG = 'pai_lite_config', KUBEFLOW_CLUSTER_CONFIG = 'kubeflow_config', NNI_MANAGER_IP = 'nni_manager_ip', FRAMEWORKCONTROLLER_CLUSTER_CONFIG = 'frameworkcontroller_config', diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index d741931b29..11b792c734 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -55,30 +55,30 @@ import * as WebHDFS from 'webhdfs'; */ @component.Singleton class PAITrainingService implements TrainingService { - private readonly log!: Logger; - private readonly metricsEmitter: EventEmitter; - private readonly trialJobsMap: Map; - private readonly expRootDir: string; - private paiTrialConfig: NNIPAITrialConfig | undefined; - private paiClusterConfig?: PAIClusterConfig; - private readonly jobQueue: string[]; - private stopping: boolean = false; + protected readonly log!: Logger; + protected readonly metricsEmitter: EventEmitter; + protected readonly trialJobsMap: Map; + protected readonly expRootDir: string; + protected paiTrialConfig: NNIPAITrialConfig | undefined; + protected paiClusterConfig?: PAIClusterConfig; + protected readonly jobQueue: string[]; + protected stopping: boolean = false; // tslint:disable-next-line:no-any - private hdfsClient: any; - private paiToken? : string; - private paiTokenUpdateTime?: number; - private readonly paiTokenUpdateInterval: number; - private readonly experimentId! : string; - private readonly paiJobCollector : PAIJobInfoCollector; - private paiRestServerPort?: number; - private nniManagerIpConfig?: NNIManagerIpConfig; - private copyExpCodeDirPromise?: Promise; - private copyAuthFilePromise?: Promise; - private versionCheck: boolean = true; - private logCollection: string; - private isMultiPhase: boolean = false; - private authFileHdfsPath: string | undefined = undefined; - private portList?: string | undefined; + protected hdfsClient: any; + protected paiToken? : string; + protected paiTokenUpdateTime?: number; + protected readonly paiTokenUpdateInterval: number; + protected readonly experimentId! : string; + protected readonly paiJobCollector : PAIJobInfoCollector; + protected paiRestServerPort?: number; + protected nniManagerIpConfig?: NNIManagerIpConfig; + protected copyExpCodeDirPromise?: Promise; + protected copyAuthFilePromise?: Promise; + protected versionCheck: boolean = true; + protected logCollection: string; + protected isMultiPhase: boolean = false; + protected authFileHdfsPath: string | undefined = undefined; + protected portList?: string | undefined; constructor() { this.log = getLogger(); @@ -239,7 +239,8 @@ class PAITrainingService implements TrainingService { // tslint:disable-next-line:max-func-body-length public async setClusterMetadata(key: string, value: string): Promise { const deferred : Deferred = new Deferred(); - + console.log('-------------------242--------------') + console.log(key) switch (key) { case TrialConfigMetadataKey.NNI_MANAGER_IP: this.nniManagerIpConfig = JSON.parse(value); @@ -311,7 +312,7 @@ class PAITrainingService implements TrainingService { break; default: //Reject for unknown keys - throw new Error(`Uknown key: ${key}`); + deferred.reject(new Error(`Uknown key: ${key}`)); } return deferred.promise; @@ -350,7 +351,7 @@ class PAITrainingService implements TrainingService { } // tslint:disable-next-line:max-func-body-length - private async submitTrialJobToPAI(trialJobId: string): Promise { + protected async submitTrialJobToPAI(trialJobId: string): Promise { const deferred : Deferred = new Deferred(); const trialJobDetail: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); @@ -501,7 +502,7 @@ class PAITrainingService implements TrainingService { return deferred.promise; } - private async statusCheckingLoop(): Promise { + protected async statusCheckingLoop(): Promise { while (!this.stopping) { if(this.paiClusterConfig && this.paiClusterConfig.passWord) { try { @@ -523,7 +524,7 @@ class PAITrainingService implements TrainingService { } } - private async submitJobLoop(): Promise { + protected async submitJobLoop(): Promise { while (!this.stopping) { while (!this.stopping && this.jobQueue.length > 0) { const trialJobId: string = this.jobQueue[0]; @@ -542,7 +543,7 @@ class PAITrainingService implements TrainingService { /** * Update pai token by the interval time or initialize the pai token */ - private async updatePaiToken(): Promise { + protected async updatePaiToken(): Promise { const deferred : Deferred = new Deferred(); const currentTime: number = new Date().getTime(); @@ -594,7 +595,7 @@ class PAITrainingService implements TrainingService { .finally(() => { clearTimeout(timeoutId); }); } - private async writeParameterFile(trialJobId: string, hyperParameters: HyperParameters): Promise { + protected async writeParameterFile(trialJobId: string, hyperParameters: HyperParameters): Promise { if (this.paiClusterConfig === undefined) { throw new Error('PAI Cluster config is not initialized'); } @@ -618,7 +619,7 @@ class PAITrainingService implements TrainingService { }); } - private postParameterFileMeta(parameterFileMeta: ParameterFileMeta): Promise { + protected postParameterFileMeta(parameterFileMeta: ParameterFileMeta): Promise { const deferred : Deferred = new Deferred(); const restServer: PAIJobRestServer = component.get(PAIJobRestServer); const req: request.Options = { diff --git a/src/nni_manager/training_service/paiLite/paiLiteConfig.ts b/src/nni_manager/training_service/paiLite/paiLiteConfig.ts new file mode 100644 index 0000000000..8f1a7446f6 --- /dev/null +++ b/src/nni_manager/training_service/paiLite/paiLiteConfig.ts @@ -0,0 +1,43 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. + * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +'use strict'; +import { PAIClusterConfig } from '../pai/paiConfig'; + +/** + * PAILite cluster configuration + */ +export class PAILiteClusterConfig extends PAIClusterConfig { + public readonly nniManagerNFSMountPath: string; + public readonly containerNFSMountPath: string; + + /** + * Constructor + * @param userName User name of PAI Cluster + * @param passWord password of PAI Cluster + * @param host Host IP of PAI Cluster + * @param token PAI token of PAI Cluster + */ + constructor(userName: string, host : string, nniManagerNFSMountPath: string, + containerNFSMountPath: string, passWord?: string, token?: string) { + super(userName, host, passWord, token); + this.nniManagerNFSMountPath = nniManagerNFSMountPath; + this.containerNFSMountPath = containerNFSMountPath; + } +} diff --git a/src/nni_manager/training_service/paiLite/paiLiteData.ts b/src/nni_manager/training_service/paiLite/paiLiteData.ts new file mode 100644 index 0000000000..1687f3841e --- /dev/null +++ b/src/nni_manager/training_service/paiLite/paiLiteData.ts @@ -0,0 +1,71 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. + * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +'use strict'; + +import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService'; + +/** + * PAI trial job detail + */ +export class PAITrialJobDetail implements TrialJobDetail { + public id: string; + public status: TrialJobStatus; + public paiJobName: string; + public submitTime: number; + public startTime?: number; + public endTime?: number; + public tags?: string[]; + public url?: string; + public workingDirectory: string; + public form: TrialJobApplicationForm; + public isEarlyStopped?: boolean; + + constructor(id: string, status: TrialJobStatus, paiJobName : string, + submitTime: number, workingDirectory: string, form: TrialJobApplicationForm) { + this.id = id; + this.status = status; + this.paiJobName = paiJobName; + this.submitTime = submitTime; + this.workingDirectory = workingDirectory; + this.form = form; + this.tags = []; + } +} + +export const PAI_INSTALL_NNI_SHELL_FORMAT: string = +`#!/bin/bash +if python3 -c 'import nni' > /dev/null 2>&1; then + # nni module is already installed, skip + return +else + # Install nni + python3 -m pip install --user nni +fi`; + +export const PAI_TRIAL_COMMAND_FORMAT: string = +`export NNI_PLATFORM=paiLite NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \ +&& ls $NNI_SYS_DIR \ +&& cd $NNI_SYS_DIR && sh install_nni.sh \ +&& python3 -m nni_trial_tool.trial_keeper --trial_command '{6}' --nnimanager_ip '{7}' --nnimanager_port '{8}' \ +--nni_manager_version '{9}' --log_collection '{10}'`; + +// tslint:disable:no-http-string +export const PAI_LOG_PATH_FORMAT: string = +`http://{0}/webhdfs/explorer.html#{1}`; diff --git a/src/nni_manager/training_service/paiLite/paiLiteTrainingService.ts b/src/nni_manager/training_service/paiLite/paiLiteTrainingService.ts new file mode 100644 index 0000000000..6034e7b935 --- /dev/null +++ b/src/nni_manager/training_service/paiLite/paiLiteTrainingService.ts @@ -0,0 +1,114 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. + * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +'use strict'; + +import * as cpp from 'child-process-promise'; +import * as fs from 'fs'; +import * as path from 'path'; +// tslint:disable-next-line:no-implicit-dependencies +import * as request from 'request'; +import * as component from '../../common/component'; + +import { EventEmitter } from 'events'; +import { Deferred } from 'ts-deferred'; +import { String } from 'typescript-string-operations'; +import { MethodNotImplementedError } from '../../common/errors'; +import { getExperimentId } from '../../common/experimentStartupInfo'; +import { getLogger, Logger } from '../../common/log'; +import { + HyperParameters, NNIManagerIpConfig, TrainingService, + TrialJobApplicationForm, TrialJobDetail, TrialJobMetric +} from '../../common/trainingService'; +import { delay, generateParamFileName, + getExperimentRootDir, getIPV4Address, getVersion, uniqueString, unixPathJoin } from '../../common/utils'; +import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; +import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; +import { execMkdir, validateCodeDir, execCopydir } from '../common/util'; +import { PAILiteClusterConfig } from './paiLiteConfig'; +import { PAI_LOG_PATH_FORMAT, PAI_TRIAL_COMMAND_FORMAT, PAITrialJobDetail } from './paiLiteData'; +import { PAITrainingService } from '../pai/paiTrainingService'; + +import * as WebHDFS from 'webhdfs'; + +/** + * Training Service implementation for OpenPAI (Open Platform for AI) + * Refer https://github.com/Microsoft/pai for more info about OpenPAI + */ +@component.Singleton +class PAILiteTrainingService extends PAITrainingService { + private containerRootDir: string = ""; + protected paiLiteClusterConfig?: PAILiteClusterConfig; + + constructor() { + super(); + } + + public async submitTrialJob(form: TrialJobApplicationForm): Promise { + console.log('---------------82-----------') + const deferred : Deferred = new Deferred(); + return deferred.promise; + } + + // tslint:disable:no-http-string + public cancelTrialJob(trialJobId: string, isEarlyStopped: boolean = false): Promise { + return super.cancelTrialJob(trialJobId, isEarlyStopped); + } + + // tslint:disable: no-unsafe-any no-any + // tslint:disable-next-line:max-func-body-length + public async setClusterMetadata(key: string, value: string): Promise { + const deferred : Deferred = new Deferred(); + switch (key) { + case TrialConfigMetadataKey.NNI_MANAGER_IP: + return super.setClusterMetadata(key, value); + + case TrialConfigMetadataKey.PAI_CLUSTER_CONFIG: + deferred.resolve(); + break; + + case TrialConfigMetadataKey.TRIAL_CONFIG: + + deferred.resolve(); + break; + case TrialConfigMetadataKey.VERSION_CHECK: + return super.setClusterMetadata(key, value); + + case TrialConfigMetadataKey.LOG_COLLECTION: + return super.setClusterMetadata(key, value); + + case TrialConfigMetadataKey.MULTI_PHASE: + return super.setClusterMetadata(key, value); + + default: + //Reject for unknown keys + deferred.reject(new Error(`Uknown key: ${key}`)); + } + + return deferred.promise; + } + + // tslint:disable-next-line:max-func-body-length + public async submitTrialJobToPAI(trialJobId: string): Promise { + const deferred : Deferred = new Deferred(); + return deferred.promise; + } +} + +export { PAILiteTrainingService }; diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index 5eb9538ad8..b614ba20c1 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -49,7 +49,7 @@ def setPathCheck(key): 'trialConcurrency': setNumberRange('trialConcurrency', int, 1, 99999), Optional('maxExecDuration'): And(Regex(r'^[1-9][0-9]*[s|m|h|d]$', error='ERROR: maxExecDuration format is [digit]{s,m,h,d}')), Optional('maxTrialNum'): setNumberRange('maxTrialNum', int, 1, 99999), - 'trainingServicePlatform': setChoice('trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller'), + 'trainingServicePlatform': setChoice('trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller', 'paiLite'), Optional('searchSpacePath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'searchSpacePath'), Optional('multiPhase'): setType('multiPhase', bool), Optional('multiThread'): setType('multiThread', bool), @@ -276,6 +276,22 @@ def setPathCheck(key): }) } +pai_lite_config_schema = { + 'paiLiteConfig': Or({ + 'userName': setType('userName', str), + 'passWord': setType('passWord', str), + 'host': setType('host', str), + 'nniManagerNFSMountPath': setPathCheck('nniManagerNFSMountPath'), + 'containerNFSMountPath': setType('containerNFSMountPath', str) + }, { + 'userName': setType('userName', str), + 'token': setType('token', str), + 'host': setType('host', str), + 'nniManagerNFSMountPath': setPathCheck('nniManagerNFSMountPath'), + 'containerNFSMountPath': setType('containerNFSMountPath', str) + }) +} + kubeflow_trial_schema = { 'trial':{ 'codeDir': setPathCheck('codeDir'), @@ -413,6 +429,8 @@ def setPathCheck(key): PAI_CONFIG_SCHEMA = Schema({**common_schema, **pai_trial_schema, **pai_config_schema}) +PAI_LITE_CONFIG_SCHEMA = Schema({**common_schema, **pai_trial_schema, **pai_lite_config_schema}) + KUBEFLOW_CONFIG_SCHEMA = Schema({**common_schema, **kubeflow_trial_schema, **kubeflow_config_schema}) FRAMEWORKCONTROLLER_CONFIG_SCHEMA = Schema({**common_schema, **frameworkcontroller_trial_schema, **frameworkcontroller_config_schema}) diff --git a/tools/nni_cmd/launcher.py b/tools/nni_cmd/launcher.py index f99f8dfe43..df1f2cbae2 100644 --- a/tools/nni_cmd/launcher.py +++ b/tools/nni_cmd/launcher.py @@ -140,7 +140,7 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None from subprocess import CREATE_NEW_PROCESS_GROUP process = Popen(cmds, cwd=entry_dir, stdout=stdout_file, stderr=stderr_file, creationflags=CREATE_NEW_PROCESS_GROUP) else: - process = Popen(cmds, cwd=entry_dir, stdout=stdout_file, stderr=stderr_file) + process = Popen(cmds, cwd=entry_dir) return process, str(time_now) def set_trial_config(experiment_config, port, config_file_name): @@ -242,6 +242,25 @@ def set_pai_config(experiment_config, port, config_file_name): #set trial_config return set_trial_config(experiment_config, port, config_file_name), err_message +def set_pai_lite_config(experiment_config, port, config_file_name): + '''set pai configuration''' + pai_lite_config_data = dict() + pai_lite_config_data['pai_lite_config'] = experiment_config['paiLiteConfig'] + response = rest_put(cluster_metadata_url(port), json.dumps(pai_lite_config_data), REST_TIME_OUT) + err_message = None + if not response or not response.status_code == 200: + if response is not None: + err_message = response.text + _, stderr_full_path = get_log_path(config_file_name) + with open(stderr_full_path, 'a+') as fout: + fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':'))) + return False, err_message + result, message = setNNIManagerIp(experiment_config, port, config_file_name) + if not result: + return result, message + #set trial_config + return set_trial_config(experiment_config, port, config_file_name), err_message + def set_kubeflow_config(experiment_config, port, config_file_name): '''set kubeflow configuration''' kubeflow_config_data = dict() @@ -338,6 +357,11 @@ def set_experiment(experiment_config, mode, port, config_file_name): {'key': 'pai_config', 'value': experiment_config['paiConfig']}) request_data['clusterMetaData'].append( {'key': 'trial_config', 'value': experiment_config['trial']}) + elif experiment_config['trainingServicePlatform'] == 'paiLite': + request_data['clusterMetaData'].append( + {'key': 'pai_lite_config', 'value': experiment_config['paiLiteConfig']}) + request_data['clusterMetaData'].append( + {'key': 'trial_config', 'value': experiment_config['trial']}) elif experiment_config['trainingServicePlatform'] == 'kubeflow': request_data['clusterMetaData'].append( {'key': 'kubeflow_config', 'value': experiment_config['kubeflowConfig']}) @@ -369,6 +393,8 @@ def set_platform_config(platform, experiment_config, port, config_file_name, res config_result, err_msg = set_remote_config(experiment_config, port, config_file_name) elif platform == 'pai': config_result, err_msg = set_pai_config(experiment_config, port, config_file_name) + elif platform == 'paiLite': + config_result, err_msg = set_pai_lite_config(experiment_config, port, config_file_name) elif platform == 'kubeflow': config_result, err_msg = set_kubeflow_config(experiment_config, port, config_file_name) elif platform == 'frameworkcontroller': diff --git a/tools/nni_cmd/launcher_utils.py b/tools/nni_cmd/launcher_utils.py index f6c849abab..01da6ef59f 100644 --- a/tools/nni_cmd/launcher_utils.py +++ b/tools/nni_cmd/launcher_utils.py @@ -22,7 +22,7 @@ import json from schema import SchemaError from schema import Schema -from .config_schema import LOCAL_CONFIG_SCHEMA, REMOTE_CONFIG_SCHEMA, PAI_CONFIG_SCHEMA, KUBEFLOW_CONFIG_SCHEMA,\ +from .config_schema import LOCAL_CONFIG_SCHEMA, REMOTE_CONFIG_SCHEMA, PAI_CONFIG_SCHEMA, PAI_LITE_CONFIG_SCHEMA, KUBEFLOW_CONFIG_SCHEMA,\ FRAMEWORKCONTROLLER_CONFIG_SCHEMA, tuner_schema_dict, advisor_schema_dict, assessor_schema_dict from .common_utils import print_error, print_warning, print_normal @@ -160,13 +160,14 @@ def validate_kubeflow_operators(experiment_config): def validate_common_content(experiment_config): '''Validate whether the common values in experiment_config is valid''' if not experiment_config.get('trainingServicePlatform') or \ - experiment_config.get('trainingServicePlatform') not in ['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller']: + experiment_config.get('trainingServicePlatform') not in ['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiLite']: print_error('Please set correct trainingServicePlatform!') exit(1) schema_dict = { 'local': LOCAL_CONFIG_SCHEMA, 'remote': REMOTE_CONFIG_SCHEMA, 'pai': PAI_CONFIG_SCHEMA, + 'paiLite': PAI_LITE_CONFIG_SCHEMA, 'kubeflow': KUBEFLOW_CONFIG_SCHEMA, 'frameworkcontroller': FRAMEWORKCONTROLLER_CONFIG_SCHEMA } @@ -278,7 +279,7 @@ def validate_machine_list(experiment_config): def validate_pai_trial_conifg(experiment_config): '''validate the trial config in pai platform''' - if experiment_config.get('trainingServicePlatform') == 'pai': + if experiment_config.get('trainingServicePlatform') in ['pai', 'paiLite']: if experiment_config.get('trial').get('shmMB') and \ experiment_config['trial']['shmMB'] > experiment_config['trial']['memoryMB']: print_error('shmMB should be no more than memoryMB!') From 27032ccbe67bafecea3d864d4f1082a886281934 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Thu, 12 Dec 2019 23:47:15 +0800 Subject: [PATCH 04/18] basic work version --- .../rest_server/restValidationSchemas.ts | 5 +- .../training_service/pai/paiData.ts | 6 +- .../pai/paiJobInfoCollector.ts | 4 +- .../pai/paiTrainingService.ts | 46 +-- .../training_service/paiLite/paiLiteConfig.ts | 30 +- .../training_service/paiLite/paiLiteData.ts | 36 +-- .../paiLite/paiLiteJobRestServer.ts | 76 +++++ .../paiLite/paiLiteTrainingService.ts | 265 ++++++++++++++++-- src/sdk/pynni/nni/platform/__init__.py | 2 +- tools/nni_cmd/config_schema.py | 26 +- tools/nni_cmd/launcher.py | 2 +- 11 files changed, 386 insertions(+), 112 deletions(-) create mode 100644 src/nni_manager/training_service/paiLite/paiLiteJobRestServer.ts diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts index d60caedd17..d645276b0b 100644 --- a/src/nni_manager/rest_server/restValidationSchemas.ts +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -36,6 +36,9 @@ export namespace ValidationSchemas { virtualCluster: joi.string(), shmMB: joi.number(), authFile: joi.string(), + nniManagerNFSMountPath: joi.string().min(1), + containerNFSMountPath: joi.string().min(1), + paiStoragePlugin: joi.string().min(1), nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), portList: joi.array().items(joi.object({ label: joi.string().required(), @@ -100,8 +103,6 @@ export namespace ValidationSchemas { passWord: joi.string().min(1), token: joi.string().min(1), host: joi.string().min(1).required(), - nniManagerNFSMountPath: joi.string().min(1).required(), - containerNFSMountPath: joi.string().min(1).required() }), kubeflow_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase operator: joi.string().min(1).required(), diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai/paiData.ts index 011c9ff318..6433c1ffa6 100644 --- a/src/nni_manager/training_service/pai/paiData.ts +++ b/src/nni_manager/training_service/pai/paiData.ts @@ -19,11 +19,11 @@ export class PAITrialJobDetail implements TrialJobDetail { public url?: string; public workingDirectory: string; public form: TrialJobApplicationForm; - public hdfsLogPath: string; + public logPath: string; public isEarlyStopped?: boolean; constructor(id: string, status: TrialJobStatus, paiJobName: string, - submitTime: number, workingDirectory: string, form: TrialJobApplicationForm, hdfsLogPath: string) { + submitTime: number, workingDirectory: string, form: TrialJobApplicationForm, logPath: string) { this.id = id; this.status = status; this.paiJobName = paiJobName; @@ -31,7 +31,7 @@ export class PAITrialJobDetail implements TrialJobDetail { this.workingDirectory = workingDirectory; this.form = form; this.tags = []; - this.hdfsLogPath = hdfsLogPath; + this.logPath = logPath; } } diff --git a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts index 04a10a8ac1..6304db3639 100644 --- a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts +++ b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts @@ -114,8 +114,8 @@ export class PAIJobInfoCollector { paiTrialJob.endTime = response.body.jobStatus.completedTime; } // Set pai trial job's url to WebHDFS output path - if (paiTrialJob.hdfsLogPath !== undefined) { - paiTrialJob.url += `,${paiTrialJob.hdfsLogPath}`; + if (paiTrialJob.logPath !== undefined) { + paiTrialJob.url += `,${paiTrialJob.logPath}`; } } } diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index eba857f56f..d5669c20ad 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -36,29 +36,29 @@ import * as WebHDFS from 'webhdfs'; */ @component.Singleton class PAITrainingService implements TrainingService { - private readonly log!: Logger; - private readonly metricsEmitter: EventEmitter; - private readonly trialJobsMap: Map; - private readonly expRootDir: string; - private paiTrialConfig: NNIPAITrialConfig | undefined; - private paiClusterConfig?: PAIClusterConfig; - private readonly jobQueue: string[]; - private stopping: boolean = false; + protected readonly log!: Logger; + protected readonly metricsEmitter: EventEmitter; + protected readonly trialJobsMap: Map; + protected readonly expRootDir: string; + protected paiTrialConfig: NNIPAITrialConfig | undefined; + protected paiClusterConfig?: PAIClusterConfig; + protected readonly jobQueue: string[]; + protected stopping: boolean = false; private hdfsClient: any; - private paiToken? : string; - private paiTokenUpdateTime?: number; - private readonly paiTokenUpdateInterval: number; - private readonly experimentId!: string; - private readonly paiJobCollector: PAIJobInfoCollector; - private paiRestServerPort?: number; - private nniManagerIpConfig?: NNIManagerIpConfig; + protected paiToken? : string; + protected paiTokenUpdateTime?: number; + protected readonly paiTokenUpdateInterval: number; + protected readonly experimentId!: string; + protected readonly paiJobCollector: PAIJobInfoCollector; + protected paiRestServerPort?: number; + protected nniManagerIpConfig?: NNIManagerIpConfig; private copyExpCodeDirPromise?: Promise; private copyAuthFilePromise?: Promise; - private versionCheck: boolean = true; - private logCollection: string; - private isMultiPhase: boolean = false; - private authFileHdfsPath: string | undefined = undefined; - private portList?: string | undefined; + protected versionCheck: boolean = true; + protected logCollection: string; + protected isMultiPhase: boolean = false; + protected authFileHdfsPath: string | undefined = undefined; + protected portList?: string | undefined; constructor() { this.log = getLogger(); @@ -323,7 +323,7 @@ class PAITrainingService implements TrainingService { return this.metricsEmitter; } - private async submitTrialJobToPAI(trialJobId: string): Promise { + protected async submitTrialJobToPAI(trialJobId: string): Promise { const deferred: Deferred = new Deferred(); const trialJobDetail: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); @@ -512,7 +512,7 @@ class PAITrainingService implements TrainingService { /** * Update pai token by the interval time or initialize the pai token */ - private async updatePaiToken(): Promise { + protected async updatePaiToken(): Promise { const deferred: Deferred = new Deferred(); const currentTime: number = new Date().getTime(); @@ -588,7 +588,7 @@ class PAITrainingService implements TrainingService { }); } - private postParameterFileMeta(parameterFileMeta: ParameterFileMeta): Promise { + protected postParameterFileMeta(parameterFileMeta: ParameterFileMeta): Promise { const deferred: Deferred = new Deferred(); const restServer: PAIJobRestServer = component.get(PAIJobRestServer); const req: request.Options = { diff --git a/src/nni_manager/training_service/paiLite/paiLiteConfig.ts b/src/nni_manager/training_service/paiLite/paiLiteConfig.ts index 8f1a7446f6..5861b0adb6 100644 --- a/src/nni_manager/training_service/paiLite/paiLiteConfig.ts +++ b/src/nni_manager/training_service/paiLite/paiLiteConfig.ts @@ -18,26 +18,30 @@ */ 'use strict'; -import { PAIClusterConfig } from '../pai/paiConfig'; +import {TrialConfig} from '../common/trialConfig'; /** - * PAILite cluster configuration + * PAILite trial configuration */ -export class PAILiteClusterConfig extends PAIClusterConfig { +export class NNIPAILiteTrialConfig extends TrialConfig { + public readonly cpuNum: number; + public readonly memoryMB: number; + public readonly image: string; + public virtualCluster?: string; public readonly nniManagerNFSMountPath: string; public readonly containerNFSMountPath: string; + public readonly paiStoragePlugin: string; - /** - * Constructor - * @param userName User name of PAI Cluster - * @param passWord password of PAI Cluster - * @param host Host IP of PAI Cluster - * @param token PAI token of PAI Cluster - */ - constructor(userName: string, host : string, nniManagerNFSMountPath: string, - containerNFSMountPath: string, passWord?: string, token?: string) { - super(userName, host, passWord, token); + constructor(command: string, codeDir: string, gpuNum: number, cpuNum: number, memoryMB: number, + image: string, virtualCluster: string, nniManagerNFSMountPath: string, containerNFSMountPath: string, + paiStoragePlugin: string) { + super(command, codeDir, gpuNum); + this.cpuNum = cpuNum; + this.memoryMB = memoryMB; + this.image = image; + this.virtualCluster = virtualCluster; this.nniManagerNFSMountPath = nniManagerNFSMountPath; this.containerNFSMountPath = containerNFSMountPath; + this.paiStoragePlugin = paiStoragePlugin; } } diff --git a/src/nni_manager/training_service/paiLite/paiLiteData.ts b/src/nni_manager/training_service/paiLite/paiLiteData.ts index 1687f3841e..300ca0817e 100644 --- a/src/nni_manager/training_service/paiLite/paiLiteData.ts +++ b/src/nni_manager/training_service/paiLite/paiLiteData.ts @@ -19,36 +19,6 @@ 'use strict'; -import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService'; - -/** - * PAI trial job detail - */ -export class PAITrialJobDetail implements TrialJobDetail { - public id: string; - public status: TrialJobStatus; - public paiJobName: string; - public submitTime: number; - public startTime?: number; - public endTime?: number; - public tags?: string[]; - public url?: string; - public workingDirectory: string; - public form: TrialJobApplicationForm; - public isEarlyStopped?: boolean; - - constructor(id: string, status: TrialJobStatus, paiJobName : string, - submitTime: number, workingDirectory: string, form: TrialJobApplicationForm) { - this.id = id; - this.status = status; - this.paiJobName = paiJobName; - this.submitTime = submitTime; - this.workingDirectory = workingDirectory; - this.form = form; - this.tags = []; - } -} - export const PAI_INSTALL_NNI_SHELL_FORMAT: string = `#!/bin/bash if python3 -c 'import nni' > /dev/null 2>&1; then @@ -59,13 +29,9 @@ else python3 -m pip install --user nni fi`; -export const PAI_TRIAL_COMMAND_FORMAT: string = +export const PAI_LITE_TRIAL_COMMAND_FORMAT: string = `export NNI_PLATFORM=paiLite NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \ && ls $NNI_SYS_DIR \ && cd $NNI_SYS_DIR && sh install_nni.sh \ && python3 -m nni_trial_tool.trial_keeper --trial_command '{6}' --nnimanager_ip '{7}' --nnimanager_port '{8}' \ --nni_manager_version '{9}' --log_collection '{10}'`; - -// tslint:disable:no-http-string -export const PAI_LOG_PATH_FORMAT: string = -`http://{0}/webhdfs/explorer.html#{1}`; diff --git a/src/nni_manager/training_service/paiLite/paiLiteJobRestServer.ts b/src/nni_manager/training_service/paiLite/paiLiteJobRestServer.ts new file mode 100644 index 0000000000..b363a4d40e --- /dev/null +++ b/src/nni_manager/training_service/paiLite/paiLiteJobRestServer.ts @@ -0,0 +1,76 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +'use strict'; + +import { Request, Response, Router } from 'express'; +import { Inject } from 'typescript-ioc'; +import * as component from '../../common/component'; +import { ClusterJobRestServer } from '../common/clusterJobRestServer'; +import { PAILiteTrainingService } from './paiLiteTrainingService'; + +export interface ParameterFileMeta { + readonly experimentId: string; + readonly trialId: string; + readonly filePath: string; +} + +/** + * PAI Training service Rest server, provides rest API to support pai job metrics update + * + */ +@component.Singleton +export class PAILiteJobRestServer extends ClusterJobRestServer { + private parameterFileMetaList: ParameterFileMeta[] = []; + + @Inject + private readonly paiLiteTrainingService: PAILiteTrainingService; + + /** + * constructor to provide NNIRestServer's own rest property, e.g. port + */ + constructor() { + super(); + this.paiLiteTrainingService = component.get(PAILiteTrainingService); + } + + protected handleTrialMetrics(jobId: string, metrics: any[]): void { + // Split metrics array into single metric, then emit + // Warning: If not split metrics into single ones, the behavior will be UNKNOWN + for (const singleMetric of metrics) { + this.paiLiteTrainingService.MetricsEmitter.emit('metric', { + id : jobId, + data : singleMetric + }); + } + } + + protected createRestHandler(): Router { + const router: Router = super.createRestHandler(); + + router.post(`/parameter-file-meta`, (req: Request, res: Response) => { + try { + this.log.info(`POST /parameter-file-meta, body is ${JSON.stringify(req.body)}`); + this.parameterFileMetaList.push(req.body); + res.send(); + } catch (err) { + this.log.error(`POST parameter-file-meta error: ${err}`); + res.status(500); + res.send(err.message); + } + }); + + router.get(`/parameter-file-meta`, (req: Request, res: Response) => { + try { + this.log.info(`GET /parameter-file-meta`); + res.send(this.parameterFileMetaList); + } catch (err) { + this.log.error(`GET parameter-file-meta error: ${err}`); + res.status(500); + res.send(err.message); + } + }); + + return router; + } +} diff --git a/src/nni_manager/training_service/paiLite/paiLiteTrainingService.ts b/src/nni_manager/training_service/paiLite/paiLiteTrainingService.ts index 6034e7b935..4db823e51d 100644 --- a/src/nni_manager/training_service/paiLite/paiLiteTrainingService.ts +++ b/src/nni_manager/training_service/paiLite/paiLiteTrainingService.ts @@ -41,11 +41,15 @@ import { delay, generateParamFileName, import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { execMkdir, validateCodeDir, execCopydir } from '../common/util'; -import { PAILiteClusterConfig } from './paiLiteConfig'; -import { PAI_LOG_PATH_FORMAT, PAI_TRIAL_COMMAND_FORMAT, PAITrialJobDetail } from './paiLiteData'; +import { PAI_LITE_TRIAL_COMMAND_FORMAT } from './paiLiteData'; import { PAITrainingService } from '../pai/paiTrainingService'; +import { NNIPAILiteTrialConfig } from './paiLiteConfig'; +import { PAITrialJobDetail } from '../pai/paiData'; +import { PAILiteJobRestServer } from './paiLiteJobRestServer'; import * as WebHDFS from 'webhdfs'; +import { PAIClusterConfig } from 'training_service/pai/paiConfig'; +const yaml = require('js-yaml'); /** * Training Service implementation for OpenPAI (Open Platform for AI) @@ -53,49 +57,93 @@ import * as WebHDFS from 'webhdfs'; */ @component.Singleton class PAILiteTrainingService extends PAITrainingService { - private containerRootDir: string = ""; - protected paiLiteClusterConfig?: PAILiteClusterConfig; + protected paiTrialConfig: NNIPAILiteTrialConfig | undefined; constructor() { super(); } - public async submitTrialJob(form: TrialJobApplicationForm): Promise { - console.log('---------------82-----------') - const deferred : Deferred = new Deferred(); - return deferred.promise; + public async run(): Promise { + this.log.info('Run PAILite training service.'); + const restServer: PAILiteJobRestServer = component.get(PAILiteJobRestServer); + await restServer.start(); + restServer.setEnableVersionCheck = this.versionCheck; + this.log.info(`PAILite Training service rest server listening on: ${restServer.endPoint}`); + await Promise.all([ + this.statusCheckingLoop(), + this.submitJobLoop()]); + this.log.info('PAILite training service exit.'); } - // tslint:disable:no-http-string - public cancelTrialJob(trialJobId: string, isEarlyStopped: boolean = false): Promise { - return super.cancelTrialJob(trialJobId, isEarlyStopped); + public async cleanUp(): Promise { + this.log.info('Stopping PAILite training service...'); + this.stopping = true; + + const deferred: Deferred = new Deferred(); + const restServer: PAILiteJobRestServer = component.get(PAILiteJobRestServer); + try { + await restServer.stop(); + deferred.resolve(); + this.log.info('PAILite Training service rest server stopped successfully.'); + } catch (error) { + this.log.error(`PAILite Training service rest server stopped failed, error: ${error.message}`); + deferred.reject(error); + } + + return deferred.promise; } - // tslint:disable: no-unsafe-any no-any - // tslint:disable-next-line:max-func-body-length public async setClusterMetadata(key: string, value: string): Promise { - const deferred : Deferred = new Deferred(); + const deferred: Deferred = new Deferred(); + switch (key) { case TrialConfigMetadataKey.NNI_MANAGER_IP: - return super.setClusterMetadata(key, value); + this.nniManagerIpConfig = JSON.parse(value); + deferred.resolve(); + break; + + case TrialConfigMetadataKey.PAI_LITE_CLUSTER_CONFIG: + this.paiClusterConfig = JSON.parse(value); + + if(this.paiClusterConfig.passWord) { + // Get PAI authentication token + await this.updatePaiToken(); + } else if(this.paiClusterConfig.token) { + this.paiToken = this.paiClusterConfig.token; + } else { + deferred.reject(new Error('pai cluster config format error, please set password or token!')); + } - case TrialConfigMetadataKey.PAI_CLUSTER_CONFIG: deferred.resolve(); break; case TrialConfigMetadataKey.TRIAL_CONFIG: + if (this.paiClusterConfig === undefined) { + this.log.error('pai cluster config is not initialized'); + deferred.reject(new Error('pai cluster config is not initialized')); + break; + } + this.paiTrialConfig = JSON.parse(value); + // Validate to make sure codeDir doesn't have too many files + try { + await validateCodeDir(this.paiTrialConfig.codeDir); + } catch (error) { + this.log.error(error); + deferred.reject(new Error(error)); + break; + } deferred.resolve(); break; case TrialConfigMetadataKey.VERSION_CHECK: - return super.setClusterMetadata(key, value); - + this.versionCheck = (value === 'true' || value === 'True'); + break; case TrialConfigMetadataKey.LOG_COLLECTION: - return super.setClusterMetadata(key, value); - + this.logCollection = value; + break; case TrialConfigMetadataKey.MULTI_PHASE: - return super.setClusterMetadata(key, value); - + this.isMultiPhase = (value === 'true' || value === 'True'); + break; default: //Reject for unknown keys deferred.reject(new Error(`Uknown key: ${key}`)); @@ -104,9 +152,176 @@ class PAILiteTrainingService extends PAITrainingService { return deferred.promise; } - // tslint:disable-next-line:max-func-body-length - public async submitTrialJobToPAI(trialJobId: string): Promise { - const deferred : Deferred = new Deferred(); + public async submitTrialJob(form: TrialJobApplicationForm): Promise { + if (this.paiClusterConfig === undefined) { + throw new Error(`paiClusterConfig not initialized!`); + } + const deferred: Deferred = new Deferred(); + + this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`); + + const trialJobId: string = uniqueString(5); + //TODO: use HDFS working folder instead + const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId); + const paiJobName: string = `nni_exp_${this.experimentId}_trial_${trialJobId}`; + const logPath: string = ''; + const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail( + trialJobId, + 'WAITING', + paiJobName, + Date.now(), + trialWorkingFolder, + form, + logPath); + + this.trialJobsMap.set(trialJobId, trialJobDetail); + this.jobQueue.push(trialJobId); + deferred.resolve(trialJobDetail); + + return deferred.promise; + } + + public generateJobConfigInYamlFormat(trialJobId: string, command: string) { + if (this.paiTrialConfig === undefined) { + throw new Error('trial config is not initialized'); + } + const jobName = `nni_exp_${this.experimentId}_trial_${trialJobId}` + const paiJobConfig = { + protocolVersion: 2, + name: jobName, + type: 'job', + jobRetryCount: 0, + prerequisites: [ + { + type: 'dockerimage', + uri: this.paiTrialConfig.image, + name: 'docker_image_0' + } + ], + taskRoles: { + taskrole: { + instances: 1, + completion: { + minFailedInstances: 1, + minSucceededInstances: -1 + }, + taskRetryCount: 0, + dockerImage: 'docker_image_0', + resourcePerInstance: { + gpu: this.paiTrialConfig.gpuNum, + cpu: this.paiTrialConfig.cpuNum, + memoryMB: this.paiTrialConfig.memoryMB + }, + commands: [ + command + ] + } + }, + defaults: { + virtualCluster: this.paiTrialConfig.virtualCluster + }, + extras: { + 'com.microsoft.pai.runtimeplugin': [ + { + plugin: this.paiTrialConfig.paiStoragePlugin + } + ], + submitFrom: 'submit-job-v2' + } + } + return yaml.safeDump(paiJobConfig); + } + + protected async submitTrialJobToPAI(trialJobId: string): Promise { + const deferred: Deferred = new Deferred(); + const trialJobDetail: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); + + if (trialJobDetail === undefined) { + throw new Error(`Failed to find PAITrialJobDetail for job ${trialJobId}`); + } + + if (this.paiClusterConfig === undefined) { + throw new Error('PAI Cluster config is not initialized'); + } + if (this.paiTrialConfig === undefined) { + throw new Error('trial config is not initialized'); + } + if (this.paiToken === undefined) { + throw new Error('PAI token is not initialized'); + } + + if (this.paiRestServerPort === undefined) { + const restServer: PAILiteJobRestServer = component.get(PAILiteJobRestServer); + this.paiRestServerPort = restServer.clusterRestServerPort; + } + + // Step 1. Prepare PAI job configuration + const trialLocalFolder: string = path.join(this.paiTrialConfig.nniManagerNFSMountPath, this.experimentId, trialJobId); + //create trial local working folder locally. + await execMkdir(trialLocalFolder); + + const runScriptContent: string = CONTAINER_INSTALL_NNI_SHELL_FORMAT; + // Write NNI installation file to local files + await fs.promises.writeFile(path.join(trialLocalFolder, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' }); + + // Write file content ( parameter.cfg ) to local working folders + if (trialJobDetail.form !== undefined) { + await fs.promises.writeFile( + path.join(trialLocalFolder, generateParamFileName(trialJobDetail.form.hyperParameters)), + trialJobDetail.form.hyperParameters.value, { encoding: 'utf8' } + ); + } + + //Copy codeDir files to local working folder + await execCopydir(this.paiTrialConfig.codeDir, trialLocalFolder); + + const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address(); + const version: string = this.versionCheck ? await getVersion() : ''; + const containerWorkingDir: string = `${this.paiTrialConfig.containerNFSMountPath}/${this.experimentId}/${trialJobId}`; + const nniPaiTrialCommand: string = String.Format( + PAI_LITE_TRIAL_COMMAND_FORMAT, + `${containerWorkingDir}`, + `${containerWorkingDir}/nnioutput`, + trialJobId, + this.experimentId, + trialJobDetail.form.sequenceId, + this.isMultiPhase, + this.paiTrialConfig.command, + nniManagerIp, + this.paiRestServerPort, + version, + this.logCollection + ) + .replace(/\r\n|\n|\r/gm, ''); + + this.log.info(`nniPAItrial command is ${nniPaiTrialCommand.trim()}`); + + const paiJobConfig = this.generateJobConfigInYamlFormat(trialJobId, nniPaiTrialCommand); + console.log(paiJobConfig); + + // Step 3. Submit PAI job via Rest call + // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API + const submitJobRequest: request.Options = { + uri: `http://${this.paiClusterConfig.host}/rest-server/api/v2/jobs`, + method: 'POST', + body: paiJobConfig, + headers: { + 'Content-Type': 'text/yaml', + Authorization: `Bearer ${this.paiToken}` + } + }; + request(submitJobRequest, (error: Error, response: request.Response, body: any) => { + if ((error !== undefined && error !== null) || response.statusCode >= 400) { + const errorMessage: string = (error !== undefined && error !== null) ? error.message : + `Submit trial ${trialJobId} failed, http code:${response.statusCode}, http body: ${response.body.message}`; + this.log.error(errorMessage); + trialJobDetail.status = 'FAILED'; + } else { + trialJobDetail.submitTime = Date.now(); + } + deferred.resolve(true); + }); + return deferred.promise; } } diff --git a/src/sdk/pynni/nni/platform/__init__.py b/src/sdk/pynni/nni/platform/__init__.py index a14d1b67d4..2a2ac7128e 100644 --- a/src/sdk/pynni/nni/platform/__init__.py +++ b/src/sdk/pynni/nni/platform/__init__.py @@ -7,7 +7,7 @@ from .standalone import * elif trial_env_vars.NNI_PLATFORM == 'unittest': from .test import * -elif trial_env_vars.NNI_PLATFORM in ('local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller'): +elif trial_env_vars.NNI_PLATFORM in ('local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiLite'): from .local import * else: raise RuntimeError('Unknown platform %s' % trial_env_vars.NNI_PLATFORM) diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index f4eff08f6d..c61be32ea0 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -259,19 +259,31 @@ def setPathCheck(key): }) } + +pai_lite_trial_schema = { + 'trial':{ + 'command': setType('command', str), + 'codeDir': setPathCheck('codeDir'), + 'gpuNum': setNumberRange('gpuNum', int, 0, 99999), + 'cpuNum': setNumberRange('cpuNum', int, 0, 99999), + 'memoryMB': setType('memoryMB', int), + 'image': setType('image', str), + 'virtualCluster': setType('virtualCluster', str), + 'nniManagerNFSMountPath': setPathCheck('nniManagerNFSMountPath'), + 'containerNFSMountPath': setType('containerNFSMountPath', str), + 'paiStoragePlugin': setType('paiStoragePlugin', str) + } +} + pai_lite_config_schema = { 'paiLiteConfig': Or({ 'userName': setType('userName', str), 'passWord': setType('passWord', str), - 'host': setType('host', str), - 'nniManagerNFSMountPath': setPathCheck('nniManagerNFSMountPath'), - 'containerNFSMountPath': setType('containerNFSMountPath', str) + 'host': setType('host', str) }, { 'userName': setType('userName', str), 'token': setType('token', str), - 'host': setType('host', str), - 'nniManagerNFSMountPath': setPathCheck('nniManagerNFSMountPath'), - 'containerNFSMountPath': setType('containerNFSMountPath', str) + 'host': setType('host', str) }) } @@ -412,7 +424,7 @@ def setPathCheck(key): PAI_CONFIG_SCHEMA = Schema({**common_schema, **pai_trial_schema, **pai_config_schema}) -PAI_LITE_CONFIG_SCHEMA = Schema({**common_schema, **pai_trial_schema, **pai_lite_config_schema}) +PAI_LITE_CONFIG_SCHEMA = Schema({**common_schema, **pai_lite_trial_schema, **pai_lite_config_schema}) KUBEFLOW_CONFIG_SCHEMA = Schema({**common_schema, **kubeflow_trial_schema, **kubeflow_config_schema}) diff --git a/tools/nni_cmd/launcher.py b/tools/nni_cmd/launcher.py index d8f86f3fec..dff4e5840f 100644 --- a/tools/nni_cmd/launcher.py +++ b/tools/nni_cmd/launcher.py @@ -122,7 +122,7 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None from subprocess import CREATE_NEW_PROCESS_GROUP process = Popen(cmds, cwd=entry_dir, stdout=stdout_file, stderr=stderr_file, creationflags=CREATE_NEW_PROCESS_GROUP) else: - process = Popen(cmds, cwd=entry_dir) + process = Popen(cmds, cwd=entry_dir, stdout=stdout_file, stderr=stderr_file) return process, str(time_now) def set_trial_config(experiment_config, port, config_file_name): From f0b4c4c80774fdac5817bd2d8ab5a3c6429f51e2 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Thu, 12 Dec 2019 23:53:15 +0800 Subject: [PATCH 05/18] fix typo --- src/nni_manager/training_service/pai/paiTrainingService.ts | 2 +- .../training_service/paiLite/paiLiteJobRestServer.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index d5669c20ad..3bdaa6983f 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -287,7 +287,7 @@ class PAITrainingService implements TrainingService { break; default: //Reject for unknown keys - deferred.reject(new Error(`Uknown key: ${key}`)); + throw new Error(`Uknown key: ${key}`); } return deferred.promise; diff --git a/src/nni_manager/training_service/paiLite/paiLiteJobRestServer.ts b/src/nni_manager/training_service/paiLite/paiLiteJobRestServer.ts index b363a4d40e..3457cc43d0 100644 --- a/src/nni_manager/training_service/paiLite/paiLiteJobRestServer.ts +++ b/src/nni_manager/training_service/paiLite/paiLiteJobRestServer.ts @@ -16,7 +16,7 @@ export interface ParameterFileMeta { } /** - * PAI Training service Rest server, provides rest API to support pai job metrics update + * PAILite Training service Rest server, provides rest API to support pai job metrics update * */ @component.Singleton From e7a9495869d780b4a41252ff280f30cebccb01f1 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Fri, 13 Dec 2019 17:55:52 +0800 Subject: [PATCH 06/18] fix eslint --- src/nni_manager/rest_server/restValidationSchemas.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts index d645276b0b..fce5c1fb89 100644 --- a/src/nni_manager/rest_server/restValidationSchemas.ts +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -98,7 +98,7 @@ export namespace ValidationSchemas { token: joi.string().min(1), host: joi.string().min(1).required() }), - pai_lite_config: joi.object({ + pai_lite_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase userName: joi.string().min(1).required(), passWord: joi.string().min(1), token: joi.string().min(1), From 8b69167c18beea79b747b5fe71a413702c26503f Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Sun, 15 Dec 2019 22:23:53 +0800 Subject: [PATCH 07/18] refactor code --- src/nni_manager/main.ts | 12 +- .../rest_server/restValidationSchemas.ts | 4 +- .../common/trialConfigMetadataKey.ts | 2 +- .../paiLite/paiLiteJobRestServer.ts | 76 ----- .../pai/paiConfig.ts} | 6 +- .../pai/paiData.ts} | 2 +- .../pai_base/pai/paiJobRestServer.ts | 33 ++ .../pai/paiTrainingService.ts} | 80 +++-- .../paiData.ts => pai_base/paiBaseConfig.ts} | 44 +-- .../paiBaseJobInfoCollector.ts} | 71 +++-- .../paiBaseJobRestServer.ts} | 14 +- .../pai_base/paiBaseTrainingService.ts | 297 ++++++++++++++++++ .../paiYarn}/hdfsClientUtility.ts | 6 +- .../paiYarn/paiYarnConfig.ts} | 26 +- .../pai_base/paiYarn/paiYarnData.ts | 26 ++ .../pai_base/paiYarn/paiYarnJobRestServer.ts | 33 ++ .../paiYarn/paiYarnTrainingService.ts} | 248 ++++----------- .../paiYarn/paiYarnTrialConfig.ts} | 4 +- .../test/hdfsClientUtility.test.ts | 2 +- ...test.ts => paiYarnTrainingService.test.ts} | 18 +- src/sdk/pynni/nni/platform/__init__.py | 2 +- tools/nni_cmd/config_schema.py | 16 +- tools/nni_cmd/launcher.py | 18 +- tools/nni_cmd/launcher_utils.py | 8 +- 24 files changed, 609 insertions(+), 439 deletions(-) delete mode 100644 src/nni_manager/training_service/paiLite/paiLiteJobRestServer.ts rename src/nni_manager/training_service/{paiLite/paiLiteConfig.ts => pai_base/pai/paiConfig.ts} (93%) rename src/nni_manager/training_service/{paiLite/paiLiteData.ts => pai_base/pai/paiData.ts} (92%) create mode 100644 src/nni_manager/training_service/pai_base/pai/paiJobRestServer.ts rename src/nni_manager/training_service/{paiLite/paiLiteTrainingService.ts => pai_base/pai/paiTrainingService.ts} (79%) rename src/nni_manager/training_service/{pai/paiData.ts => pai_base/paiBaseConfig.ts} (54%) rename src/nni_manager/training_service/{pai/paiJobInfoCollector.ts => pai_base/paiBaseJobInfoCollector.ts} (57%) rename src/nni_manager/training_service/{pai/paiJobRestServer.ts => pai_base/paiBaseJobRestServer.ts} (81%) create mode 100644 src/nni_manager/training_service/pai_base/paiBaseTrainingService.ts rename src/nni_manager/training_service/{pai => pai_base/paiYarn}/hdfsClientUtility.ts (97%) rename src/nni_manager/training_service/{pai/paiConfig.ts => pai_base/paiYarn/paiYarnConfig.ts} (86%) create mode 100644 src/nni_manager/training_service/pai_base/paiYarn/paiYarnData.ts create mode 100644 src/nni_manager/training_service/pai_base/paiYarn/paiYarnJobRestServer.ts rename src/nni_manager/training_service/{pai/paiTrainingService.ts => pai_base/paiYarn/paiYarnTrainingService.ts} (68%) rename src/nni_manager/training_service/{pai/paiTrialConfig.ts => pai_base/paiYarn/paiYarnTrialConfig.ts} (86%) rename src/nni_manager/training_service/test/{paiTrainingService.test.ts => paiYarnTrainingService.test.ts} (78%) diff --git a/src/nni_manager/main.ts b/src/nni_manager/main.ts index cea56e0189..ff34476580 100644 --- a/src/nni_manager/main.ts +++ b/src/nni_manager/main.ts @@ -20,8 +20,8 @@ import { NNIRestServer } from './rest_server/nniRestServer'; import { FrameworkControllerTrainingService } from './training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService'; import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService'; import { LocalTrainingService } from './training_service/local/localTrainingService'; -import { PAITrainingService } from './training_service/pai/paiTrainingService'; -import { PAILiteTrainingService } from './training_service/paiLite/paiLiteTrainingService'; +import { PAITrainingService } from './training_service/pai_base/pai/paiTrainingService'; +import { PAIYarnTrainingService } from './training_service/pai_base/paiYarn/paiYarnTrainingService'; import { RemoteMachineTrainingService } from './training_service/remote_machine/remoteMachineTrainingService'; @@ -47,9 +47,9 @@ async function initContainer(platformMode: string, logFileName?: string): Promis Container.bind(TrainingService) .to(PAITrainingService) .scope(Scope.Singleton); - } else if (platformMode === 'paiLite') { + } else if (platformMode === 'paiYarn') { Container.bind(TrainingService) - .to(PAILiteTrainingService) + .to(PAIYarnTrainingService) .scope(Scope.Singleton); } else if (platformMode === 'kubeflow') { Container.bind(TrainingService) @@ -81,7 +81,7 @@ async function initContainer(platformMode: string, logFileName?: string): Promis function usage(): void { console.info('usage: node main.js --port --mode \ - --start_mode --experiment_id '); + --start_mode --experiment_id '); } const strPort: string = parseArg(['--port', '-p']); @@ -93,7 +93,7 @@ if (!strPort || strPort.length === 0) { const port: number = parseInt(strPort, 10); const mode: string = parseArg(['--mode', '-m']); -if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiLite'].includes(mode)) { +if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn'].includes(mode)) { console.log(`FATAL: unknown mode: ${mode}`); usage(); process.exit(1); diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts index fce5c1fb89..a9ad8cfd9a 100644 --- a/src/nni_manager/rest_server/restValidationSchemas.ts +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -92,13 +92,13 @@ export namespace ValidationSchemas { }) }) }), - pai_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase + pai_yarn_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase userName: joi.string().min(1).required(), passWord: joi.string().min(1), token: joi.string().min(1), host: joi.string().min(1).required() }), - pai_lite_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase + pai_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase userName: joi.string().min(1).required(), passWord: joi.string().min(1), token: joi.string().min(1), diff --git a/src/nni_manager/training_service/common/trialConfigMetadataKey.ts b/src/nni_manager/training_service/common/trialConfigMetadataKey.ts index 829bec85e3..b9acfbd454 100644 --- a/src/nni_manager/training_service/common/trialConfigMetadataKey.ts +++ b/src/nni_manager/training_service/common/trialConfigMetadataKey.ts @@ -13,8 +13,8 @@ export enum TrialConfigMetadataKey { EXPERIMENT_ID = 'experimentId', MULTI_PHASE = 'multiPhase', RANDOM_SCHEDULER = 'random_scheduler', + PAI_YARN_CLUSTER_CONFIG = 'pai_yarn_config', PAI_CLUSTER_CONFIG = 'pai_config', - PAI_LITE_CLUSTER_CONFIG = 'pai_lite_config', KUBEFLOW_CLUSTER_CONFIG = 'kubeflow_config', NNI_MANAGER_IP = 'nni_manager_ip', FRAMEWORKCONTROLLER_CLUSTER_CONFIG = 'frameworkcontroller_config', diff --git a/src/nni_manager/training_service/paiLite/paiLiteJobRestServer.ts b/src/nni_manager/training_service/paiLite/paiLiteJobRestServer.ts deleted file mode 100644 index 3457cc43d0..0000000000 --- a/src/nni_manager/training_service/paiLite/paiLiteJobRestServer.ts +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -'use strict'; - -import { Request, Response, Router } from 'express'; -import { Inject } from 'typescript-ioc'; -import * as component from '../../common/component'; -import { ClusterJobRestServer } from '../common/clusterJobRestServer'; -import { PAILiteTrainingService } from './paiLiteTrainingService'; - -export interface ParameterFileMeta { - readonly experimentId: string; - readonly trialId: string; - readonly filePath: string; -} - -/** - * PAILite Training service Rest server, provides rest API to support pai job metrics update - * - */ -@component.Singleton -export class PAILiteJobRestServer extends ClusterJobRestServer { - private parameterFileMetaList: ParameterFileMeta[] = []; - - @Inject - private readonly paiLiteTrainingService: PAILiteTrainingService; - - /** - * constructor to provide NNIRestServer's own rest property, e.g. port - */ - constructor() { - super(); - this.paiLiteTrainingService = component.get(PAILiteTrainingService); - } - - protected handleTrialMetrics(jobId: string, metrics: any[]): void { - // Split metrics array into single metric, then emit - // Warning: If not split metrics into single ones, the behavior will be UNKNOWN - for (const singleMetric of metrics) { - this.paiLiteTrainingService.MetricsEmitter.emit('metric', { - id : jobId, - data : singleMetric - }); - } - } - - protected createRestHandler(): Router { - const router: Router = super.createRestHandler(); - - router.post(`/parameter-file-meta`, (req: Request, res: Response) => { - try { - this.log.info(`POST /parameter-file-meta, body is ${JSON.stringify(req.body)}`); - this.parameterFileMetaList.push(req.body); - res.send(); - } catch (err) { - this.log.error(`POST parameter-file-meta error: ${err}`); - res.status(500); - res.send(err.message); - } - }); - - router.get(`/parameter-file-meta`, (req: Request, res: Response) => { - try { - this.log.info(`GET /parameter-file-meta`); - res.send(this.parameterFileMetaList); - } catch (err) { - this.log.error(`GET parameter-file-meta error: ${err}`); - res.status(500); - res.send(err.message); - } - }); - - return router; - } -} diff --git a/src/nni_manager/training_service/paiLite/paiLiteConfig.ts b/src/nni_manager/training_service/pai_base/pai/paiConfig.ts similarity index 93% rename from src/nni_manager/training_service/paiLite/paiLiteConfig.ts rename to src/nni_manager/training_service/pai_base/pai/paiConfig.ts index 5861b0adb6..de043e301f 100644 --- a/src/nni_manager/training_service/paiLite/paiLiteConfig.ts +++ b/src/nni_manager/training_service/pai_base/pai/paiConfig.ts @@ -18,12 +18,12 @@ */ 'use strict'; -import {TrialConfig} from '../common/trialConfig'; +import {TrialConfig} from '../../common/trialConfig'; /** - * PAILite trial configuration + * PAI trial configuration */ -export class NNIPAILiteTrialConfig extends TrialConfig { +export class NNIPAITrialConfig extends TrialConfig { public readonly cpuNum: number; public readonly memoryMB: number; public readonly image: string; diff --git a/src/nni_manager/training_service/paiLite/paiLiteData.ts b/src/nni_manager/training_service/pai_base/pai/paiData.ts similarity index 92% rename from src/nni_manager/training_service/paiLite/paiLiteData.ts rename to src/nni_manager/training_service/pai_base/pai/paiData.ts index 300ca0817e..7cbc4c6c73 100644 --- a/src/nni_manager/training_service/paiLite/paiLiteData.ts +++ b/src/nni_manager/training_service/pai_base/pai/paiData.ts @@ -30,7 +30,7 @@ else fi`; export const PAI_LITE_TRIAL_COMMAND_FORMAT: string = -`export NNI_PLATFORM=paiLite NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \ +`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \ && ls $NNI_SYS_DIR \ && cd $NNI_SYS_DIR && sh install_nni.sh \ && python3 -m nni_trial_tool.trial_keeper --trial_command '{6}' --nnimanager_ip '{7}' --nnimanager_port '{8}' \ diff --git a/src/nni_manager/training_service/pai_base/pai/paiJobRestServer.ts b/src/nni_manager/training_service/pai_base/pai/paiJobRestServer.ts new file mode 100644 index 0000000000..bc5653e540 --- /dev/null +++ b/src/nni_manager/training_service/pai_base/pai/paiJobRestServer.ts @@ -0,0 +1,33 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +'use strict'; + +import { Request, Response, Router } from 'express'; +import { Inject } from 'typescript-ioc'; +import * as component from '../../../common/component'; +import { ClusterJobRestServer } from '../../common/clusterJobRestServer'; +import { PAITrainingService } from './paiTrainingService'; +import { PAIBaseJobRestServer } from '../paiBaseJobRestServer'; + +export interface ParameterFileMeta { + readonly experimentId: string; + readonly trialId: string; + readonly filePath: string; +} + +/** + * PAI Training service Rest server, provides rest API to support pai job metrics update + * + */ +@component.Singleton +export class PAIJobRestServer extends PAIBaseJobRestServer { + protected parameterFileMetaList: ParameterFileMeta[] = []; + + /** + * constructor to provide NNIRestServer's own rest property, e.g. port + */ + constructor() { + super(component.get(PAITrainingService)); + } +} diff --git a/src/nni_manager/training_service/paiLite/paiLiteTrainingService.ts b/src/nni_manager/training_service/pai_base/pai/paiTrainingService.ts similarity index 79% rename from src/nni_manager/training_service/paiLite/paiLiteTrainingService.ts rename to src/nni_manager/training_service/pai_base/pai/paiTrainingService.ts index 4db823e51d..ec0549ab57 100644 --- a/src/nni_manager/training_service/paiLite/paiLiteTrainingService.ts +++ b/src/nni_manager/training_service/pai_base/pai/paiTrainingService.ts @@ -24,31 +24,25 @@ import * as fs from 'fs'; import * as path from 'path'; // tslint:disable-next-line:no-implicit-dependencies import * as request from 'request'; -import * as component from '../../common/component'; +import * as component from '../../../common/component'; -import { EventEmitter } from 'events'; import { Deferred } from 'ts-deferred'; import { String } from 'typescript-string-operations'; -import { MethodNotImplementedError } from '../../common/errors'; -import { getExperimentId } from '../../common/experimentStartupInfo'; -import { getLogger, Logger } from '../../common/log'; import { HyperParameters, NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric -} from '../../common/trainingService'; +} from '../../../common/trainingService'; import { delay, generateParamFileName, - getExperimentRootDir, getIPV4Address, getVersion, uniqueString, unixPathJoin } from '../../common/utils'; -import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; -import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; -import { execMkdir, validateCodeDir, execCopydir } from '../common/util'; -import { PAI_LITE_TRIAL_COMMAND_FORMAT } from './paiLiteData'; -import { PAITrainingService } from '../pai/paiTrainingService'; -import { NNIPAILiteTrialConfig } from './paiLiteConfig'; -import { PAITrialJobDetail } from '../pai/paiData'; -import { PAILiteJobRestServer } from './paiLiteJobRestServer'; - -import * as WebHDFS from 'webhdfs'; -import { PAIClusterConfig } from 'training_service/pai/paiConfig'; + getExperimentRootDir, getIPV4Address, getVersion, uniqueString, unixPathJoin } from '../../../common/utils'; +import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData'; +import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; +import { execMkdir, validateCodeDir, execCopydir } from '../../common/util'; +import { PAI_LITE_TRIAL_COMMAND_FORMAT } from './paiData'; +import { NNIPAITrialConfig } from './paiConfig'; +import { PAIJobRestServer } from './paiJobRestServer'; +import { PAIBaseTrainingService } from '../paiBaseTrainingService'; +import { PAIBaseClusterConfig, PAIBaseTrialJobDetail } from '../../pai_base/paiBaseConfig'; + const yaml = require('js-yaml'); /** @@ -56,37 +50,37 @@ const yaml = require('js-yaml'); * Refer https://github.com/Microsoft/pai for more info about OpenPAI */ @component.Singleton -class PAILiteTrainingService extends PAITrainingService { - protected paiTrialConfig: NNIPAILiteTrialConfig | undefined; +class PAITrainingService extends PAIBaseTrainingService { + protected paiTrialConfig: NNIPAITrialConfig | undefined; constructor() { super(); } public async run(): Promise { - this.log.info('Run PAILite training service.'); - const restServer: PAILiteJobRestServer = component.get(PAILiteJobRestServer); + this.log.info('Run PAI training service.'); + const restServer: PAIJobRestServer = component.get(PAIJobRestServer); await restServer.start(); restServer.setEnableVersionCheck = this.versionCheck; - this.log.info(`PAILite Training service rest server listening on: ${restServer.endPoint}`); + this.log.info(`PAI Training service rest server listening on: ${restServer.endPoint}`); await Promise.all([ this.statusCheckingLoop(), this.submitJobLoop()]); - this.log.info('PAILite training service exit.'); + this.log.info('PAI training service exit.'); } public async cleanUp(): Promise { - this.log.info('Stopping PAILite training service...'); + this.log.info('Stopping PAI training service...'); this.stopping = true; const deferred: Deferred = new Deferred(); - const restServer: PAILiteJobRestServer = component.get(PAILiteJobRestServer); + const restServer: PAIJobRestServer = component.get(PAIJobRestServer); try { await restServer.stop(); deferred.resolve(); - this.log.info('PAILite Training service rest server stopped successfully.'); + this.log.info('PAI Training service rest server stopped successfully.'); } catch (error) { - this.log.error(`PAILite Training service rest server stopped failed, error: ${error.message}`); + this.log.error(`PAI Training service rest server stopped failed, error: ${error.message}`); deferred.reject(error); } @@ -102,14 +96,14 @@ class PAILiteTrainingService extends PAITrainingService { deferred.resolve(); break; - case TrialConfigMetadataKey.PAI_LITE_CLUSTER_CONFIG: - this.paiClusterConfig = JSON.parse(value); + case TrialConfigMetadataKey.PAI_CLUSTER_CONFIG: + this.paiBaseClusterConfig = JSON.parse(value); - if(this.paiClusterConfig.passWord) { + if(this.paiBaseClusterConfig.passWord) { // Get PAI authentication token await this.updatePaiToken(); - } else if(this.paiClusterConfig.token) { - this.paiToken = this.paiClusterConfig.token; + } else if(this.paiBaseClusterConfig.token) { + this.paiToken = this.paiBaseClusterConfig.token; } else { deferred.reject(new Error('pai cluster config format error, please set password or token!')); } @@ -118,12 +112,12 @@ class PAILiteTrainingService extends PAITrainingService { break; case TrialConfigMetadataKey.TRIAL_CONFIG: - if (this.paiClusterConfig === undefined) { + if (this.paiBaseClusterConfig === undefined) { this.log.error('pai cluster config is not initialized'); deferred.reject(new Error('pai cluster config is not initialized')); break; } - this.paiTrialConfig = JSON.parse(value); + this.paiTrialConfig = JSON.parse(value); // Validate to make sure codeDir doesn't have too many files try { @@ -153,10 +147,10 @@ class PAILiteTrainingService extends PAITrainingService { } public async submitTrialJob(form: TrialJobApplicationForm): Promise { - if (this.paiClusterConfig === undefined) { + if (this.paiBaseClusterConfig === undefined) { throw new Error(`paiClusterConfig not initialized!`); } - const deferred: Deferred = new Deferred(); + const deferred: Deferred = new Deferred(); this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`); @@ -165,7 +159,7 @@ class PAILiteTrainingService extends PAITrainingService { const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId); const paiJobName: string = `nni_exp_${this.experimentId}_trial_${trialJobId}`; const logPath: string = ''; - const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail( + const trialJobDetail: PAIBaseTrialJobDetail = new PAIBaseTrialJobDetail( trialJobId, 'WAITING', paiJobName, @@ -234,13 +228,13 @@ class PAILiteTrainingService extends PAITrainingService { protected async submitTrialJobToPAI(trialJobId: string): Promise { const deferred: Deferred = new Deferred(); - const trialJobDetail: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); + const trialJobDetail: PAIBaseTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); if (trialJobDetail === undefined) { throw new Error(`Failed to find PAITrialJobDetail for job ${trialJobId}`); } - if (this.paiClusterConfig === undefined) { + if (this.paiBaseClusterConfig === undefined) { throw new Error('PAI Cluster config is not initialized'); } if (this.paiTrialConfig === undefined) { @@ -251,7 +245,7 @@ class PAILiteTrainingService extends PAITrainingService { } if (this.paiRestServerPort === undefined) { - const restServer: PAILiteJobRestServer = component.get(PAILiteJobRestServer); + const restServer: PAIJobRestServer = component.get(PAIJobRestServer); this.paiRestServerPort = restServer.clusterRestServerPort; } @@ -302,7 +296,7 @@ class PAILiteTrainingService extends PAITrainingService { // Step 3. Submit PAI job via Rest call // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API const submitJobRequest: request.Options = { - uri: `http://${this.paiClusterConfig.host}/rest-server/api/v2/jobs`, + uri: `http://${this.paiBaseClusterConfig.host}/rest-server/api/v2/jobs`, method: 'POST', body: paiJobConfig, headers: { @@ -326,4 +320,4 @@ class PAILiteTrainingService extends PAITrainingService { } } -export { PAILiteTrainingService }; +export { PAITrainingService }; diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai_base/paiBaseConfig.ts similarity index 54% rename from src/nni_manager/training_service/pai/paiData.ts rename to src/nni_manager/training_service/pai_base/paiBaseConfig.ts index 6433c1ffa6..4af2f6150f 100644 --- a/src/nni_manager/training_service/pai/paiData.ts +++ b/src/nni_manager/training_service/pai_base/paiBaseConfig.ts @@ -3,12 +3,34 @@ 'use strict'; +import {TrialConfig} from '../common/trialConfig'; import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService'; +export class PAIBaseClusterConfig { + public readonly userName: string; + public readonly passWord?: string; + public readonly host: string; + public readonly token?: string; + + /** + * Constructor + * @param userName User name of PAI Cluster + * @param passWord password of PAI Cluster + * @param host Host IP of PAI Cluster + * @param token PAI token of PAI Cluster + */ + constructor(userName: string, host: string, passWord?: string, token?: string) { + this.userName = userName; + this.passWord = passWord; + this.host = host; + this.token = token; + } +} + /** * PAI trial job detail */ -export class PAITrialJobDetail implements TrialJobDetail { +export class PAIBaseTrialJobDetail implements TrialJobDetail { public id: string; public status: TrialJobStatus; public paiJobName: string; @@ -34,23 +56,3 @@ export class PAITrialJobDetail implements TrialJobDetail { this.logPath = logPath; } } - -export const PAI_INSTALL_NNI_SHELL_FORMAT: string = -`#!/bin/bash -if python3 -c 'import nni' > /dev/null 2>&1; then - # nni module is already installed, skip - return -else - # Install nni - python3 -m pip install --user nni -fi`; - -export const PAI_TRIAL_COMMAND_FORMAT: string = -`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \ -&& cd $NNI_SYS_DIR && sh install_nni.sh \ -&& python3 -m nni_trial_tool.trial_keeper --trial_command '{6}' --nnimanager_ip '{7}' --nnimanager_port '{8}' \ ---pai_hdfs_output_dir '{9}' --pai_hdfs_host '{10}' --pai_user_name {11} --nni_hdfs_exp_dir '{12}' --webhdfs_path '/webhdfs/api/v1' \ ---nni_manager_version '{13}' --log_collection '{14}'`; - -export const PAI_LOG_PATH_FORMAT: string = -`http://{0}/webhdfs/explorer.html#{1}`; diff --git a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts b/src/nni_manager/training_service/pai_base/paiBaseJobInfoCollector.ts similarity index 57% rename from src/nni_manager/training_service/pai/paiJobInfoCollector.ts rename to src/nni_manager/training_service/pai_base/paiBaseJobInfoCollector.ts index 6304db3639..5ff4f9a058 100644 --- a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts +++ b/src/nni_manager/training_service/pai_base/paiBaseJobInfoCollector.ts @@ -8,43 +8,42 @@ import { Deferred } from 'ts-deferred'; import { NNIError, NNIErrorNames } from '../../common/errors'; import { getLogger, Logger } from '../../common/log'; import { TrialJobStatus } from '../../common/trainingService'; -import { PAIClusterConfig } from './paiConfig'; -import { PAITrialJobDetail } from './paiData'; +import { PAIBaseClusterConfig, PAIBaseTrialJobDetail } from './paiBaseConfig'; /** * Collector PAI jobs info from PAI cluster, and update pai job status locally */ -export class PAIJobInfoCollector { - private readonly trialJobsMap: Map; +export class PAIBaseJobInfoCollector { + private readonly trialJobsMap: Map; private readonly log: Logger = getLogger(); private readonly statusesNeedToCheck: TrialJobStatus[]; private readonly finalStatuses: TrialJobStatus[]; - constructor(jobMap: Map) { + constructor(jobMap: Map) { this.trialJobsMap = jobMap; this.statusesNeedToCheck = ['RUNNING', 'UNKNOWN', 'WAITING']; this.finalStatuses = ['SUCCEEDED', 'FAILED', 'USER_CANCELED', 'SYS_CANCELED', 'EARLY_STOPPED']; } - public async retrieveTrialStatus(paiToken? : string, paiClusterConfig?: PAIClusterConfig): Promise { - if (paiClusterConfig === undefined || paiToken === undefined) { + public async retrieveTrialStatus(token? : string, paiBaseClusterConfig?: PAIBaseClusterConfig): Promise { + if (paiBaseClusterConfig === undefined || token === undefined) { return Promise.resolve(); } const updatePaiTrialJobs: Promise[] = []; - for (const [trialJobId, paiTrialJob] of this.trialJobsMap) { - if (paiTrialJob === undefined) { + for (const [trialJobId, paiBaseTrialJob] of this.trialJobsMap) { + if (paiBaseTrialJob === undefined) { throw new NNIError(NNIErrorNames.NOT_FOUND, `trial job id ${trialJobId} not found`); } - updatePaiTrialJobs.push(this.getSinglePAITrialJobInfo(paiTrialJob, paiToken, paiClusterConfig)); + updatePaiTrialJobs.push(this.getSinglePAITrialJobInfo(paiBaseTrialJob, token, paiBaseClusterConfig)); } await Promise.all(updatePaiTrialJobs); } - private getSinglePAITrialJobInfo(paiTrialJob: PAITrialJobDetail, paiToken: string, paiClusterConfig: PAIClusterConfig): Promise { + private getSinglePAITrialJobInfo(paiBaseTrialJob: PAIBaseTrialJobDetail, paiToken: string, paiBaseClusterConfig: PAIBaseClusterConfig): Promise { const deferred: Deferred = new Deferred(); - if (!this.statusesNeedToCheck.includes(paiTrialJob.status)) { + if (!this.statusesNeedToCheck.includes(paiBaseTrialJob.status)) { deferred.resolve(); return deferred.promise; @@ -53,10 +52,10 @@ export class PAIJobInfoCollector { // Rest call to get PAI job info and update status // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API const getJobInfoRequest: request.Options = { - uri: `http://${paiClusterConfig.host}/rest-server/api/v1/user/${paiClusterConfig.userName}/jobs/${paiTrialJob.paiJobName}`, + uri: `http://${paiBaseClusterConfig.host}/rest-server/api/v1/user/${paiBaseClusterConfig.userName}/jobs/${paiBaseTrialJob.paiJobName}`, method: 'GET', json: true, - headers: { + headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${paiToken}` } @@ -65,57 +64,57 @@ export class PAIJobInfoCollector { //TODO : pass in request timeout param? request(getJobInfoRequest, (error: Error, response: request.Response, body: any) => { if ((error !== undefined && error !== null) || response.statusCode >= 500) { - this.log.error(`PAI Training service: get job info for trial ${paiTrialJob.id} from PAI Cluster failed!`); + this.log.error(`PAI Training service: get job info for trial ${paiBaseTrialJob.id} from PAI Cluster failed!`); // Queried PAI job info failed, set job status to UNKNOWN - if (paiTrialJob.status === 'WAITING' || paiTrialJob.status === 'RUNNING') { - paiTrialJob.status = 'UNKNOWN'; + if (paiBaseTrialJob.status === 'WAITING' || paiBaseTrialJob.status === 'RUNNING') { + paiBaseTrialJob.status = 'UNKNOWN'; } } else { if (response.body.jobStatus && response.body.jobStatus.state) { switch (response.body.jobStatus.state) { case 'WAITING': - paiTrialJob.status = 'WAITING'; + paiBaseTrialJob.status = 'WAITING'; break; case 'RUNNING': - paiTrialJob.status = 'RUNNING'; - if (paiTrialJob.startTime === undefined) { - paiTrialJob.startTime = response.body.jobStatus.appLaunchedTime; + paiBaseTrialJob.status = 'RUNNING'; + if (paiBaseTrialJob.startTime === undefined) { + paiBaseTrialJob.startTime = response.body.jobStatus.appLaunchedTime; } - if (paiTrialJob.url === undefined) { - paiTrialJob.url = response.body.jobStatus.appTrackingUrl; + if (paiBaseTrialJob.url === undefined) { + paiBaseTrialJob.url = response.body.jobStatus.appTrackingUrl; } break; case 'SUCCEEDED': - paiTrialJob.status = 'SUCCEEDED'; + paiBaseTrialJob.status = 'SUCCEEDED'; break; case 'STOPPED': - if (paiTrialJob.isEarlyStopped !== undefined) { - paiTrialJob.status = paiTrialJob.isEarlyStopped === true ? + if (paiBaseTrialJob.isEarlyStopped !== undefined) { + paiBaseTrialJob.status = paiBaseTrialJob.isEarlyStopped === true ? 'EARLY_STOPPED' : 'USER_CANCELED'; } else { /* if paiTrialJob's isEarlyStopped is undefined, that mean we didn't stop it via cancellation, * mark it as SYS_CANCELLED by PAI */ - paiTrialJob.status = 'SYS_CANCELED'; + paiBaseTrialJob.status = 'SYS_CANCELED'; } break; case 'FAILED': - paiTrialJob.status = 'FAILED'; + paiBaseTrialJob.status = 'FAILED'; break; default: - paiTrialJob.status = 'UNKNOWN'; + paiBaseTrialJob.status = 'UNKNOWN'; } // For final job statues, update startTime, endTime and url - if (this.finalStatuses.includes(paiTrialJob.status)) { - if (paiTrialJob.startTime === undefined) { - paiTrialJob.startTime = response.body.jobStatus.appLaunchedTime; + if (this.finalStatuses.includes(paiBaseTrialJob.status)) { + if (paiBaseTrialJob.startTime === undefined) { + paiBaseTrialJob.startTime = response.body.jobStatus.appLaunchedTime; } - if (paiTrialJob.endTime === undefined) { - paiTrialJob.endTime = response.body.jobStatus.completedTime; + if (paiBaseTrialJob.endTime === undefined) { + paiBaseTrialJob.endTime = response.body.jobStatus.completedTime; } // Set pai trial job's url to WebHDFS output path - if (paiTrialJob.logPath !== undefined) { - paiTrialJob.url += `,${paiTrialJob.logPath}`; + if (paiBaseTrialJob.logPath !== undefined) { + paiBaseTrialJob.url += `,${paiBaseTrialJob.logPath}`; } } } diff --git a/src/nni_manager/training_service/pai/paiJobRestServer.ts b/src/nni_manager/training_service/pai_base/paiBaseJobRestServer.ts similarity index 81% rename from src/nni_manager/training_service/pai/paiJobRestServer.ts rename to src/nni_manager/training_service/pai_base/paiBaseJobRestServer.ts index ca1fc070f5..a6fe305cdc 100644 --- a/src/nni_manager/training_service/pai/paiJobRestServer.ts +++ b/src/nni_manager/training_service/pai_base/paiBaseJobRestServer.ts @@ -7,7 +7,7 @@ import { Request, Response, Router } from 'express'; import { Inject } from 'typescript-ioc'; import * as component from '../../common/component'; import { ClusterJobRestServer } from '../common/clusterJobRestServer'; -import { PAITrainingService } from './paiTrainingService'; +import { PAIBaseTrainingService } from './paiBaseTrainingService'; export interface ParameterFileMeta { readonly experimentId: string; @@ -20,25 +20,25 @@ export interface ParameterFileMeta { * */ @component.Singleton -export class PAIJobRestServer extends ClusterJobRestServer { - private parameterFileMetaList: ParameterFileMeta[] = []; +export class PAIBaseJobRestServer extends ClusterJobRestServer { + protected parameterFileMetaList: ParameterFileMeta[] = []; @Inject - private readonly paiTrainingService: PAITrainingService; + protected readonly paiBaseTrainingService: PAIBaseTrainingService; /** * constructor to provide NNIRestServer's own rest property, e.g. port */ - constructor() { + constructor(paiBaseTrainingService :PAIBaseTrainingService) { super(); - this.paiTrainingService = component.get(PAITrainingService); + this.paiBaseTrainingService = paiBaseTrainingService; } protected handleTrialMetrics(jobId: string, metrics: any[]): void { // Split metrics array into single metric, then emit // Warning: If not split metrics into single ones, the behavior will be UNKNOWN for (const singleMetric of metrics) { - this.paiTrainingService.MetricsEmitter.emit('metric', { + this.paiBaseTrainingService.MetricsEmitter.emit('metric', { id : jobId, data : singleMetric }); diff --git a/src/nni_manager/training_service/pai_base/paiBaseTrainingService.ts b/src/nni_manager/training_service/pai_base/paiBaseTrainingService.ts new file mode 100644 index 0000000000..4a9216783f --- /dev/null +++ b/src/nni_manager/training_service/pai_base/paiBaseTrainingService.ts @@ -0,0 +1,297 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +'use strict'; + +import * as fs from 'fs'; +import * as path from 'path'; +import * as request from 'request'; +import * as component from '../../common/component'; + +import { EventEmitter } from 'events'; +import { Deferred } from 'ts-deferred'; +import { String } from 'typescript-string-operations'; +import { getExperimentId } from '../../common/experimentStartupInfo'; +import { getLogger, Logger } from '../../common/log'; +import { + HyperParameters, NNIManagerIpConfig, TrainingService, + TrialJobApplicationForm, TrialJobDetail, TrialJobMetric +} from '../../common/trainingService'; +import { delay, generateParamFileName, + getExperimentRootDir, getIPV4Address, getVersion, uniqueString, unixPathJoin } from '../../common/utils'; +import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; +import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; +import { execMkdir, validateCodeDir } from '../common/util'; +import { PAIBaseJobInfoCollector } from './paiBaseJobInfoCollector'; +import { PAIBaseJobRestServer, ParameterFileMeta } from './paiBaseJobRestServer'; +import { PAIBaseClusterConfig, PAIBaseTrialJobDetail } from './paiBaseConfig'; + +/** + * Training Service implementation for OpenPAI (Open Platform for AI) + * Refer https://github.com/Microsoft/pai for more info about OpenPAI + */ +@component.Singleton +class PAIBaseTrainingService implements TrainingService { + protected readonly log!: Logger; + protected readonly metricsEmitter: EventEmitter; + protected readonly trialJobsMap: Map; + protected readonly expRootDir: string; + protected paiBaseClusterConfig?: PAIBaseClusterConfig; + protected readonly jobQueue: string[]; + protected stopping: boolean = false; + protected paiToken? : string; + protected paiTokenUpdateTime?: number; + protected readonly paiTokenUpdateInterval: number; + protected readonly experimentId!: string; + protected readonly paiJobCollector: PAIBaseJobInfoCollector; + protected paiRestServerPort?: number; + protected nniManagerIpConfig?: NNIManagerIpConfig; + protected versionCheck: boolean = true; + protected logCollection: string; + protected isMultiPhase: boolean = false; + protected authFileHdfsPath: string | undefined = undefined; + protected portList?: string | undefined; + + constructor() { + this.log = getLogger(); + this.metricsEmitter = new EventEmitter(); + this.trialJobsMap = new Map(); + this.jobQueue = []; + this.expRootDir = path.join('/nni', 'experiments', getExperimentId()); + this.experimentId = getExperimentId(); + this.paiJobCollector = new PAIBaseJobInfoCollector(this.trialJobsMap); + this.paiTokenUpdateInterval = 7200000; //2hours + this.logCollection = 'none'; + this.log.info('Construct paiBase training service.'); + } + + public async run(): Promise { + return; + } + + public async submitTrialJob(form: TrialJobApplicationForm): Promise { + return null; + } + + public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise { + const trialJobDetail: undefined | TrialJobDetail = this.trialJobsMap.get(trialJobId); + if (trialJobDetail === undefined) { + throw new Error(`updateTrialJob failed: ${trialJobId} not found`); + } + + return trialJobDetail; + } + + protected async submitTrialJobToPAI(trialJobId: string): Promise { + return true; + } + + protected async submitJobLoop(): Promise { + while (!this.stopping) { + while (!this.stopping && this.jobQueue.length > 0) { + const trialJobId: string = this.jobQueue[0]; + if (await this.submitTrialJobToPAI(trialJobId)) { + // Remove trial job with trialJobId from job queue + this.jobQueue.shift(); + } else { + // Break the while loop since failed to submitJob + break; + } + } + await delay(3000); + } + } + + public async setClusterMetadata(key: string, value: string): Promise { + return; + } + + public async listTrialJobs(): Promise { + const jobs: TrialJobDetail[] = []; + + for (const [key, value] of this.trialJobsMap) { + jobs.push(await this.getTrialJob(key)); + } + + return Promise.resolve(jobs); + } + + public async getTrialJob(trialJobId: string): Promise { + if (this.paiBaseClusterConfig === undefined) { + throw new Error('PAI Cluster config is not initialized'); + } + + const paiBaseTrialJob: PAIBaseTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); + + if (paiBaseTrialJob === undefined) { + return Promise.reject(`trial job ${trialJobId} not found`); + } + + return Promise.resolve(paiBaseTrialJob); + } + + public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { + this.metricsEmitter.on('metric', listener); + } + + public removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { + this.metricsEmitter.off('metric', listener); + } + + public get isMultiPhaseJobSupported(): boolean { + return true; + } + + public cancelTrialJob(trialJobId: string, isEarlyStopped: boolean = false): Promise { + const trialJobDetail: PAIBaseTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); + const deferred: Deferred = new Deferred(); + if (trialJobDetail === undefined) { + this.log.error(`cancelTrialJob: trial job id ${trialJobId} not found`); + + return Promise.reject(); + } + + if (this.paiBaseClusterConfig === undefined) { + throw new Error('PAI Cluster config is not initialized'); + } + if (this.paiToken === undefined) { + throw new Error('PAI token is not initialized'); + } + + const stopJobRequest: request.Options = { + uri: `http://${this.paiBaseClusterConfig.host}/rest-server/api/v1/user/${this.paiBaseClusterConfig.userName}\ +/jobs/${trialJobDetail.paiJobName}/executionType`, + method: 'PUT', + json: true, + body: {value: 'STOP'}, + headers: { + 'Content-Type': 'application/json', + Authorization: `Bearer ${this.paiToken}` + } + }; + + // Set trialjobDetail's early stopped field, to mark the job's cancellation source + trialJobDetail.isEarlyStopped = isEarlyStopped; + + request(stopJobRequest, (error: Error, response: request.Response, body: any) => { + if ((error !== undefined && error !== null) || response.statusCode >= 400) { + this.log.error(`PAI Training service: stop trial ${trialJobId} to PAI Cluster failed!`); + deferred.reject((error !== undefined && error !== null) ? error.message : + `Stop trial failed, http code: ${response.statusCode}`); + } else { + deferred.resolve(); + } + }); + + return deferred.promise; + } + + public getClusterMetadata(key: string): Promise { + const deferred: Deferred = new Deferred(); + + deferred.resolve(); + + return deferred.promise; + } + + public async cleanUp(): Promise { + this.log.info('Stopping PAI training service...'); + this.stopping = true; + + const deferred: Deferred = new Deferred(); + const restServer: PAIBaseJobRestServer = component.get(PAIBaseJobRestServer); + try { + await restServer.stop(); + deferred.resolve(); + this.log.info('PAI Training service rest server stopped successfully.'); + } catch (error) { + this.log.error(`PAI Training service rest server stopped failed, error: ${error.message}`); + deferred.reject(error); + } + + return deferred.promise; + } + + public get MetricsEmitter(): EventEmitter { + return this.metricsEmitter; + } + + protected async statusCheckingLoop(): Promise { + while (!this.stopping) { + if(this.paiBaseClusterConfig && this.paiBaseClusterConfig.passWord) { + try { + await this.updatePaiToken(); + } catch (error) { + this.log.error(`${error}`); + //only throw error when initlize paiToken first time + if (this.paiToken === undefined) { + throw new Error(error); + } + } + } + await this.paiJobCollector.retrieveTrialStatus(this.paiToken, this.paiBaseClusterConfig); + const restServer: PAIBaseJobRestServer = component.get(PAIBaseJobRestServer); + if (restServer.getErrorMessage !== undefined) { + throw new Error(restServer.getErrorMessage); + } + await delay(3000); + } + } + + /** + * Update pai token by the interval time or initialize the pai token + */ + protected async updatePaiToken(): Promise { + const deferred: Deferred = new Deferred(); + + const currentTime: number = new Date().getTime(); + //If pai token initialized and not reach the interval time, do not update + if (this.paiTokenUpdateTime !== undefined && (currentTime - this.paiTokenUpdateTime) < this.paiTokenUpdateInterval) { + return Promise.resolve(); + } + + if (this.paiBaseClusterConfig === undefined) { + const paiClusterConfigError: string = `pai cluster config not initialized!`; + this.log.error(`${paiClusterConfigError}`); + throw Error(`${paiClusterConfigError}`); + } + + const authenticationReq: request.Options = { + uri: `http://${this.paiBaseClusterConfig.host}/rest-server/api/v1/token`, + method: 'POST', + json: true, + body: { + username: this.paiBaseClusterConfig.userName, + password: this.paiBaseClusterConfig.passWord + } + }; + + request(authenticationReq, (error: Error, response: request.Response, body: any) => { + if (error !== undefined && error !== null) { + this.log.error(`Get PAI token failed: ${error.message}`); + deferred.reject(new Error(`Get PAI token failed: ${error.message}`)); + } else { + if (response.statusCode !== 200) { + this.log.error(`Get PAI token failed: get PAI Rest return code ${response.statusCode}`); + deferred.reject(new Error(`Get PAI token failed: ${response.body}, please check paiConfig username or password`)); + } + this.paiToken = body.token; + this.paiTokenUpdateTime = new Date().getTime(); + deferred.resolve(); + } + }); + + let timeoutId: NodeJS.Timer; + const timeoutDelay: Promise = new Promise((resolve: Function, reject: Function): void => { + // Set timeout and reject the promise once reach timeout (5 seconds) + timeoutId = setTimeout( + () => reject(new Error('Get PAI token timeout. Please check your PAI cluster.')), + 5000); + }); + + return Promise.race([timeoutDelay, deferred.promise]) + .finally(() => { clearTimeout(timeoutId); }); + } +} + +export { PAIBaseTrainingService }; diff --git a/src/nni_manager/training_service/pai/hdfsClientUtility.ts b/src/nni_manager/training_service/pai_base/paiYarn/hdfsClientUtility.ts similarity index 97% rename from src/nni_manager/training_service/pai/hdfsClientUtility.ts rename to src/nni_manager/training_service/pai_base/paiYarn/hdfsClientUtility.ts index 876cd9e0ac..aa00a15519 100644 --- a/src/nni_manager/training_service/pai/hdfsClientUtility.ts +++ b/src/nni_manager/training_service/pai_base/paiYarn/hdfsClientUtility.ts @@ -4,9 +4,9 @@ import * as fs from 'fs'; import * as path from 'path'; import { Deferred } from 'ts-deferred'; -import { getExperimentId } from '../../common/experimentStartupInfo'; -import { getLogger } from '../../common/log'; -import { unixPathJoin } from '../../common/utils'; +import { getExperimentId } from '../../../common/experimentStartupInfo'; +import { getLogger } from '../../../common/log'; +import { unixPathJoin } from '../../../common/utils'; /** * HDFS client utility, including copy file/directory diff --git a/src/nni_manager/training_service/pai/paiConfig.ts b/src/nni_manager/training_service/pai_base/paiYarn/paiYarnConfig.ts similarity index 86% rename from src/nni_manager/training_service/pai/paiConfig.ts rename to src/nni_manager/training_service/pai_base/paiYarn/paiYarnConfig.ts index 7110c7f63e..918f9c3a1f 100644 --- a/src/nni_manager/training_service/pai/paiConfig.ts +++ b/src/nni_manager/training_service/pai_base/paiYarn/paiYarnConfig.ts @@ -3,7 +3,7 @@ 'use strict'; -import {TrialConfig} from '../common/trialConfig'; +import {TrialConfig} from '../../common/trialConfig'; /** * Task role for PAI @@ -86,30 +86,6 @@ export class PAIJobConfig { } } -/** - * PAI cluster configuration - */ -export class PAIClusterConfig { - public readonly userName: string; - public readonly passWord?: string; - public readonly host: string; - public readonly token?: string; - - /** - * Constructor - * @param userName User name of PAI Cluster - * @param passWord password of PAI Cluster - * @param host Host IP of PAI Cluster - * @param token PAI token of PAI Cluster - */ - constructor(userName: string, host: string, passWord?: string, token?: string) { - this.userName = userName; - this.passWord = passWord; - this.host = host; - this.token = token; - } -} - /** * portList data structure used in PAI taskRole */ diff --git a/src/nni_manager/training_service/pai_base/paiYarn/paiYarnData.ts b/src/nni_manager/training_service/pai_base/paiYarn/paiYarnData.ts new file mode 100644 index 0000000000..3ba015298f --- /dev/null +++ b/src/nni_manager/training_service/pai_base/paiYarn/paiYarnData.ts @@ -0,0 +1,26 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +'use strict'; + +import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../../common/trainingService'; + +export const PAI_INSTALL_NNI_SHELL_FORMAT: string = +`#!/bin/bash +if python3 -c 'import nni' > /dev/null 2>&1; then + # nni module is already installed, skip + return +else + # Install nni + python3 -m pip install --user nni +fi`; + +export const PAI_TRIAL_COMMAND_FORMAT: string = +`export NNI_PLATFORM=paiYarn NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \ +&& cd $NNI_SYS_DIR && sh install_nni.sh \ +&& python3 -m nni_trial_tool.trial_keeper --trial_command '{6}' --nnimanager_ip '{7}' --nnimanager_port '{8}' \ +--pai_hdfs_output_dir '{9}' --pai_hdfs_host '{10}' --pai_user_name {11} --nni_hdfs_exp_dir '{12}' --webhdfs_path '/webhdfs/api/v1' \ +--nni_manager_version '{13}' --log_collection '{14}'`; + +export const PAI_LOG_PATH_FORMAT: string = +`http://{0}/webhdfs/explorer.html#{1}`; diff --git a/src/nni_manager/training_service/pai_base/paiYarn/paiYarnJobRestServer.ts b/src/nni_manager/training_service/pai_base/paiYarn/paiYarnJobRestServer.ts new file mode 100644 index 0000000000..15876664c0 --- /dev/null +++ b/src/nni_manager/training_service/pai_base/paiYarn/paiYarnJobRestServer.ts @@ -0,0 +1,33 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +'use strict'; + +import { Request, Response, Router } from 'express'; +import { Inject } from 'typescript-ioc'; +import * as component from '../../../common/component'; +import { ClusterJobRestServer } from '../../common/clusterJobRestServer'; +import { PAIYarnTrainingService } from './paiYarnTrainingService'; +import { PAIBaseJobRestServer } from '../paiBaseJobRestServer'; + +export interface ParameterFileMeta { + readonly experimentId: string; + readonly trialId: string; + readonly filePath: string; +} + +/** + * PAI Training service Rest server, provides rest API to support pai job metrics update + * + */ +@component.Singleton +export class PAIYarnJobRestServer extends PAIBaseJobRestServer { + protected parameterFileMetaList: ParameterFileMeta[] = []; + + /** + * constructor to provide NNIRestServer's own rest property, e.g. port + */ + constructor() { + super(component.get(PAIYarnTrainingService)); + } +} diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai_base/paiYarn/paiYarnTrainingService.ts similarity index 68% rename from src/nni_manager/training_service/pai/paiTrainingService.ts rename to src/nni_manager/training_service/pai_base/paiYarn/paiYarnTrainingService.ts index 3bdaa6983f..949c09cf99 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai_base/paiYarn/paiYarnTrainingService.ts @@ -6,27 +6,29 @@ import * as fs from 'fs'; import * as path from 'path'; import * as request from 'request'; -import * as component from '../../common/component'; +import * as component from '../../../common/component'; import { EventEmitter } from 'events'; import { Deferred } from 'ts-deferred'; import { String } from 'typescript-string-operations'; -import { getExperimentId } from '../../common/experimentStartupInfo'; -import { getLogger, Logger } from '../../common/log'; +import { getExperimentId } from '../../../common/experimentStartupInfo'; +import { getLogger, Logger } from '../../../common/log'; import { HyperParameters, NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric -} from '../../common/trainingService'; +} from '../../../common/trainingService'; import { delay, generateParamFileName, - getExperimentRootDir, getIPV4Address, getVersion, uniqueString, unixPathJoin } from '../../common/utils'; -import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; -import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; -import { execMkdir, validateCodeDir } from '../common/util'; + getExperimentRootDir, getIPV4Address, getVersion, uniqueString, unixPathJoin } from '../../../common/utils'; +import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData'; +import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; +import { execMkdir, validateCodeDir } from '../../common/util'; import { HDFSClientUtility } from './hdfsClientUtility'; -import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig'; -import { PAI_LOG_PATH_FORMAT, PAI_TRIAL_COMMAND_FORMAT, PAITrialJobDetail } from './paiData'; -import { PAIJobInfoCollector } from './paiJobInfoCollector'; -import { PAIJobRestServer, ParameterFileMeta } from './paiJobRestServer'; +import { NNIPAITrialConfig, PAIJobConfig, PAITaskRole } from './paiYarnConfig'; +import { PAI_LOG_PATH_FORMAT, PAI_TRIAL_COMMAND_FORMAT } from './paiYarnData'; +import { PAIBaseJobInfoCollector } from '../paiBaseJobInfoCollector'; +import { PAIYarnJobRestServer, ParameterFileMeta } from './paiYarnJobRestServer'; +import { PAIBaseTrainingService } from '../paiBaseTrainingService'; +import { PAIBaseClusterConfig, PAIBaseTrialJobDetail } from '../paiBaseConfig'; import * as WebHDFS from 'webhdfs'; @@ -35,48 +37,19 @@ import * as WebHDFS from 'webhdfs'; * Refer https://github.com/Microsoft/pai for more info about OpenPAI */ @component.Singleton -class PAITrainingService implements TrainingService { - protected readonly log!: Logger; - protected readonly metricsEmitter: EventEmitter; - protected readonly trialJobsMap: Map; - protected readonly expRootDir: string; - protected paiTrialConfig: NNIPAITrialConfig | undefined; - protected paiClusterConfig?: PAIClusterConfig; - protected readonly jobQueue: string[]; - protected stopping: boolean = false; +class PAIYarnTrainingService extends PAIBaseTrainingService { private hdfsClient: any; - protected paiToken? : string; - protected paiTokenUpdateTime?: number; - protected readonly paiTokenUpdateInterval: number; - protected readonly experimentId!: string; - protected readonly paiJobCollector: PAIJobInfoCollector; - protected paiRestServerPort?: number; - protected nniManagerIpConfig?: NNIManagerIpConfig; private copyExpCodeDirPromise?: Promise; private copyAuthFilePromise?: Promise; - protected versionCheck: boolean = true; - protected logCollection: string; - protected isMultiPhase: boolean = false; - protected authFileHdfsPath: string | undefined = undefined; - protected portList?: string | undefined; + private paiTrialConfig?: NNIPAITrialConfig; constructor() { - this.log = getLogger(); - this.metricsEmitter = new EventEmitter(); - this.trialJobsMap = new Map(); - this.jobQueue = []; - // Root dir on HDFS - this.expRootDir = path.join('/nni', 'experiments', getExperimentId()); - this.experimentId = getExperimentId(); - this.paiJobCollector = new PAIJobInfoCollector(this.trialJobsMap); - this.paiTokenUpdateInterval = 7200000; //2hours - this.logCollection = 'none'; - this.log.info('Construct OpenPAI training service.'); + super(); } public async run(): Promise { - this.log.info('Run PAI training service.'); - const restServer: PAIJobRestServer = component.get(PAIJobRestServer); + this.log.info('Run PAIYarn training service.'); + const restServer: PAIYarnJobRestServer = component.get(PAIYarnJobRestServer); await restServer.start(); restServer.setEnableVersionCheck = this.versionCheck; this.log.info(`PAI Training service rest server listening on: ${restServer.endPoint}`); @@ -86,30 +59,6 @@ class PAITrainingService implements TrainingService { this.log.info('PAI training service exit.'); } - public async listTrialJobs(): Promise { - const jobs: TrialJobDetail[] = []; - - for (const [key, value] of this.trialJobsMap) { - jobs.push(await this.getTrialJob(key)); - } - - return Promise.resolve(jobs); - } - - public async getTrialJob(trialJobId: string): Promise { - if (this.paiClusterConfig === undefined) { - throw new Error('PAI Cluster config is not initialized'); - } - - const paiTrialJob: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); - - if (paiTrialJob === undefined) { - return Promise.reject(`trial job ${trialJobId} not found`); - } - - return Promise.resolve(paiTrialJob); - } - public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { this.metricsEmitter.on('metric', listener); } @@ -119,10 +68,10 @@ class PAITrainingService implements TrainingService { } public async submitTrialJob(form: TrialJobApplicationForm): Promise { - if (this.paiClusterConfig === undefined) { - throw new Error(`paiClusterConfig not initialized!`); + if (this.paiBaseClusterConfig === undefined) { + throw new Error(`paiBaseClusterConfig not initialized!`); } - const deferred: Deferred = new Deferred(); + const deferred: Deferred = new Deferred(); this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`); @@ -130,16 +79,16 @@ class PAITrainingService implements TrainingService { //TODO: use HDFS working folder instead const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId); const paiJobName: string = `nni_exp_${this.experimentId}_trial_${trialJobId}`; - const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiClusterConfig.userName, trialJobId); + const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiBaseClusterConfig.userName, trialJobId); const hdfsOutputDir: string = unixPathJoin(hdfsCodeDir, 'nnioutput'); const hdfsLogPath: string = String.Format( PAI_LOG_PATH_FORMAT, - this.paiClusterConfig.host, + this.paiBaseClusterConfig.host, hdfsOutputDir ); - const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail( + const trialJobDetail: PAIBaseTrialJobDetail = new PAIBaseTrialJobDetail( trialJobId, 'WAITING', paiJobName, @@ -155,64 +104,6 @@ class PAITrainingService implements TrainingService { return deferred.promise; } - public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise { - const trialJobDetail: undefined | TrialJobDetail = this.trialJobsMap.get(trialJobId); - if (trialJobDetail === undefined) { - throw new Error(`updateTrialJob failed: ${trialJobId} not found`); - } - await this.writeParameterFile(trialJobId, form.hyperParameters); - - return trialJobDetail; - } - - public get isMultiPhaseJobSupported(): boolean { - return true; - } - - public cancelTrialJob(trialJobId: string, isEarlyStopped: boolean = false): Promise { - const trialJobDetail: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); - const deferred: Deferred = new Deferred(); - if (trialJobDetail === undefined) { - this.log.error(`cancelTrialJob: trial job id ${trialJobId} not found`); - - return Promise.reject(); - } - - if (this.paiClusterConfig === undefined) { - throw new Error('PAI Cluster config is not initialized'); - } - if (this.paiToken === undefined) { - throw new Error('PAI token is not initialized'); - } - - const stopJobRequest: request.Options = { - uri: `http://${this.paiClusterConfig.host}/rest-server/api/v1/user/${this.paiClusterConfig.userName}\ -/jobs/${trialJobDetail.paiJobName}/executionType`, - method: 'PUT', - json: true, - body: {value: 'STOP'}, - headers: { - 'Content-Type': 'application/json', - Authorization: `Bearer ${this.paiToken}` - } - }; - - // Set trialjobDetail's early stopped field, to mark the job's cancellation source - trialJobDetail.isEarlyStopped = isEarlyStopped; - - request(stopJobRequest, (error: Error, response: request.Response, body: any) => { - if ((error !== undefined && error !== null) || response.statusCode >= 400) { - this.log.error(`PAI Training service: stop trial ${trialJobId} to PAI Cluster failed!`); - deferred.reject((error !== undefined && error !== null) ? error.message : - `Stop trial failed, http code: ${response.statusCode}`); - } else { - deferred.resolve(); - } - }); - - return deferred.promise; - } - public async setClusterMetadata(key: string, value: string): Promise { const deferred: Deferred = new Deferred(); @@ -222,30 +113,30 @@ class PAITrainingService implements TrainingService { deferred.resolve(); break; - case TrialConfigMetadataKey.PAI_CLUSTER_CONFIG: - this.paiClusterConfig = JSON.parse(value); + case TrialConfigMetadataKey.PAI_YARN_CLUSTER_CONFIG: + this.paiBaseClusterConfig = JSON.parse(value); this.hdfsClient = WebHDFS.createClient({ - user: this.paiClusterConfig.userName, + user: this.paiBaseClusterConfig.userName, // Refer PAI document for Pylon mapping https://github.com/Microsoft/pai/tree/master/docs/pylon port: 80, path: '/webhdfs/api/v1', - host: this.paiClusterConfig.host + host: this.paiBaseClusterConfig.host }); - if(this.paiClusterConfig.passWord) { + if(this.paiBaseClusterConfig.passWord) { // Get PAI authentication token await this.updatePaiToken(); - } else if(this.paiClusterConfig.token) { - this.paiToken = this.paiClusterConfig.token; + } else if(this.paiBaseClusterConfig.token) { + this.paiToken = this.paiBaseClusterConfig.token; } else { - deferred.reject(new Error('pai cluster config format error, please set password or token!')); + deferred.reject(new Error('paiBase cluster config format error, please set password or token!')); } deferred.resolve(); break; case TrialConfigMetadataKey.TRIAL_CONFIG: - if (this.paiClusterConfig === undefined) { + if (this.paiBaseClusterConfig === undefined) { this.log.error('pai cluster config is not initialized'); deferred.reject(new Error('pai cluster config is not initialized')); break; @@ -264,13 +155,13 @@ class PAITrainingService implements TrainingService { // Copy experiment files from local folder to HDFS this.copyExpCodeDirPromise = HDFSClientUtility.copyDirectoryToHdfs( this.paiTrialConfig.codeDir, - HDFSClientUtility.getHdfsExpCodeDir(this.paiClusterConfig.userName), + HDFSClientUtility.getHdfsExpCodeDir(this.paiBaseClusterConfig.userName), this.hdfsClient ); // Upload authFile to hdfs if (this.paiTrialConfig.authFile) { - this.authFileHdfsPath = unixPathJoin(HDFSClientUtility.hdfsExpRootDir(this.paiClusterConfig.userName), 'authFile'); + this.authFileHdfsPath = unixPathJoin(HDFSClientUtility.hdfsExpRootDir(this.paiBaseClusterConfig.userName), 'authFile'); this.copyAuthFilePromise = HDFSClientUtility.copyFileToHdfs(this.paiTrialConfig.authFile, this.authFileHdfsPath, this.hdfsClient); } @@ -306,7 +197,7 @@ class PAITrainingService implements TrainingService { this.stopping = true; const deferred: Deferred = new Deferred(); - const restServer: PAIJobRestServer = component.get(PAIJobRestServer); + const restServer: PAIYarnJobRestServer = component.get(PAIYarnJobRestServer); try { await restServer.stop(); deferred.resolve(); @@ -325,13 +216,13 @@ class PAITrainingService implements TrainingService { protected async submitTrialJobToPAI(trialJobId: string): Promise { const deferred: Deferred = new Deferred(); - const trialJobDetail: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); + const trialJobDetail: PAIBaseTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); if (trialJobDetail === undefined) { throw new Error(`Failed to find PAITrialJobDetail for job ${trialJobId}`); } - if (this.paiClusterConfig === undefined) { + if (this.paiBaseClusterConfig === undefined) { throw new Error('PAI Cluster config is not initialized'); } if (this.paiTrialConfig === undefined) { @@ -342,7 +233,7 @@ class PAITrainingService implements TrainingService { } if (this.paiRestServerPort === undefined) { - const restServer: PAIJobRestServer = component.get(PAIJobRestServer); + const restServer: PAIYarnJobRestServer = component.get(PAIYarnJobRestServer); this.paiRestServerPort = restServer.clusterRestServerPort; } @@ -372,7 +263,7 @@ class PAITrainingService implements TrainingService { trialJobDetail.form.hyperParameters.value, { encoding: 'utf8' } ); } - const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiClusterConfig.userName, trialJobId); + const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiBaseClusterConfig.userName, trialJobId); const hdfsOutputDir: string = unixPathJoin(hdfsCodeDir, 'nnioutput'); const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address(); const version: string = this.versionCheck ? await getVersion() : ''; @@ -389,9 +280,9 @@ class PAITrainingService implements TrainingService { nniManagerIp, this.paiRestServerPort, hdfsOutputDir, - this.paiClusterConfig.host, - this.paiClusterConfig.userName, - HDFSClientUtility.getHdfsExpCodeDir(this.paiClusterConfig.userName), + this.paiBaseClusterConfig.host, + this.paiBaseClusterConfig.userName, + HDFSClientUtility.getHdfsExpCodeDir(this.paiBaseClusterConfig.userName), version, this.logCollection ) @@ -447,7 +338,7 @@ class PAITrainingService implements TrainingService { // Step 3. Submit PAI job via Rest call // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API const submitJobRequest: request.Options = { - uri: `http://${this.paiClusterConfig.host}/rest-server/api/v1/user/${this.paiClusterConfig.userName}/jobs`, + uri: `http://${this.paiBaseClusterConfig.host}/rest-server/api/v1/user/${this.paiBaseClusterConfig.userName}/jobs`, method: 'POST', json: true, body: paiJobConfig, @@ -473,7 +364,7 @@ class PAITrainingService implements TrainingService { protected async statusCheckingLoop(): Promise { while (!this.stopping) { - if(this.paiClusterConfig && this.paiClusterConfig.passWord) { + if(this.paiBaseClusterConfig && this.paiBaseClusterConfig.passWord) { try { await this.updatePaiToken(); } catch (error) { @@ -484,8 +375,8 @@ class PAITrainingService implements TrainingService { } } } - await this.paiJobCollector.retrieveTrialStatus(this.paiToken, this.paiClusterConfig); - const restServer: PAIJobRestServer = component.get(PAIJobRestServer); + await this.paiJobCollector.retrieveTrialStatus(this.paiToken, this.paiBaseClusterConfig); + const restServer: PAIYarnJobRestServer = component.get(PAIYarnJobRestServer); if (restServer.getErrorMessage !== undefined) { throw new Error(restServer.getErrorMessage); } @@ -493,22 +384,6 @@ class PAITrainingService implements TrainingService { } } - protected async submitJobLoop(): Promise { - while (!this.stopping) { - while (!this.stopping && this.jobQueue.length > 0) { - const trialJobId: string = this.jobQueue[0]; - if (await this.submitTrialJobToPAI(trialJobId)) { - // Remove trial job with trialJobId from job queue - this.jobQueue.shift(); - } else { - // Break the while loop since failed to submitJob - break; - } - } - await delay(3000); - } - } - /** * Update pai token by the interval time or initialize the pai token */ @@ -521,19 +396,19 @@ class PAITrainingService implements TrainingService { return Promise.resolve(); } - if (this.paiClusterConfig === undefined) { + if (this.paiBaseClusterConfig === undefined) { const paiClusterConfigError: string = `pai cluster config not initialized!`; this.log.error(`${paiClusterConfigError}`); throw Error(`${paiClusterConfigError}`); } const authenticationReq: request.Options = { - uri: `http://${this.paiClusterConfig.host}/rest-server/api/v1/token`, + uri: `http://${this.paiBaseClusterConfig.host}/rest-server/api/v1/token`, method: 'POST', json: true, body: { - username: this.paiClusterConfig.userName, - password: this.paiClusterConfig.passWord + username: this.paiBaseClusterConfig.userName, + password: this.paiBaseClusterConfig.passWord } }; @@ -564,8 +439,19 @@ class PAITrainingService implements TrainingService { .finally(() => { clearTimeout(timeoutId); }); } + + public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise { + const trialJobDetail: undefined | TrialJobDetail = this.trialJobsMap.get(trialJobId); + if (trialJobDetail === undefined) { + throw new Error(`updateTrialJob failed: ${trialJobId} not found`); + } + await this.writeParameterFile(trialJobId, form.hyperParameters); + + return trialJobDetail; + } + protected async writeParameterFile(trialJobId: string, hyperParameters: HyperParameters): Promise { - if (this.paiClusterConfig === undefined) { + if (this.paiBaseClusterConfig === undefined) { throw new Error('PAI Cluster config is not initialized'); } if (this.paiTrialConfig === undefined) { @@ -576,7 +462,7 @@ class PAITrainingService implements TrainingService { const hpFileName: string = generateParamFileName(hyperParameters); const localFilepath: string = path.join(trialLocalTempFolder, hpFileName); await fs.promises.writeFile(localFilepath, hyperParameters.value, { encoding: 'utf8' }); - const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiClusterConfig.userName, trialJobId); + const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiBaseClusterConfig.userName, trialJobId); const hdfsHpFilePath: string = path.join(hdfsCodeDir, hpFileName); await HDFSClientUtility.copyFileToHdfs(localFilepath, hdfsHpFilePath, this.hdfsClient); @@ -590,7 +476,7 @@ class PAITrainingService implements TrainingService { protected postParameterFileMeta(parameterFileMeta: ParameterFileMeta): Promise { const deferred: Deferred = new Deferred(); - const restServer: PAIJobRestServer = component.get(PAIJobRestServer); + const restServer: PAIYarnJobRestServer = component.get(PAIYarnJobRestServer); const req: request.Options = { uri: `${restServer.endPoint}${restServer.apiRootUrl}/parameter-file-meta`, method: 'POST', @@ -609,4 +495,4 @@ class PAITrainingService implements TrainingService { } } -export { PAITrainingService }; +export { PAIYarnTrainingService }; diff --git a/src/nni_manager/training_service/pai/paiTrialConfig.ts b/src/nni_manager/training_service/pai_base/paiYarn/paiYarnTrialConfig.ts similarity index 86% rename from src/nni_manager/training_service/pai/paiTrialConfig.ts rename to src/nni_manager/training_service/pai_base/paiYarn/paiYarnTrialConfig.ts index 8278dae3bb..51b898d6e5 100644 --- a/src/nni_manager/training_service/pai/paiTrialConfig.ts +++ b/src/nni_manager/training_service/pai_base/paiYarn/paiYarnTrialConfig.ts @@ -3,12 +3,12 @@ 'use strict'; -import {TrialConfig} from '../common/trialConfig'; +import {TrialConfig} from '../../common/trialConfig'; /** * PAI configuration to run trials */ -export class PAITrialConfig extends TrialConfig { +export class PAIYarnTrialConfig extends TrialConfig { public readonly cpuNum: number; public readonly memoryMB: number; public readonly image: string; diff --git a/src/nni_manager/training_service/test/hdfsClientUtility.test.ts b/src/nni_manager/training_service/test/hdfsClientUtility.test.ts index 013442019c..dfa6878016 100644 --- a/src/nni_manager/training_service/test/hdfsClientUtility.test.ts +++ b/src/nni_manager/training_service/test/hdfsClientUtility.test.ts @@ -9,7 +9,7 @@ import * as os from 'os'; import * as path from 'path'; import * as tmp from 'tmp'; import { cleanupUnitTest, prepareUnitTest, uniqueString } from '../../common/utils'; -import { HDFSClientUtility } from '../pai/hdfsClientUtility'; +import { HDFSClientUtility } from '../pai_base/paiYarn/hdfsClientUtility'; var WebHDFS = require('webhdfs'); var rmdir = require('rmdir'); diff --git a/src/nni_manager/training_service/test/paiTrainingService.test.ts b/src/nni_manager/training_service/test/paiYarnTrainingService.test.ts similarity index 78% rename from src/nni_manager/training_service/test/paiTrainingService.test.ts rename to src/nni_manager/training_service/test/paiYarnTrainingService.test.ts index c276b03435..5f9ef7c06e 100644 --- a/src/nni_manager/training_service/test/paiTrainingService.test.ts +++ b/src/nni_manager/training_service/test/paiYarnTrainingService.test.ts @@ -11,14 +11,14 @@ import * as component from '../../common/component'; import { TrialJobApplicationForm } from '../../common/trainingService'; import { cleanupUnitTest, prepareUnitTest } from '../../common/utils'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; -import { PAITrainingService } from '../pai/paiTrainingService'; +import { PAIYarnTrainingService } from '../pai_base/paiYarn/paiYarnTrainingService'; // TODO: copy mockedTrail.py to local folder const localCodeDir: string = tmp.dirSync().name const mockedTrialPath: string = './training_service/test/mockedTrial.py' fs.copyFileSync(mockedTrialPath, localCodeDir + '/mockedTrial.py') -describe('Unit Test for PAITrainingService', () => { +describe('Unit Test for PAIYarnTrainingService', () => { let skip: boolean = false; let testPaiClusterInfo: any; let paiCluster: any; @@ -33,7 +33,7 @@ describe('Unit Test for PAITrainingService', () => { skip = true; } - let paiTrainingService: PAITrainingService; + let paiYarnTrainingService: PAIYarnTrainingService; console.log(tmp.dirSync().name); @@ -51,15 +51,15 @@ describe('Unit Test for PAITrainingService', () => { if (skip) { return; } - paiTrainingService = component.get(PAITrainingService); - paiTrainingService.run(); + paiYarnTrainingService = component.get(PAIYarnTrainingService); + paiYarnTrainingService.run(); }); afterEach(() => { if (skip) { return; } - paiTrainingService.cleanUp(); + paiYarnTrainingService.cleanUp(); }); it('Get PAI token', async () => { @@ -67,14 +67,14 @@ describe('Unit Test for PAITrainingService', () => { return; } console.log(`paiCluster is ${paiCluster}`) - await paiTrainingService.setClusterMetadata(TrialConfigMetadataKey.PAI_CLUSTER_CONFIG, paiCluster); - await paiTrainingService.setClusterMetadata(TrialConfigMetadataKey.TRIAL_CONFIG, paiTrialConfig); + await paiYarnTrainingService.setClusterMetadata(TrialConfigMetadataKey.PAI_YARN_CLUSTER_CONFIG, paiCluster); + await paiYarnTrainingService.setClusterMetadata(TrialConfigMetadataKey.TRIAL_CONFIG, paiTrialConfig); const form: TrialJobApplicationForm = { sequenceId: 0, hyperParameters: { value: '', index: 0 } }; try { - const trialDetail = await paiTrainingService.submitTrialJob(form); + const trialDetail = await paiYarnTrainingService.submitTrialJob(form); chai.expect(trialDetail.status).to.be.equals('WAITING'); } catch(error) { console.log('Submit job failed:' + error); diff --git a/src/sdk/pynni/nni/platform/__init__.py b/src/sdk/pynni/nni/platform/__init__.py index 2a2ac7128e..6f79fd3077 100644 --- a/src/sdk/pynni/nni/platform/__init__.py +++ b/src/sdk/pynni/nni/platform/__init__.py @@ -7,7 +7,7 @@ from .standalone import * elif trial_env_vars.NNI_PLATFORM == 'unittest': from .test import * -elif trial_env_vars.NNI_PLATFORM in ('local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiLite'): +elif trial_env_vars.NNI_PLATFORM in ('local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn'): from .local import * else: raise RuntimeError('Unknown platform %s' % trial_env_vars.NNI_PLATFORM) diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index c61be32ea0..d6dcdb64ae 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -32,7 +32,7 @@ def setPathCheck(key): 'trialConcurrency': setNumberRange('trialConcurrency', int, 1, 99999), Optional('maxExecDuration'): And(Regex(r'^[1-9][0-9]*[s|m|h|d]$', error='ERROR: maxExecDuration format is [digit]{s,m,h,d}')), Optional('maxTrialNum'): setNumberRange('maxTrialNum', int, 1, 99999), - 'trainingServicePlatform': setChoice('trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller', 'paiLite'), + 'trainingServicePlatform': setChoice('trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn'), Optional('searchSpacePath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'searchSpacePath'), Optional('multiPhase'): setType('multiPhase', bool), Optional('multiThread'): setType('multiThread', bool), @@ -223,7 +223,7 @@ def setPathCheck(key): } } -pai_trial_schema = { +pai_yarn_trial_schema = { 'trial':{ 'command': setType('command', str), 'codeDir': setPathCheck('codeDir'), @@ -247,8 +247,8 @@ def setPathCheck(key): } } -pai_config_schema = { - 'paiConfig': Or({ +pai_yarn_config_schema = { + 'paiYarnConfig': Or({ 'userName': setType('userName', str), 'passWord': setType('passWord', str), 'host': setType('host', str) @@ -260,7 +260,7 @@ def setPathCheck(key): } -pai_lite_trial_schema = { +pai_trial_schema = { 'trial':{ 'command': setType('command', str), 'codeDir': setPathCheck('codeDir'), @@ -275,8 +275,8 @@ def setPathCheck(key): } } -pai_lite_config_schema = { - 'paiLiteConfig': Or({ +pai_config_schema = { + 'paiConfig': Or({ 'userName': setType('userName', str), 'passWord': setType('passWord', str), 'host': setType('host', str) @@ -424,7 +424,7 @@ def setPathCheck(key): PAI_CONFIG_SCHEMA = Schema({**common_schema, **pai_trial_schema, **pai_config_schema}) -PAI_LITE_CONFIG_SCHEMA = Schema({**common_schema, **pai_lite_trial_schema, **pai_lite_config_schema}) +PAI_YARN_CONFIG_SCHEMA = Schema({**common_schema, **pai_yarn_trial_schema, **pai_yarn_config_schema}) KUBEFLOW_CONFIG_SCHEMA = Schema({**common_schema, **kubeflow_trial_schema, **kubeflow_config_schema}) diff --git a/tools/nni_cmd/launcher.py b/tools/nni_cmd/launcher.py index dff4e5840f..3b52030f2f 100644 --- a/tools/nni_cmd/launcher.py +++ b/tools/nni_cmd/launcher.py @@ -224,11 +224,11 @@ def set_pai_config(experiment_config, port, config_file_name): #set trial_config return set_trial_config(experiment_config, port, config_file_name), err_message -def set_pai_lite_config(experiment_config, port, config_file_name): - '''set pai configuration''' - pai_lite_config_data = dict() - pai_lite_config_data['pai_lite_config'] = experiment_config['paiLiteConfig'] - response = rest_put(cluster_metadata_url(port), json.dumps(pai_lite_config_data), REST_TIME_OUT) +def set_pai_yarn_config(experiment_config, port, config_file_name): + '''set paiYarn configuration''' + pai_yarn_config_data = dict() + pai_yarn_config_data['pai_yarn_config'] = experiment_config['paiYarnConfig'] + response = rest_put(cluster_metadata_url(port), json.dumps(pai_yarn_config_data), REST_TIME_OUT) err_message = None if not response or not response.status_code == 200: if response is not None: @@ -339,9 +339,9 @@ def set_experiment(experiment_config, mode, port, config_file_name): {'key': 'pai_config', 'value': experiment_config['paiConfig']}) request_data['clusterMetaData'].append( {'key': 'trial_config', 'value': experiment_config['trial']}) - elif experiment_config['trainingServicePlatform'] == 'paiLite': + elif experiment_config['trainingServicePlatform'] == 'paiYarn': request_data['clusterMetaData'].append( - {'key': 'pai_lite_config', 'value': experiment_config['paiLiteConfig']}) + {'key': 'pai_yarn_config', 'value': experiment_config['paiYarnConfig']}) request_data['clusterMetaData'].append( {'key': 'trial_config', 'value': experiment_config['trial']}) elif experiment_config['trainingServicePlatform'] == 'kubeflow': @@ -375,8 +375,8 @@ def set_platform_config(platform, experiment_config, port, config_file_name, res config_result, err_msg = set_remote_config(experiment_config, port, config_file_name) elif platform == 'pai': config_result, err_msg = set_pai_config(experiment_config, port, config_file_name) - elif platform == 'paiLite': - config_result, err_msg = set_pai_lite_config(experiment_config, port, config_file_name) + elif platform == 'paiYarn': + config_result, err_msg = set_pai_yarn_config(experiment_config, port, config_file_name) elif platform == 'kubeflow': config_result, err_msg = set_kubeflow_config(experiment_config, port, config_file_name) elif platform == 'frameworkcontroller': diff --git a/tools/nni_cmd/launcher_utils.py b/tools/nni_cmd/launcher_utils.py index 661201e718..5c1178c979 100644 --- a/tools/nni_cmd/launcher_utils.py +++ b/tools/nni_cmd/launcher_utils.py @@ -5,7 +5,7 @@ import json from schema import SchemaError from schema import Schema -from .config_schema import LOCAL_CONFIG_SCHEMA, REMOTE_CONFIG_SCHEMA, PAI_CONFIG_SCHEMA, PAI_LITE_CONFIG_SCHEMA, KUBEFLOW_CONFIG_SCHEMA,\ +from .config_schema import LOCAL_CONFIG_SCHEMA, REMOTE_CONFIG_SCHEMA, PAI_CONFIG_SCHEMA, PAI_YARN_CONFIG_SCHEMA, KUBEFLOW_CONFIG_SCHEMA,\ FRAMEWORKCONTROLLER_CONFIG_SCHEMA, tuner_schema_dict, advisor_schema_dict, assessor_schema_dict from .common_utils import print_error, print_warning, print_normal @@ -143,14 +143,14 @@ def validate_kubeflow_operators(experiment_config): def validate_common_content(experiment_config): '''Validate whether the common values in experiment_config is valid''' if not experiment_config.get('trainingServicePlatform') or \ - experiment_config.get('trainingServicePlatform') not in ['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiLite']: + experiment_config.get('trainingServicePlatform') not in ['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn']: print_error('Please set correct trainingServicePlatform!') exit(1) schema_dict = { 'local': LOCAL_CONFIG_SCHEMA, 'remote': REMOTE_CONFIG_SCHEMA, 'pai': PAI_CONFIG_SCHEMA, - 'paiLite': PAI_LITE_CONFIG_SCHEMA, + 'paiYarn': PAI_YARN_CONFIG_SCHEMA, 'kubeflow': KUBEFLOW_CONFIG_SCHEMA, 'frameworkcontroller': FRAMEWORKCONTROLLER_CONFIG_SCHEMA } @@ -262,7 +262,7 @@ def validate_machine_list(experiment_config): def validate_pai_trial_conifg(experiment_config): '''validate the trial config in pai platform''' - if experiment_config.get('trainingServicePlatform') in ['pai', 'paiLite']: + if experiment_config.get('trainingServicePlatform') in ['pai', 'PAIYarn']: if experiment_config.get('trial').get('shmMB') and \ experiment_config['trial']['shmMB'] > experiment_config['trial']['memoryMB']: print_error('shmMB should be no more than memoryMB!') From 6ac3e18101dd9077ac16ed418ff722dd9c41f3c0 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Sun, 15 Dec 2019 22:53:23 +0800 Subject: [PATCH 08/18] add doc --- docs/en_US/TrainingService/PaiMode.md | 62 ++-------- docs/en_US/TrainingService/PaiYarnMode.md | 136 ++++++++++++++++++++++ 2 files changed, 148 insertions(+), 50 deletions(-) create mode 100644 docs/en_US/TrainingService/PaiYarnMode.md diff --git a/docs/en_US/TrainingService/PaiMode.md b/docs/en_US/TrainingService/PaiMode.md index 4a8b5ab87e..0f8e9b58b2 100644 --- a/docs/en_US/TrainingService/PaiMode.md +++ b/docs/en_US/TrainingService/PaiMode.md @@ -34,10 +34,13 @@ trial: cpuNum: 1 memoryMB: 8196 image: msranni/nni:latest + virtualCluster: default + nniManagerNFSMountPath: /home/user/mnt + containerNFSMountPath: /mnt/data/user # Configuration to access OpenPAI Cluster paiConfig: userName: your_pai_nni_user - passWord: your_pai_password + token: your_pai_token host: 10.1.1.1 ``` @@ -52,53 +55,14 @@ Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMod * Required key. In pai mode, your trial program will be scheduled by OpenPAI to run in [Docker container](https://www.docker.com/). This key is used to specify the Docker image used to create the container in which your trial will run. * We already build a docker image [nnimsra/nni](https://hub.docker.com/r/msranni/nni/) on [Docker Hub](https://hub.docker.com/). It contains NNI python packages, Node modules and javascript artifact files required to start experiment, and all of NNI dependencies. The docker file used to build this image can be found at [here](https://github.com/Microsoft/nni/tree/master/deployment/docker/Dockerfile). You can either use this image directly in your config file, or build your own image based on it. * virtualCluster - * Optional key. Set the virtualCluster of OpenPAI. If omitted, the job will run on default virtual cluster. -* shmMB - * Optional key. Set the shmMB configuration of OpenPAI, it set the shared memory for one task in the task role. -* authFile - * Optional key, Set the auth file path for private registry while using PAI mode, [Refer](https://github.com/microsoft/pai/blob/2ea69b45faa018662bc164ed7733f6fdbb4c42b3/docs/faq.md#q-how-to-use-private-docker-registry-job-image-when-submitting-an-openpai-job), you can prepare the authFile and simply provide the local path of this file, NNI will upload this file to HDFS for you. -* portList - * Optional key. Set the portList configuration of OpenPAI, it specifies a list of port used in container, [Refer](https://github.com/microsoft/pai/blob/b2324866d0280a2d22958717ea6025740f71b9f0/docs/job_tutorial.md#specification). - The config schema in NNI is shown below: - ``` - portList: - - label: test - beginAt: 8080 - portNumber: 2 - ``` - Let's say you want to launch a tensorboard in the mnist example using the port. So the first step is to write a wrapper script `launch_pai.sh` of `mnist.py`. - - ```bash - export TENSORBOARD_PORT=PAI_PORT_LIST_${PAI_CURRENT_TASK_ROLE_NAME}_0_tensorboard - tensorboard --logdir . --port ${!TENSORBOARD_PORT} & - python3 mnist.py - ``` - The config file of portList should be filled as following: + * Required key. Set the virtualCluster of OpenPAI. If omitted, the job will run on default virtual cluster. +* nniManagerNFSMountPath + * Required key. Set the mount path in your nniManager machine. +* containerNFSMountPath + * Required key. Set the mount path in your container used in PAI. +* paiStoragePlugin + * Required key. Set the sotrage plugin name used in PAI. - ```yaml - trial: - command: bash launch_pai.sh - portList: - - label: tensorboard - beginAt: 0 - portNumber: 1 - ``` - -NNI support two kind of authorization method in PAI, including password and PAI token, [refer](https://github.com/microsoft/pai/blob/b6bd2ab1c8890f91b7ac5859743274d2aa923c22/docs/rest-server/API.md#2-authentication). The authorization is configured in `paiConfig` field. -For password authorization, the `paiConfig` schema is: -``` -paiConfig: - userName: your_pai_nni_user - passWord: your_pai_password - host: 10.1.1.1 -``` -For pai token authorization, the `paiConfig` schema is: -``` -paiConfig: - userName: your_pai_nni_user - token: your_pai_token - host: 10.1.1.1 -``` Once complete to fill NNI experiment config file and save (for example, save as exp_pai.yml), then run the following command ``` @@ -121,9 +85,7 @@ And you will be redirected to HDFS web portal to browse the output files of that You can see there're three fils in output folder: stderr, stdout, and trial.log ## data management -If your training data is not too large, it could be put into codeDir, and nni will upload the data to hdfs, or you could build your own docker image with the data. If you have large dataset, it's not appropriate to put the data in codeDir, and you could follow the [guidance](https://github.com/microsoft/pai/blob/master/docs/user/storage.md) to mount the data folder in container. - -If you also want to save trial's other output into HDFS, like model files, you can use environment variable `NNI_OUTPUT_DIR` in your trial code to save your own output files, and NNI SDK will copy all the files in `NNI_OUTPUT_DIR` from trial's container to HDFS, the target path is `hdfs://host:port/{username}/nni/{experiments}/{experimentId}/trials/{trialId}/nnioutput` +Befour using NNI to start your experiment, users should set the corresponding mount data path in your nniManager machine. PAI has their own storage(NFS, AzureBlob ...), and the storage will used in PAI will be mounted to the container when it start a job. Users should set the PAI storage type by `paiStoragePlugin` field to choose a storage in PAI. Then users should mount the storage to their nniManager machine, and set the `nniManagerNFSMountPath` field in configuration file, NNI will generate bash files and copy data in `codeDir` to the `nniManagerNFSMountPath` folder, then NNI will start a trial job. The data in `nniManagerNFSMountPath` will be sync to PAI storage, and will be mounted to PAI's container. The data path in container is set in `containerNFSMountPath`, NNI will enter this folder first, and then run scripts to start a trial job. ## version check NNI support version check feature in since version 0.6. It is a policy to insure the version of NNIManager is consistent with trialKeeper, and avoid errors caused by version incompatibility. diff --git a/docs/en_US/TrainingService/PaiYarnMode.md b/docs/en_US/TrainingService/PaiYarnMode.md new file mode 100644 index 0000000000..64b96f14a6 --- /dev/null +++ b/docs/en_US/TrainingService/PaiYarnMode.md @@ -0,0 +1,136 @@ +**Run an Experiment on OpenpaiYarn** +=== +The original `pai` mode is modificated to `paiYarnYarn` mode, which is a distributed training platform based on Yarn. + +## Setup environment +Install NNI, follow the install guide [here](../Tutorial/QuickStart.md). + +## Run an experiment +Use `examples/trials/mnist-annotation` as an example. The NNI config YAML file's content is like: + +```yaml +authorName: your_name +experimentName: auto_mnist +# how many trials could be concurrently running +trialConcurrency: 2 +# maximum experiment running duration +maxExecDuration: 3h +# empty means never stop +maxTrialNum: 100 +# choice: local, remote, pai, paiYarn +trainingServicePlatform: paiYarn +# search space file +searchSpacePath: search_space.json +# choice: true, false +useAnnotation: true +tuner: + builtinTunerName: TPE + classArgs: + optimize_mode: maximize +trial: + command: python3 mnist.py + codeDir: ~/nni/examples/trials/mnist-annotation + gpuNum: 0 + cpuNum: 1 + memoryMB: 8196 + image: msranni/nni:latest +# Configuration to access OpenpaiYarn Cluster +paiYarnConfig: + userName: your_paiYarn_nni_user + passWord: your_paiYarn_password + host: 10.1.1.1 +``` + +Note: You should set `trainingServicePlatform: paiYarn` in NNI config YAML file if you want to start experiment in paiYarn mode. + +Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMode.md), trial configuration in paiYarn mode have these additional keys: +* cpuNum + * Required key. Should be positive number based on your trial program's CPU requirement +* memoryMB + * Required key. Should be positive number based on your trial program's memory requirement +* image + * Required key. In paiYarn mode, your trial program will be scheduled by OpenpaiYarn to run in [Docker container](https://www.docker.com/). This key is used to specify the Docker image used to create the container in which your trial will run. + * We already build a docker image [nnimsra/nni](https://hub.docker.com/r/msranni/nni/) on [Docker Hub](https://hub.docker.com/). It contains NNI python packages, Node modules and javascript artifact files required to start experiment, and all of NNI dependencies. The docker file used to build this image can be found at [here](https://github.com/Microsoft/nni/tree/master/deployment/docker/Dockerfile). You can either use this image directly in your config file, or build your own image based on it. +* virtualCluster + * Optional key. Set the virtualCluster of OpenpaiYarn. If omitted, the job will run on default virtual cluster. +* shmMB + * Optional key. Set the shmMB configuration of OpenpaiYarn, it set the shared memory for one task in the task role. +* authFile + * Optional key, Set the auth file path for private registry while using paiYarn mode, [Refer](https://github.com/microsoft/paiYarn/blob/2ea69b45faa018662bc164ed7733f6fdbb4c42b3/docs/faq.md#q-how-to-use-private-docker-registry-job-image-when-submitting-an-openpaiYarn-job), you can prepare the authFile and simply provide the local path of this file, NNI will upload this file to HDFS for you. +* portList + * Optional key. Set the portList configuration of OpenpaiYarn, it specifies a list of port used in container, [Refer](https://github.com/microsoft/paiYarn/blob/b2324866d0280a2d22958717ea6025740f71b9f0/docs/job_tutorial.md#specification). + The config schema in NNI is shown below: + ``` + portList: + - label: test + beginAt: 8080 + portNumber: 2 + ``` + Let's say you want to launch a tensorboard in the mnist example using the port. So the first step is to write a wrapper script `launch_paiYarn.sh` of `mnist.py`. + + ```bash + export TENSORBOARD_PORT=paiYarn_PORT_LIST_${paiYarn_CURRENT_TASK_ROLE_NAME}_0_tensorboard + tensorboard --logdir . --port ${!TENSORBOARD_PORT} & + python3 mnist.py + ``` + The config file of portList should be filled as following: + + ```yaml + trial: + command: bash launch_paiYarn.sh + portList: + - label: tensorboard + beginAt: 0 + portNumber: 1 + ``` + +NNI support two kind of authorization method in paiYarn, including password and paiYarn token, [refer](https://github.com/microsoft/paiYarn/blob/b6bd2ab1c8890f91b7ac5859743274d2aa923c22/docs/rest-server/API.md#2-authentication). The authorization is configured in `paiYarnConfig` field. +For password authorization, the `paiYarnConfig` schema is: +``` +paiYarnConfig: + userName: your_paiYarn_nni_user + passWord: your_paiYarn_password + host: 10.1.1.1 +``` +For paiYarn token authorization, the `paiYarnConfig` schema is: +``` +paiYarnConfig: + userName: your_paiYarn_nni_user + token: your_paiYarn_token + host: 10.1.1.1 +``` + +Once complete to fill NNI experiment config file and save (for example, save as exp_paiYarn.yml), then run the following command +``` +nnictl create --config exp_paiYarn.yml +``` +to start the experiment in paiYarn mode. NNI will create OpenpaiYarn job for each trial, and the job name format is something like `nni_exp_{experiment_id}_trial_{trial_id}`. +You can see jobs created by NNI in the OpenpaiYarn cluster's web portal, like: +![](../../img/nni_paiYarn_joblist.jpg) + +Notice: In paiYarn mode, NNIManager will start a rest server and listen on a port which is your NNI WebUI's port plus 1. For example, if your WebUI port is `8080`, the rest server will listen on `8081`, to receive metrics from trial job running in Kubernetes. So you should `enable 8081` TCP port in your firewall rule to allow incoming traffic. + +Once a trial job is completed, you can goto NNI WebUI's overview page (like http://localhost:8080/oview) to check trial's information. + +Expand a trial information in trial list view, click the logPath link like: +![](../../img/nni_webui_joblist.jpg) + +And you will be redirected to HDFS web portal to browse the output files of that trial in HDFS: +![](../../img/nni_trial_hdfs_output.jpg) + +You can see there're three fils in output folder: stderr, stdout, and trial.log + +## data management +If your training data is not too large, it could be put into codeDir, and nni will upload the data to hdfs, or you could build your own docker image with the data. If you have large dataset, it's not appropriate to put the data in codeDir, and you could follow the [guidance](https://github.com/microsoft/paiYarn/blob/master/docs/user/storage.md) to mount the data folder in container. + +If you also want to save trial's other output into HDFS, like model files, you can use environment variable `NNI_OUTPUT_DIR` in your trial code to save your own output files, and NNI SDK will copy all the files in `NNI_OUTPUT_DIR` from trial's container to HDFS, the target path is `hdfs://host:port/{username}/nni/{experiments}/{experimentId}/trials/{trialId}/nnioutput` + +## version check +NNI support version check feature in since version 0.6. It is a policy to insure the version of NNIManager is consistent with trialKeeper, and avoid errors caused by version incompatibility. +Check policy: +1. NNIManager before v0.6 could run any version of trialKeeper, trialKeeper support backward compatibility. +2. Since version 0.6, NNIManager version should keep same with triakKeeper version. For example, if NNIManager version is 0.6, trialKeeper version should be 0.6 too. +3. Note that the version check feature only check first two digits of version.For example, NNIManager v0.6.1 could use trialKeeper v0.6 or trialKeeper v0.6.2, but could not use trialKeeper v0.5.1 or trialKeeper v0.7. + +If you could not run your experiment and want to know if it is caused by version check, you could check your webUI, and there will be an error message about version check. +![](../../img/version_check.png) From d0192827b0dfa762d48a63917753066d93f4cdcd Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Sun, 15 Dec 2019 23:06:29 +0800 Subject: [PATCH 09/18] fix virtualCluster --- docs/en_US/TrainingService/PaiMode.md | 2 +- .../training_service/pai_base/pai/paiConfig.ts | 4 ++-- .../pai_base/pai/paiTrainingService.ts | 11 +++++++---- tools/nni_cmd/config_schema.py | 2 +- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/docs/en_US/TrainingService/PaiMode.md b/docs/en_US/TrainingService/PaiMode.md index 0f8e9b58b2..fe3e779f01 100644 --- a/docs/en_US/TrainingService/PaiMode.md +++ b/docs/en_US/TrainingService/PaiMode.md @@ -55,7 +55,7 @@ Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMod * Required key. In pai mode, your trial program will be scheduled by OpenPAI to run in [Docker container](https://www.docker.com/). This key is used to specify the Docker image used to create the container in which your trial will run. * We already build a docker image [nnimsra/nni](https://hub.docker.com/r/msranni/nni/) on [Docker Hub](https://hub.docker.com/). It contains NNI python packages, Node modules and javascript artifact files required to start experiment, and all of NNI dependencies. The docker file used to build this image can be found at [here](https://github.com/Microsoft/nni/tree/master/deployment/docker/Dockerfile). You can either use this image directly in your config file, or build your own image based on it. * virtualCluster - * Required key. Set the virtualCluster of OpenPAI. If omitted, the job will run on default virtual cluster. + * Optional key. Set the virtualCluster of OpenPAI. If omitted, the job will run on default virtual cluster. * nniManagerNFSMountPath * Required key. Set the mount path in your nniManager machine. * containerNFSMountPath diff --git a/src/nni_manager/training_service/pai_base/pai/paiConfig.ts b/src/nni_manager/training_service/pai_base/pai/paiConfig.ts index de043e301f..3463867078 100644 --- a/src/nni_manager/training_service/pai_base/pai/paiConfig.ts +++ b/src/nni_manager/training_service/pai_base/pai/paiConfig.ts @@ -33,8 +33,8 @@ export class NNIPAITrialConfig extends TrialConfig { public readonly paiStoragePlugin: string; constructor(command: string, codeDir: string, gpuNum: number, cpuNum: number, memoryMB: number, - image: string, virtualCluster: string, nniManagerNFSMountPath: string, containerNFSMountPath: string, - paiStoragePlugin: string) { + image: string, nniManagerNFSMountPath: string, containerNFSMountPath: string, + paiStoragePlugin: string, virtualCluster?: string) { super(command, codeDir, gpuNum); this.cpuNum = cpuNum; this.memoryMB = memoryMB; diff --git a/src/nni_manager/training_service/pai_base/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai_base/pai/paiTrainingService.ts index ec0549ab57..f26b01e478 100644 --- a/src/nni_manager/training_service/pai_base/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai_base/pai/paiTrainingService.ts @@ -180,7 +180,7 @@ class PAITrainingService extends PAIBaseTrainingService { throw new Error('trial config is not initialized'); } const jobName = `nni_exp_${this.experimentId}_trial_${trialJobId}` - const paiJobConfig = { + const paiJobConfig: any = { protocolVersion: 2, name: jobName, type: 'job', @@ -211,9 +211,6 @@ class PAITrainingService extends PAIBaseTrainingService { ] } }, - defaults: { - virtualCluster: this.paiTrialConfig.virtualCluster - }, extras: { 'com.microsoft.pai.runtimeplugin': [ { @@ -223,6 +220,12 @@ class PAITrainingService extends PAIBaseTrainingService { submitFrom: 'submit-job-v2' } } + if (this.paiTrialConfig.virtualCluster) { + paiJobConfig.defaults= { + virtualCluster: this.paiTrialConfig.virtualCluster + } + } + return yaml.safeDump(paiJobConfig); } diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index d6dcdb64ae..649bae307c 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -268,7 +268,7 @@ def setPathCheck(key): 'cpuNum': setNumberRange('cpuNum', int, 0, 99999), 'memoryMB': setType('memoryMB', int), 'image': setType('image', str), - 'virtualCluster': setType('virtualCluster', str), + Optional('virtualCluster'): setType('virtualCluster', str), 'nniManagerNFSMountPath': setPathCheck('nniManagerNFSMountPath'), 'containerNFSMountPath': setType('containerNFSMountPath', str), 'paiStoragePlugin': setType('paiStoragePlugin', str) From 3c77067854fd0a3be5d4844298e4892a6c83a7b3 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Mon, 16 Dec 2019 09:49:54 +0800 Subject: [PATCH 10/18] fix eslint --- .../training_service/pai_base/paiBaseJobRestServer.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nni_manager/training_service/pai_base/paiBaseJobRestServer.ts b/src/nni_manager/training_service/pai_base/paiBaseJobRestServer.ts index a6fe305cdc..fe7b9f0ad6 100644 --- a/src/nni_manager/training_service/pai_base/paiBaseJobRestServer.ts +++ b/src/nni_manager/training_service/pai_base/paiBaseJobRestServer.ts @@ -29,7 +29,7 @@ export class PAIBaseJobRestServer extends ClusterJobRestServer { /** * constructor to provide NNIRestServer's own rest property, e.g. port */ - constructor(paiBaseTrainingService :PAIBaseTrainingService) { + constructor (paiBaseTrainingService: PAIBaseTrainingService) { super(); this.paiBaseTrainingService = paiBaseTrainingService; } From 8fab408341764ee436c816b5a31f4e29ab743260 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Thu, 19 Dec 2019 23:35:03 +0800 Subject: [PATCH 11/18] fix comments --- docs/en_US/TrainingService/PaiMode.md | 2 +- docs/en_US/TrainingService/PaiYarnMode.md | 2 +- .../pai_base/pai/paiJobRestServer.ts | 2 - .../pai_base/pai/paiTrainingService.ts | 41 +++--- .../pai_base/paiBaseTrainingService.ts | 33 ++--- .../pai_base/paiYarn/paiYarnJobRestServer.ts | 2 - .../paiYarn/paiYarnTrainingService.ts | 133 ++---------------- 7 files changed, 47 insertions(+), 168 deletions(-) diff --git a/docs/en_US/TrainingService/PaiMode.md b/docs/en_US/TrainingService/PaiMode.md index fe3e779f01..3174e1079b 100644 --- a/docs/en_US/TrainingService/PaiMode.md +++ b/docs/en_US/TrainingService/PaiMode.md @@ -61,7 +61,7 @@ Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMod * containerNFSMountPath * Required key. Set the mount path in your container used in PAI. * paiStoragePlugin - * Required key. Set the sotrage plugin name used in PAI. + * Required key. Set the storage plugin name used in PAI. Once complete to fill NNI experiment config file and save (for example, save as exp_pai.yml), then run the following command diff --git a/docs/en_US/TrainingService/PaiYarnMode.md b/docs/en_US/TrainingService/PaiYarnMode.md index 64b96f14a6..078ac1dc02 100644 --- a/docs/en_US/TrainingService/PaiYarnMode.md +++ b/docs/en_US/TrainingService/PaiYarnMode.md @@ -1,6 +1,6 @@ **Run an Experiment on OpenpaiYarn** === -The original `pai` mode is modificated to `paiYarnYarn` mode, which is a distributed training platform based on Yarn. +The original `pai` mode is modificated to `paiYarn` mode, which is a distributed training platform based on Yarn. ## Setup environment Install NNI, follow the install guide [here](../Tutorial/QuickStart.md). diff --git a/src/nni_manager/training_service/pai_base/pai/paiJobRestServer.ts b/src/nni_manager/training_service/pai_base/pai/paiJobRestServer.ts index bc5653e540..a89bdacc88 100644 --- a/src/nni_manager/training_service/pai_base/pai/paiJobRestServer.ts +++ b/src/nni_manager/training_service/pai_base/pai/paiJobRestServer.ts @@ -22,8 +22,6 @@ export interface ParameterFileMeta { */ @component.Singleton export class PAIJobRestServer extends PAIBaseJobRestServer { - protected parameterFileMetaList: ParameterFileMeta[] = []; - /** * constructor to provide NNIRestServer's own rest property, e.g. port */ diff --git a/src/nni_manager/training_service/pai_base/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai_base/pai/paiTrainingService.ts index f26b01e478..badd58b870 100644 --- a/src/nni_manager/training_service/pai_base/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai_base/pai/paiTrainingService.ts @@ -58,35 +58,20 @@ class PAITrainingService extends PAIBaseTrainingService { } public async run(): Promise { - this.log.info('Run PAI training service.'); - const restServer: PAIJobRestServer = component.get(PAIJobRestServer); - await restServer.start(); - restServer.setEnableVersionCheck = this.versionCheck; - this.log.info(`PAI Training service rest server listening on: ${restServer.endPoint}`); + this.log.info('Run PAIYarn training service.'); + this.paiBaseJobRestServer = component.get(PAIJobRestServer); + if (this.paiBaseJobRestServer === undefined) { + throw new Error('paiBaseJobRestServer not initialized!'); + } + await this.paiBaseJobRestServer.start(); + this.paiBaseJobRestServer.setEnableVersionCheck = this.versionCheck; + this.log.info(`PAI Training service rest server listening on: ${this.paiBaseJobRestServer.endPoint}`); await Promise.all([ this.statusCheckingLoop(), this.submitJobLoop()]); this.log.info('PAI training service exit.'); } - public async cleanUp(): Promise { - this.log.info('Stopping PAI training service...'); - this.stopping = true; - - const deferred: Deferred = new Deferred(); - const restServer: PAIJobRestServer = component.get(PAIJobRestServer); - try { - await restServer.stop(); - deferred.resolve(); - this.log.info('PAI Training service rest server stopped successfully.'); - } catch (error) { - this.log.error(`PAI Training service rest server stopped failed, error: ${error.message}`); - deferred.reject(error); - } - - return deferred.promise; - } - public async setClusterMetadata(key: string, value: string): Promise { const deferred: Deferred = new Deferred(); @@ -145,6 +130,15 @@ class PAITrainingService extends PAIBaseTrainingService { return deferred.promise; } + + //TODO: update trial parameters + public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise { + const trialJobDetail: undefined | TrialJobDetail = this.trialJobsMap.get(trialJobId); + if (trialJobDetail === undefined) { + throw new Error(`updateTrialJob failed: ${trialJobId} not found`); + } + return trialJobDetail; + } public async submitTrialJob(form: TrialJobApplicationForm): Promise { if (this.paiBaseClusterConfig === undefined) { @@ -294,7 +288,6 @@ class PAITrainingService extends PAIBaseTrainingService { this.log.info(`nniPAItrial command is ${nniPaiTrialCommand.trim()}`); const paiJobConfig = this.generateJobConfigInYamlFormat(trialJobId, nniPaiTrialCommand); - console.log(paiJobConfig); // Step 3. Submit PAI job via Rest call // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API diff --git a/src/nni_manager/training_service/pai_base/paiBaseTrainingService.ts b/src/nni_manager/training_service/pai_base/paiBaseTrainingService.ts index 4a9216783f..d05e1896d3 100644 --- a/src/nni_manager/training_service/pai_base/paiBaseTrainingService.ts +++ b/src/nni_manager/training_service/pai_base/paiBaseTrainingService.ts @@ -25,13 +25,14 @@ import { execMkdir, validateCodeDir } from '../common/util'; import { PAIBaseJobInfoCollector } from './paiBaseJobInfoCollector'; import { PAIBaseJobRestServer, ParameterFileMeta } from './paiBaseJobRestServer'; import { PAIBaseClusterConfig, PAIBaseTrialJobDetail } from './paiBaseConfig'; +import { PAIJobRestServer } from './pai/paiJobRestServer'; /** * Training Service implementation for OpenPAI (Open Platform for AI) * Refer https://github.com/Microsoft/pai for more info about OpenPAI */ @component.Singleton -class PAIBaseTrainingService implements TrainingService { +abstract class PAIBaseTrainingService implements TrainingService { protected readonly log!: Logger; protected readonly metricsEmitter: EventEmitter; protected readonly trialJobsMap: Map; @@ -51,6 +52,7 @@ class PAIBaseTrainingService implements TrainingService { protected isMultiPhase: boolean = false; protected authFileHdfsPath: string | undefined = undefined; protected portList?: string | undefined; + protected paiBaseJobRestServer?: PAIJobRestServer; constructor() { this.log = getLogger(); @@ -66,24 +68,19 @@ class PAIBaseTrainingService implements TrainingService { } public async run(): Promise { - return; + throw new Error('Not implemented!'); } public async submitTrialJob(form: TrialJobApplicationForm): Promise { - return null; + throw new Error('Not implemented!'); } public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise { - const trialJobDetail: undefined | TrialJobDetail = this.trialJobsMap.get(trialJobId); - if (trialJobDetail === undefined) { - throw new Error(`updateTrialJob failed: ${trialJobId} not found`); - } - - return trialJobDetail; + throw new Error('Not implemented!'); } protected async submitTrialJobToPAI(trialJobId: string): Promise { - return true; + throw new Error('Not implemented!'); } protected async submitJobLoop(): Promise { @@ -103,7 +100,7 @@ class PAIBaseTrainingService implements TrainingService { } public async setClusterMetadata(key: string, value: string): Promise { - return; + throw new Error('Not implemented!'); } public async listTrialJobs(): Promise { @@ -197,11 +194,13 @@ class PAIBaseTrainingService implements TrainingService { public async cleanUp(): Promise { this.log.info('Stopping PAI training service...'); this.stopping = true; + if (this.paiBaseJobRestServer === undefined) { + throw new Error('paiBaseJobRestServer not initialized!'); + } const deferred: Deferred = new Deferred(); - const restServer: PAIBaseJobRestServer = component.get(PAIBaseJobRestServer); try { - await restServer.stop(); + await this.paiBaseJobRestServer.stop(); deferred.resolve(); this.log.info('PAI Training service rest server stopped successfully.'); } catch (error) { @@ -230,9 +229,11 @@ class PAIBaseTrainingService implements TrainingService { } } await this.paiJobCollector.retrieveTrialStatus(this.paiToken, this.paiBaseClusterConfig); - const restServer: PAIBaseJobRestServer = component.get(PAIBaseJobRestServer); - if (restServer.getErrorMessage !== undefined) { - throw new Error(restServer.getErrorMessage); + if (this.paiBaseJobRestServer === undefined) { + throw new Error('paiBaseJobRestServer not implemented!'); + } + if (this.paiBaseJobRestServer.getErrorMessage !== undefined) { + throw new Error(this.paiBaseJobRestServer.getErrorMessage); } await delay(3000); } diff --git a/src/nni_manager/training_service/pai_base/paiYarn/paiYarnJobRestServer.ts b/src/nni_manager/training_service/pai_base/paiYarn/paiYarnJobRestServer.ts index 15876664c0..72e9965c5e 100644 --- a/src/nni_manager/training_service/pai_base/paiYarn/paiYarnJobRestServer.ts +++ b/src/nni_manager/training_service/pai_base/paiYarn/paiYarnJobRestServer.ts @@ -22,8 +22,6 @@ export interface ParameterFileMeta { */ @component.Singleton export class PAIYarnJobRestServer extends PAIBaseJobRestServer { - protected parameterFileMetaList: ParameterFileMeta[] = []; - /** * constructor to provide NNIRestServer's own rest property, e.g. port */ diff --git a/src/nni_manager/training_service/pai_base/paiYarn/paiYarnTrainingService.ts b/src/nni_manager/training_service/pai_base/paiYarn/paiYarnTrainingService.ts index 949c09cf99..2d9c36b28b 100644 --- a/src/nni_manager/training_service/pai_base/paiYarn/paiYarnTrainingService.ts +++ b/src/nni_manager/training_service/pai_base/paiYarn/paiYarnTrainingService.ts @@ -49,24 +49,19 @@ class PAIYarnTrainingService extends PAIBaseTrainingService { public async run(): Promise { this.log.info('Run PAIYarn training service.'); - const restServer: PAIYarnJobRestServer = component.get(PAIYarnJobRestServer); - await restServer.start(); - restServer.setEnableVersionCheck = this.versionCheck; - this.log.info(`PAI Training service rest server listening on: ${restServer.endPoint}`); + this.paiBaseJobRestServer = component.get(PAIYarnJobRestServer); + if (this.paiBaseJobRestServer === undefined) { + throw new Error('paiBaseJobRestServer not initialized!'); + } + await this.paiBaseJobRestServer.start(); + this.paiBaseJobRestServer.setEnableVersionCheck = this.versionCheck; + this.log.info(`PAI Training service rest server listening on: ${this.paiBaseJobRestServer.endPoint}`); await Promise.all([ this.statusCheckingLoop(), this.submitJobLoop()]); this.log.info('PAI training service exit.'); } - public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { - this.metricsEmitter.on('metric', listener); - } - - public removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { - this.metricsEmitter.off('metric', listener); - } - public async submitTrialJob(form: TrialJobApplicationForm): Promise { if (this.paiBaseClusterConfig === undefined) { throw new Error(`paiBaseClusterConfig not initialized!`); @@ -184,36 +179,6 @@ class PAIYarnTrainingService extends PAIBaseTrainingService { return deferred.promise; } - public getClusterMetadata(key: string): Promise { - const deferred: Deferred = new Deferred(); - - deferred.resolve(); - - return deferred.promise; - } - - public async cleanUp(): Promise { - this.log.info('Stopping PAI training service...'); - this.stopping = true; - - const deferred: Deferred = new Deferred(); - const restServer: PAIYarnJobRestServer = component.get(PAIYarnJobRestServer); - try { - await restServer.stop(); - deferred.resolve(); - this.log.info('PAI Training service rest server stopped successfully.'); - } catch (error) { - this.log.error(`PAI Training service rest server stopped failed, error: ${error.message}`); - deferred.reject(error); - } - - return deferred.promise; - } - - public get MetricsEmitter(): EventEmitter { - return this.metricsEmitter; - } - protected async submitTrialJobToPAI(trialJobId: string): Promise { const deferred: Deferred = new Deferred(); const trialJobDetail: PAIBaseTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); @@ -362,84 +327,6 @@ class PAIYarnTrainingService extends PAIBaseTrainingService { return deferred.promise; } - protected async statusCheckingLoop(): Promise { - while (!this.stopping) { - if(this.paiBaseClusterConfig && this.paiBaseClusterConfig.passWord) { - try { - await this.updatePaiToken(); - } catch (error) { - this.log.error(`${error}`); - //only throw error when initlize paiToken first time - if (this.paiToken === undefined) { - throw new Error(error); - } - } - } - await this.paiJobCollector.retrieveTrialStatus(this.paiToken, this.paiBaseClusterConfig); - const restServer: PAIYarnJobRestServer = component.get(PAIYarnJobRestServer); - if (restServer.getErrorMessage !== undefined) { - throw new Error(restServer.getErrorMessage); - } - await delay(3000); - } - } - - /** - * Update pai token by the interval time or initialize the pai token - */ - protected async updatePaiToken(): Promise { - const deferred: Deferred = new Deferred(); - - const currentTime: number = new Date().getTime(); - //If pai token initialized and not reach the interval time, do not update - if (this.paiTokenUpdateTime !== undefined && (currentTime - this.paiTokenUpdateTime) < this.paiTokenUpdateInterval) { - return Promise.resolve(); - } - - if (this.paiBaseClusterConfig === undefined) { - const paiClusterConfigError: string = `pai cluster config not initialized!`; - this.log.error(`${paiClusterConfigError}`); - throw Error(`${paiClusterConfigError}`); - } - - const authenticationReq: request.Options = { - uri: `http://${this.paiBaseClusterConfig.host}/rest-server/api/v1/token`, - method: 'POST', - json: true, - body: { - username: this.paiBaseClusterConfig.userName, - password: this.paiBaseClusterConfig.passWord - } - }; - - request(authenticationReq, (error: Error, response: request.Response, body: any) => { - if (error !== undefined && error !== null) { - this.log.error(`Get PAI token failed: ${error.message}`); - deferred.reject(new Error(`Get PAI token failed: ${error.message}`)); - } else { - if (response.statusCode !== 200) { - this.log.error(`Get PAI token failed: get PAI Rest return code ${response.statusCode}`); - deferred.reject(new Error(`Get PAI token failed: ${response.body}, please check paiConfig username or password`)); - } - this.paiToken = body.token; - this.paiTokenUpdateTime = new Date().getTime(); - deferred.resolve(); - } - }); - - let timeoutId: NodeJS.Timer; - const timeoutDelay: Promise = new Promise((resolve: Function, reject: Function): void => { - // Set timeout and reject the promise once reach timeout (5 seconds) - timeoutId = setTimeout( - () => reject(new Error('Get PAI token timeout. Please check your PAI cluster.')), - 5000); - }); - - return Promise.race([timeoutDelay, deferred.promise]) - .finally(() => { clearTimeout(timeoutId); }); - } - - public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise { const trialJobDetail: undefined | TrialJobDetail = this.trialJobsMap.get(trialJobId); if (trialJobDetail === undefined) { @@ -476,9 +363,11 @@ class PAIYarnTrainingService extends PAIBaseTrainingService { protected postParameterFileMeta(parameterFileMeta: ParameterFileMeta): Promise { const deferred: Deferred = new Deferred(); - const restServer: PAIYarnJobRestServer = component.get(PAIYarnJobRestServer); + if (this.paiBaseJobRestServer === undefined) { + throw new Error('paiBaseJobRestServer not implemented!'); + } const req: request.Options = { - uri: `${restServer.endPoint}${restServer.apiRootUrl}/parameter-file-meta`, + uri: `${this.paiBaseJobRestServer.endPoint}${this.paiBaseJobRestServer.apiRootUrl}/parameter-file-meta`, method: 'POST', json: true, body: parameterFileMeta From 9a82fcc6796c5de1c21dca9188ac98492068ac6f Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Fri, 20 Dec 2019 11:06:18 +0800 Subject: [PATCH 12/18] fix pipeline --- .../training_service/pai_base/pai/paiTrainingService.ts | 3 ++- test/generate_ts_config.py | 6 +++--- test/training_service.yml | 4 ++-- tools/nni_trial_tool/trial_keeper.py | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/nni_manager/training_service/pai_base/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai_base/pai/paiTrainingService.ts index badd58b870..a83a7535ae 100644 --- a/src/nni_manager/training_service/pai_base/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai_base/pai/paiTrainingService.ts @@ -303,7 +303,8 @@ class PAITrainingService extends PAIBaseTrainingService { request(submitJobRequest, (error: Error, response: request.Response, body: any) => { if ((error !== undefined && error !== null) || response.statusCode >= 400) { const errorMessage: string = (error !== undefined && error !== null) ? error.message : - `Submit trial ${trialJobId} failed, http code:${response.statusCode}, http body: ${response.body.message}`; + `Submit trial ${trialJobId} failed, http code:${response.statusCode}, http body: ${body}`; + this.log.error(errorMessage); trialJobDetail.status = 'FAILED'; } else { diff --git a/test/generate_ts_config.py b/test/generate_ts_config.py index f4a61019b3..53de5d8d0d 100644 --- a/test/generate_ts_config.py +++ b/test/generate_ts_config.py @@ -14,11 +14,11 @@ def update_training_service_config(args): config[args.ts]['nniManagerIp'] = args.nni_manager_ip if args.ts == 'pai': if args.pai_user is not None: - config[args.ts]['paiConfig']['userName'] = args.pai_user + config[args.ts]['paiYarnConfig']['userName'] = args.pai_user if args.pai_pwd is not None: - config[args.ts]['paiConfig']['passWord'] = args.pai_pwd + config[args.ts]['paiYarnConfig']['passWord'] = args.pai_pwd if args.pai_host is not None: - config[args.ts]['paiConfig']['host'] = args.pai_host + config[args.ts]['paiYarnConfig']['host'] = args.pai_host if args.nni_docker_image is not None: config[args.ts]['trial']['image'] = args.nni_docker_image if args.data_dir is not None: diff --git a/test/training_service.yml b/test/training_service.yml index a68954499a..9fe8a85a0b 100644 --- a/test/training_service.yml +++ b/test/training_service.yml @@ -29,11 +29,11 @@ local: pai: nniManagerIp: maxExecDuration: 15m - paiConfig: + paiYarnConfig: host: passWord: userName: - trainingServicePlatform: pai + trainingServicePlatform: paiYarn trial: gpuNum: 1 cpuNum: 1 diff --git a/tools/nni_trial_tool/trial_keeper.py b/tools/nni_trial_tool/trial_keeper.py index c83bfb9ffd..6da3a7bfc2 100644 --- a/tools/nni_trial_tool/trial_keeper.py +++ b/tools/nni_trial_tool/trial_keeper.py @@ -223,7 +223,7 @@ def run(self): exit(1) check_version(args) try: - if NNI_PLATFORM == 'pai' and is_multi_phase(): + if NNI_PLATFORM == 'paiYarn' and is_multi_phase(): fetch_parameter_file(args) main_loop(args) except SystemExit as se: From 217f454e8499b3bc8d286d1e557dff6f93b99767 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Fri, 20 Dec 2019 14:05:58 +0800 Subject: [PATCH 13/18] refactor folder structure --- src/nni_manager/main.ts | 6 +- .../paiBaseConfig.ts => pai/paiConfig.ts} | 4 +- .../paiJobInfoCollector.ts} | 12 +-- .../paiJobRestServer.ts} | 12 +-- .../paiK8S/paiK8SConfig.ts} | 2 +- .../paiData.ts => pai/paiK8S/paiK8SData.ts} | 2 +- .../paiK8S/paiK8SJobRestServer.ts} | 8 +- .../paiK8S/paiK8STrainingService.ts} | 65 +++++++--------- .../paiTrainingService.ts} | 67 +++++++++-------- .../paiYarn/hdfsClientUtility.ts | 0 .../paiYarn/paiYarnConfig.ts | 0 .../{pai_base => pai}/paiYarn/paiYarnData.ts | 0 .../paiYarn/paiYarnJobRestServer.ts | 4 +- .../paiYarn/paiYarnTrainingService.ts | 74 ++++++++----------- .../paiYarn/paiYarnTrialConfig.ts | 0 .../test/hdfsClientUtility.test.ts | 2 +- .../test/paiYarnTrainingService.test.ts | 2 +- 17 files changed, 121 insertions(+), 139 deletions(-) rename src/nni_manager/training_service/{pai_base/paiBaseConfig.ts => pai/paiConfig.ts} (94%) rename src/nni_manager/training_service/{pai_base/paiBaseJobInfoCollector.ts => pai/paiJobInfoCollector.ts} (92%) rename src/nni_manager/training_service/{pai_base/paiBaseJobRestServer.ts => pai/paiJobRestServer.ts} (84%) rename src/nni_manager/training_service/{pai_base/pai/paiConfig.ts => pai/paiK8S/paiK8SConfig.ts} (97%) rename src/nni_manager/training_service/{pai_base/pai/paiData.ts => pai/paiK8S/paiK8SData.ts} (97%) rename src/nni_manager/training_service/{pai_base/pai/paiJobRestServer.ts => pai/paiK8S/paiK8SJobRestServer.ts} (74%) rename src/nni_manager/training_service/{pai_base/pai/paiTrainingService.ts => pai/paiK8S/paiK8STrainingService.ts} (82%) rename src/nni_manager/training_service/{pai_base/paiBaseTrainingService.ts => pai/paiTrainingService.ts} (81%) rename src/nni_manager/training_service/{pai_base => pai}/paiYarn/hdfsClientUtility.ts (100%) rename src/nni_manager/training_service/{pai_base => pai}/paiYarn/paiYarnConfig.ts (100%) rename src/nni_manager/training_service/{pai_base => pai}/paiYarn/paiYarnData.ts (100%) rename src/nni_manager/training_service/{pai_base => pai}/paiYarn/paiYarnJobRestServer.ts (86%) rename src/nni_manager/training_service/{pai_base => pai}/paiYarn/paiYarnTrainingService.ts (84%) rename src/nni_manager/training_service/{pai_base => pai}/paiYarn/paiYarnTrialConfig.ts (100%) diff --git a/src/nni_manager/main.ts b/src/nni_manager/main.ts index ff34476580..f707304382 100644 --- a/src/nni_manager/main.ts +++ b/src/nni_manager/main.ts @@ -20,8 +20,8 @@ import { NNIRestServer } from './rest_server/nniRestServer'; import { FrameworkControllerTrainingService } from './training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService'; import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService'; import { LocalTrainingService } from './training_service/local/localTrainingService'; -import { PAITrainingService } from './training_service/pai_base/pai/paiTrainingService'; -import { PAIYarnTrainingService } from './training_service/pai_base/paiYarn/paiYarnTrainingService'; +import { PAIK8STrainingService } from './training_service/pai/paiK8S/paiK8STrainingService'; +import { PAIYarnTrainingService } from './training_service/pai/paiYarn/paiYarnTrainingService'; import { RemoteMachineTrainingService } from './training_service/remote_machine/remoteMachineTrainingService'; @@ -45,7 +45,7 @@ async function initContainer(platformMode: string, logFileName?: string): Promis .scope(Scope.Singleton); } else if (platformMode === 'pai') { Container.bind(TrainingService) - .to(PAITrainingService) + .to(PAIK8STrainingService) .scope(Scope.Singleton); } else if (platformMode === 'paiYarn') { Container.bind(TrainingService) diff --git a/src/nni_manager/training_service/pai_base/paiBaseConfig.ts b/src/nni_manager/training_service/pai/paiConfig.ts similarity index 94% rename from src/nni_manager/training_service/pai_base/paiBaseConfig.ts rename to src/nni_manager/training_service/pai/paiConfig.ts index 4af2f6150f..c8f1c414fc 100644 --- a/src/nni_manager/training_service/pai_base/paiBaseConfig.ts +++ b/src/nni_manager/training_service/pai/paiConfig.ts @@ -6,7 +6,7 @@ import {TrialConfig} from '../common/trialConfig'; import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService'; -export class PAIBaseClusterConfig { +export class PAIClusterConfig { public readonly userName: string; public readonly passWord?: string; public readonly host: string; @@ -30,7 +30,7 @@ export class PAIBaseClusterConfig { /** * PAI trial job detail */ -export class PAIBaseTrialJobDetail implements TrialJobDetail { +export class PAITrialJobDetail implements TrialJobDetail { public id: string; public status: TrialJobStatus; public paiJobName: string; diff --git a/src/nni_manager/training_service/pai_base/paiBaseJobInfoCollector.ts b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts similarity index 92% rename from src/nni_manager/training_service/pai_base/paiBaseJobInfoCollector.ts rename to src/nni_manager/training_service/pai/paiJobInfoCollector.ts index 5ff4f9a058..33e98afede 100644 --- a/src/nni_manager/training_service/pai_base/paiBaseJobInfoCollector.ts +++ b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts @@ -8,24 +8,24 @@ import { Deferred } from 'ts-deferred'; import { NNIError, NNIErrorNames } from '../../common/errors'; import { getLogger, Logger } from '../../common/log'; import { TrialJobStatus } from '../../common/trainingService'; -import { PAIBaseClusterConfig, PAIBaseTrialJobDetail } from './paiBaseConfig'; +import { PAIClusterConfig, PAITrialJobDetail } from './paiConfig'; /** * Collector PAI jobs info from PAI cluster, and update pai job status locally */ -export class PAIBaseJobInfoCollector { - private readonly trialJobsMap: Map; +export class PAIJobInfoCollector { + private readonly trialJobsMap: Map; private readonly log: Logger = getLogger(); private readonly statusesNeedToCheck: TrialJobStatus[]; private readonly finalStatuses: TrialJobStatus[]; - constructor(jobMap: Map) { + constructor(jobMap: Map) { this.trialJobsMap = jobMap; this.statusesNeedToCheck = ['RUNNING', 'UNKNOWN', 'WAITING']; this.finalStatuses = ['SUCCEEDED', 'FAILED', 'USER_CANCELED', 'SYS_CANCELED', 'EARLY_STOPPED']; } - public async retrieveTrialStatus(token? : string, paiBaseClusterConfig?: PAIBaseClusterConfig): Promise { + public async retrieveTrialStatus(token? : string, paiBaseClusterConfig?: PAIClusterConfig): Promise { if (paiBaseClusterConfig === undefined || token === undefined) { return Promise.resolve(); } @@ -41,7 +41,7 @@ export class PAIBaseJobInfoCollector { await Promise.all(updatePaiTrialJobs); } - private getSinglePAITrialJobInfo(paiBaseTrialJob: PAIBaseTrialJobDetail, paiToken: string, paiBaseClusterConfig: PAIBaseClusterConfig): Promise { + private getSinglePAITrialJobInfo(paiBaseTrialJob: PAITrialJobDetail, paiToken: string, paiBaseClusterConfig: PAIClusterConfig): Promise { const deferred: Deferred = new Deferred(); if (!this.statusesNeedToCheck.includes(paiBaseTrialJob.status)) { deferred.resolve(); diff --git a/src/nni_manager/training_service/pai_base/paiBaseJobRestServer.ts b/src/nni_manager/training_service/pai/paiJobRestServer.ts similarity index 84% rename from src/nni_manager/training_service/pai_base/paiBaseJobRestServer.ts rename to src/nni_manager/training_service/pai/paiJobRestServer.ts index fe7b9f0ad6..78ab123fbf 100644 --- a/src/nni_manager/training_service/pai_base/paiBaseJobRestServer.ts +++ b/src/nni_manager/training_service/pai/paiJobRestServer.ts @@ -7,7 +7,7 @@ import { Request, Response, Router } from 'express'; import { Inject } from 'typescript-ioc'; import * as component from '../../common/component'; import { ClusterJobRestServer } from '../common/clusterJobRestServer'; -import { PAIBaseTrainingService } from './paiBaseTrainingService'; +import { PAITrainingService } from './paiTrainingService'; export interface ParameterFileMeta { readonly experimentId: string; @@ -20,25 +20,25 @@ export interface ParameterFileMeta { * */ @component.Singleton -export class PAIBaseJobRestServer extends ClusterJobRestServer { +export class PAIJobRestServer extends ClusterJobRestServer { protected parameterFileMetaList: ParameterFileMeta[] = []; @Inject - protected readonly paiBaseTrainingService: PAIBaseTrainingService; + protected readonly paiTrainingService: PAITrainingService; /** * constructor to provide NNIRestServer's own rest property, e.g. port */ - constructor (paiBaseTrainingService: PAIBaseTrainingService) { + constructor (paiBaseTrainingService: PAITrainingService) { super(); - this.paiBaseTrainingService = paiBaseTrainingService; + this.paiTrainingService = paiBaseTrainingService; } protected handleTrialMetrics(jobId: string, metrics: any[]): void { // Split metrics array into single metric, then emit // Warning: If not split metrics into single ones, the behavior will be UNKNOWN for (const singleMetric of metrics) { - this.paiBaseTrainingService.MetricsEmitter.emit('metric', { + this.paiTrainingService.MetricsEmitter.emit('metric', { id : jobId, data : singleMetric }); diff --git a/src/nni_manager/training_service/pai_base/pai/paiConfig.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8SConfig.ts similarity index 97% rename from src/nni_manager/training_service/pai_base/pai/paiConfig.ts rename to src/nni_manager/training_service/pai/paiK8S/paiK8SConfig.ts index 3463867078..70f175683e 100644 --- a/src/nni_manager/training_service/pai_base/pai/paiConfig.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8SConfig.ts @@ -23,7 +23,7 @@ import {TrialConfig} from '../../common/trialConfig'; /** * PAI trial configuration */ -export class NNIPAITrialConfig extends TrialConfig { +export class NNIPAIK8STrialConfig extends TrialConfig { public readonly cpuNum: number; public readonly memoryMB: number; public readonly image: string; diff --git a/src/nni_manager/training_service/pai_base/pai/paiData.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8SData.ts similarity index 97% rename from src/nni_manager/training_service/pai_base/pai/paiData.ts rename to src/nni_manager/training_service/pai/paiK8S/paiK8SData.ts index 7cbc4c6c73..a1733f99cd 100644 --- a/src/nni_manager/training_service/pai_base/pai/paiData.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8SData.ts @@ -29,7 +29,7 @@ else python3 -m pip install --user nni fi`; -export const PAI_LITE_TRIAL_COMMAND_FORMAT: string = +export const PAI_K8S_TRIAL_COMMAND_FORMAT: string = `export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \ && ls $NNI_SYS_DIR \ && cd $NNI_SYS_DIR && sh install_nni.sh \ diff --git a/src/nni_manager/training_service/pai_base/pai/paiJobRestServer.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8SJobRestServer.ts similarity index 74% rename from src/nni_manager/training_service/pai_base/pai/paiJobRestServer.ts rename to src/nni_manager/training_service/pai/paiK8S/paiK8SJobRestServer.ts index a89bdacc88..979570805b 100644 --- a/src/nni_manager/training_service/pai_base/pai/paiJobRestServer.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8SJobRestServer.ts @@ -7,8 +7,8 @@ import { Request, Response, Router } from 'express'; import { Inject } from 'typescript-ioc'; import * as component from '../../../common/component'; import { ClusterJobRestServer } from '../../common/clusterJobRestServer'; -import { PAITrainingService } from './paiTrainingService'; -import { PAIBaseJobRestServer } from '../paiBaseJobRestServer'; +import { PAIK8STrainingService } from './paiK8STrainingService'; +import { PAIJobRestServer } from '../paiJobRestServer'; export interface ParameterFileMeta { readonly experimentId: string; @@ -21,11 +21,11 @@ export interface ParameterFileMeta { * */ @component.Singleton -export class PAIJobRestServer extends PAIBaseJobRestServer { +export class PAIK8SJobRestServer extends PAIJobRestServer { /** * constructor to provide NNIRestServer's own rest property, e.g. port */ constructor() { - super(component.get(PAITrainingService)); + super(component.get(PAIK8STrainingService)); } } diff --git a/src/nni_manager/training_service/pai_base/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts similarity index 82% rename from src/nni_manager/training_service/pai_base/pai/paiTrainingService.ts rename to src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts index a83a7535ae..9af56a1fdb 100644 --- a/src/nni_manager/training_service/pai_base/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts @@ -37,11 +37,11 @@ import { delay, generateParamFileName, import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData'; import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; import { execMkdir, validateCodeDir, execCopydir } from '../../common/util'; -import { PAI_LITE_TRIAL_COMMAND_FORMAT } from './paiData'; -import { NNIPAITrialConfig } from './paiConfig'; -import { PAIJobRestServer } from './paiJobRestServer'; -import { PAIBaseTrainingService } from '../paiBaseTrainingService'; -import { PAIBaseClusterConfig, PAIBaseTrialJobDetail } from '../../pai_base/paiBaseConfig'; +import { PAI_K8S_TRIAL_COMMAND_FORMAT } from './paiK8SData'; +import { NNIPAIK8STrialConfig } from './paiK8SConfig'; +import { PAIK8SJobRestServer } from './paiK8SJobRestServer'; +import { PAITrainingService } from '../paiTrainingService'; +import { PAIClusterConfig, PAITrialJobDetail } from '../paiConfig'; const yaml = require('js-yaml'); @@ -50,28 +50,13 @@ const yaml = require('js-yaml'); * Refer https://github.com/Microsoft/pai for more info about OpenPAI */ @component.Singleton -class PAITrainingService extends PAIBaseTrainingService { - protected paiTrialConfig: NNIPAITrialConfig | undefined; +class PAIK8STrainingService extends PAITrainingService { + protected paiTrialConfig: NNIPAIK8STrialConfig | undefined; constructor() { super(); } - public async run(): Promise { - this.log.info('Run PAIYarn training service.'); - this.paiBaseJobRestServer = component.get(PAIJobRestServer); - if (this.paiBaseJobRestServer === undefined) { - throw new Error('paiBaseJobRestServer not initialized!'); - } - await this.paiBaseJobRestServer.start(); - this.paiBaseJobRestServer.setEnableVersionCheck = this.versionCheck; - this.log.info(`PAI Training service rest server listening on: ${this.paiBaseJobRestServer.endPoint}`); - await Promise.all([ - this.statusCheckingLoop(), - this.submitJobLoop()]); - this.log.info('PAI training service exit.'); - } - public async setClusterMetadata(key: string, value: string): Promise { const deferred: Deferred = new Deferred(); @@ -82,13 +67,14 @@ class PAITrainingService extends PAIBaseTrainingService { break; case TrialConfigMetadataKey.PAI_CLUSTER_CONFIG: - this.paiBaseClusterConfig = JSON.parse(value); + this.paiJobRestServer = component.get(PAIK8SJobRestServer); + this.paiClusterConfig = JSON.parse(value); - if(this.paiBaseClusterConfig.passWord) { + if(this.paiClusterConfig.passWord) { // Get PAI authentication token await this.updatePaiToken(); - } else if(this.paiBaseClusterConfig.token) { - this.paiToken = this.paiBaseClusterConfig.token; + } else if(this.paiClusterConfig.token) { + this.paiToken = this.paiClusterConfig.token; } else { deferred.reject(new Error('pai cluster config format error, please set password or token!')); } @@ -97,12 +83,12 @@ class PAITrainingService extends PAIBaseTrainingService { break; case TrialConfigMetadataKey.TRIAL_CONFIG: - if (this.paiBaseClusterConfig === undefined) { + if (this.paiClusterConfig === undefined) { this.log.error('pai cluster config is not initialized'); deferred.reject(new Error('pai cluster config is not initialized')); break; } - this.paiTrialConfig = JSON.parse(value); + this.paiTrialConfig = JSON.parse(value); // Validate to make sure codeDir doesn't have too many files try { @@ -141,10 +127,10 @@ class PAITrainingService extends PAIBaseTrainingService { } public async submitTrialJob(form: TrialJobApplicationForm): Promise { - if (this.paiBaseClusterConfig === undefined) { + if (this.paiClusterConfig === undefined) { throw new Error(`paiClusterConfig not initialized!`); } - const deferred: Deferred = new Deferred(); + const deferred: Deferred = new Deferred(); this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`); @@ -153,7 +139,7 @@ class PAITrainingService extends PAIBaseTrainingService { const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId); const paiJobName: string = `nni_exp_${this.experimentId}_trial_${trialJobId}`; const logPath: string = ''; - const trialJobDetail: PAIBaseTrialJobDetail = new PAIBaseTrialJobDetail( + const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail( trialJobId, 'WAITING', paiJobName, @@ -225,13 +211,13 @@ class PAITrainingService extends PAIBaseTrainingService { protected async submitTrialJobToPAI(trialJobId: string): Promise { const deferred: Deferred = new Deferred(); - const trialJobDetail: PAIBaseTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); + const trialJobDetail: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); if (trialJobDetail === undefined) { throw new Error(`Failed to find PAITrialJobDetail for job ${trialJobId}`); } - if (this.paiBaseClusterConfig === undefined) { + if (this.paiClusterConfig === undefined) { throw new Error('PAI Cluster config is not initialized'); } if (this.paiTrialConfig === undefined) { @@ -241,11 +227,12 @@ class PAITrainingService extends PAIBaseTrainingService { throw new Error('PAI token is not initialized'); } - if (this.paiRestServerPort === undefined) { - const restServer: PAIJobRestServer = component.get(PAIJobRestServer); - this.paiRestServerPort = restServer.clusterRestServerPort; + if (this.paiJobRestServer === undefined) { + throw new Error('paiJobRestServer is not initialized'); } + this.paiRestServerPort = this.paiJobRestServer.clusterRestServerPort; + // Step 1. Prepare PAI job configuration const trialLocalFolder: string = path.join(this.paiTrialConfig.nniManagerNFSMountPath, this.experimentId, trialJobId); //create trial local working folder locally. @@ -270,7 +257,7 @@ class PAITrainingService extends PAIBaseTrainingService { const version: string = this.versionCheck ? await getVersion() : ''; const containerWorkingDir: string = `${this.paiTrialConfig.containerNFSMountPath}/${this.experimentId}/${trialJobId}`; const nniPaiTrialCommand: string = String.Format( - PAI_LITE_TRIAL_COMMAND_FORMAT, + PAI_K8S_TRIAL_COMMAND_FORMAT, `${containerWorkingDir}`, `${containerWorkingDir}/nnioutput`, trialJobId, @@ -292,7 +279,7 @@ class PAITrainingService extends PAIBaseTrainingService { // Step 3. Submit PAI job via Rest call // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API const submitJobRequest: request.Options = { - uri: `http://${this.paiBaseClusterConfig.host}/rest-server/api/v2/jobs`, + uri: `http://${this.paiClusterConfig.host}/rest-server/api/v2/jobs`, method: 'POST', body: paiJobConfig, headers: { @@ -317,4 +304,4 @@ class PAITrainingService extends PAIBaseTrainingService { } } -export { PAITrainingService }; +export { PAIK8STrainingService }; diff --git a/src/nni_manager/training_service/pai_base/paiBaseTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts similarity index 81% rename from src/nni_manager/training_service/pai_base/paiBaseTrainingService.ts rename to src/nni_manager/training_service/pai/paiTrainingService.ts index d05e1896d3..b0c8d08fe2 100644 --- a/src/nni_manager/training_service/pai_base/paiBaseTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -22,29 +22,28 @@ import { delay, generateParamFileName, import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { execMkdir, validateCodeDir } from '../common/util'; -import { PAIBaseJobInfoCollector } from './paiBaseJobInfoCollector'; -import { PAIBaseJobRestServer, ParameterFileMeta } from './paiBaseJobRestServer'; -import { PAIBaseClusterConfig, PAIBaseTrialJobDetail } from './paiBaseConfig'; -import { PAIJobRestServer } from './pai/paiJobRestServer'; +import { PAIJobInfoCollector } from './paiJobInfoCollector'; +import { PAIJobRestServer, ParameterFileMeta } from './paiJobRestServer'; +import { PAIClusterConfig, PAITrialJobDetail } from './paiConfig'; /** * Training Service implementation for OpenPAI (Open Platform for AI) * Refer https://github.com/Microsoft/pai for more info about OpenPAI */ @component.Singleton -abstract class PAIBaseTrainingService implements TrainingService { +abstract class PAITrainingService implements TrainingService { protected readonly log!: Logger; protected readonly metricsEmitter: EventEmitter; - protected readonly trialJobsMap: Map; + protected readonly trialJobsMap: Map; protected readonly expRootDir: string; - protected paiBaseClusterConfig?: PAIBaseClusterConfig; + protected paiClusterConfig?: PAIClusterConfig; protected readonly jobQueue: string[]; protected stopping: boolean = false; protected paiToken? : string; protected paiTokenUpdateTime?: number; protected readonly paiTokenUpdateInterval: number; protected readonly experimentId!: string; - protected readonly paiJobCollector: PAIBaseJobInfoCollector; + protected readonly paiJobCollector: PAIJobInfoCollector; protected paiRestServerPort?: number; protected nniManagerIpConfig?: NNIManagerIpConfig; protected versionCheck: boolean = true; @@ -52,23 +51,33 @@ abstract class PAIBaseTrainingService implements TrainingService { protected isMultiPhase: boolean = false; protected authFileHdfsPath: string | undefined = undefined; protected portList?: string | undefined; - protected paiBaseJobRestServer?: PAIJobRestServer; + protected paiJobRestServer?: PAIJobRestServer; constructor() { this.log = getLogger(); this.metricsEmitter = new EventEmitter(); - this.trialJobsMap = new Map(); + this.trialJobsMap = new Map(); this.jobQueue = []; this.expRootDir = path.join('/nni', 'experiments', getExperimentId()); this.experimentId = getExperimentId(); - this.paiJobCollector = new PAIBaseJobInfoCollector(this.trialJobsMap); + this.paiJobCollector = new PAIJobInfoCollector(this.trialJobsMap); this.paiTokenUpdateInterval = 7200000; //2hours this.logCollection = 'none'; this.log.info('Construct paiBase training service.'); } public async run(): Promise { - throw new Error('Not implemented!'); + this.log.info('Run PAI training service.'); + if (this.paiJobRestServer === undefined) { + throw new Error('paiJobRestServer not initialized!'); + } + await this.paiJobRestServer.start(); + this.paiJobRestServer.setEnableVersionCheck = this.versionCheck; + this.log.info(`PAI Training service rest server listening on: ${this.paiJobRestServer.endPoint}`); + await Promise.all([ + this.statusCheckingLoop(), + this.submitJobLoop()]); + this.log.info('PAI training service exit.'); } public async submitTrialJob(form: TrialJobApplicationForm): Promise { @@ -114,11 +123,11 @@ abstract class PAIBaseTrainingService implements TrainingService { } public async getTrialJob(trialJobId: string): Promise { - if (this.paiBaseClusterConfig === undefined) { + if (this.paiClusterConfig === undefined) { throw new Error('PAI Cluster config is not initialized'); } - const paiBaseTrialJob: PAIBaseTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); + const paiBaseTrialJob: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); if (paiBaseTrialJob === undefined) { return Promise.reject(`trial job ${trialJobId} not found`); @@ -140,7 +149,7 @@ abstract class PAIBaseTrainingService implements TrainingService { } public cancelTrialJob(trialJobId: string, isEarlyStopped: boolean = false): Promise { - const trialJobDetail: PAIBaseTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); + const trialJobDetail: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); const deferred: Deferred = new Deferred(); if (trialJobDetail === undefined) { this.log.error(`cancelTrialJob: trial job id ${trialJobId} not found`); @@ -148,7 +157,7 @@ abstract class PAIBaseTrainingService implements TrainingService { return Promise.reject(); } - if (this.paiBaseClusterConfig === undefined) { + if (this.paiClusterConfig === undefined) { throw new Error('PAI Cluster config is not initialized'); } if (this.paiToken === undefined) { @@ -156,7 +165,7 @@ abstract class PAIBaseTrainingService implements TrainingService { } const stopJobRequest: request.Options = { - uri: `http://${this.paiBaseClusterConfig.host}/rest-server/api/v1/user/${this.paiBaseClusterConfig.userName}\ + uri: `http://${this.paiClusterConfig.host}/rest-server/api/v1/user/${this.paiClusterConfig.userName}\ /jobs/${trialJobDetail.paiJobName}/executionType`, method: 'PUT', json: true, @@ -194,13 +203,13 @@ abstract class PAIBaseTrainingService implements TrainingService { public async cleanUp(): Promise { this.log.info('Stopping PAI training service...'); this.stopping = true; - if (this.paiBaseJobRestServer === undefined) { + if (this.paiJobRestServer === undefined) { throw new Error('paiBaseJobRestServer not initialized!'); } const deferred: Deferred = new Deferred(); try { - await this.paiBaseJobRestServer.stop(); + await this.paiJobRestServer.stop(); deferred.resolve(); this.log.info('PAI Training service rest server stopped successfully.'); } catch (error) { @@ -217,7 +226,7 @@ abstract class PAIBaseTrainingService implements TrainingService { protected async statusCheckingLoop(): Promise { while (!this.stopping) { - if(this.paiBaseClusterConfig && this.paiBaseClusterConfig.passWord) { + if(this.paiClusterConfig && this.paiClusterConfig.passWord) { try { await this.updatePaiToken(); } catch (error) { @@ -228,12 +237,12 @@ abstract class PAIBaseTrainingService implements TrainingService { } } } - await this.paiJobCollector.retrieveTrialStatus(this.paiToken, this.paiBaseClusterConfig); - if (this.paiBaseJobRestServer === undefined) { + await this.paiJobCollector.retrieveTrialStatus(this.paiToken, this.paiClusterConfig); + if (this.paiJobRestServer === undefined) { throw new Error('paiBaseJobRestServer not implemented!'); } - if (this.paiBaseJobRestServer.getErrorMessage !== undefined) { - throw new Error(this.paiBaseJobRestServer.getErrorMessage); + if (this.paiJobRestServer.getErrorMessage !== undefined) { + throw new Error(this.paiJobRestServer.getErrorMessage); } await delay(3000); } @@ -251,19 +260,19 @@ abstract class PAIBaseTrainingService implements TrainingService { return Promise.resolve(); } - if (this.paiBaseClusterConfig === undefined) { + if (this.paiClusterConfig === undefined) { const paiClusterConfigError: string = `pai cluster config not initialized!`; this.log.error(`${paiClusterConfigError}`); throw Error(`${paiClusterConfigError}`); } const authenticationReq: request.Options = { - uri: `http://${this.paiBaseClusterConfig.host}/rest-server/api/v1/token`, + uri: `http://${this.paiClusterConfig.host}/rest-server/api/v1/token`, method: 'POST', json: true, body: { - username: this.paiBaseClusterConfig.userName, - password: this.paiBaseClusterConfig.passWord + username: this.paiClusterConfig.userName, + password: this.paiClusterConfig.passWord } }; @@ -295,4 +304,4 @@ abstract class PAIBaseTrainingService implements TrainingService { } } -export { PAIBaseTrainingService }; +export { PAITrainingService }; diff --git a/src/nni_manager/training_service/pai_base/paiYarn/hdfsClientUtility.ts b/src/nni_manager/training_service/pai/paiYarn/hdfsClientUtility.ts similarity index 100% rename from src/nni_manager/training_service/pai_base/paiYarn/hdfsClientUtility.ts rename to src/nni_manager/training_service/pai/paiYarn/hdfsClientUtility.ts diff --git a/src/nni_manager/training_service/pai_base/paiYarn/paiYarnConfig.ts b/src/nni_manager/training_service/pai/paiYarn/paiYarnConfig.ts similarity index 100% rename from src/nni_manager/training_service/pai_base/paiYarn/paiYarnConfig.ts rename to src/nni_manager/training_service/pai/paiYarn/paiYarnConfig.ts diff --git a/src/nni_manager/training_service/pai_base/paiYarn/paiYarnData.ts b/src/nni_manager/training_service/pai/paiYarn/paiYarnData.ts similarity index 100% rename from src/nni_manager/training_service/pai_base/paiYarn/paiYarnData.ts rename to src/nni_manager/training_service/pai/paiYarn/paiYarnData.ts diff --git a/src/nni_manager/training_service/pai_base/paiYarn/paiYarnJobRestServer.ts b/src/nni_manager/training_service/pai/paiYarn/paiYarnJobRestServer.ts similarity index 86% rename from src/nni_manager/training_service/pai_base/paiYarn/paiYarnJobRestServer.ts rename to src/nni_manager/training_service/pai/paiYarn/paiYarnJobRestServer.ts index 72e9965c5e..deee798717 100644 --- a/src/nni_manager/training_service/pai_base/paiYarn/paiYarnJobRestServer.ts +++ b/src/nni_manager/training_service/pai/paiYarn/paiYarnJobRestServer.ts @@ -8,7 +8,7 @@ import { Inject } from 'typescript-ioc'; import * as component from '../../../common/component'; import { ClusterJobRestServer } from '../../common/clusterJobRestServer'; import { PAIYarnTrainingService } from './paiYarnTrainingService'; -import { PAIBaseJobRestServer } from '../paiBaseJobRestServer'; +import { PAIJobRestServer } from '../paiJobRestServer'; export interface ParameterFileMeta { readonly experimentId: string; @@ -21,7 +21,7 @@ export interface ParameterFileMeta { * */ @component.Singleton -export class PAIYarnJobRestServer extends PAIBaseJobRestServer { +export class PAIYarnJobRestServer extends PAIJobRestServer { /** * constructor to provide NNIRestServer's own rest property, e.g. port */ diff --git a/src/nni_manager/training_service/pai_base/paiYarn/paiYarnTrainingService.ts b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts similarity index 84% rename from src/nni_manager/training_service/pai_base/paiYarn/paiYarnTrainingService.ts rename to src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts index 2d9c36b28b..9c05a3c3e8 100644 --- a/src/nni_manager/training_service/pai_base/paiYarn/paiYarnTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts @@ -25,10 +25,10 @@ import { execMkdir, validateCodeDir } from '../../common/util'; import { HDFSClientUtility } from './hdfsClientUtility'; import { NNIPAITrialConfig, PAIJobConfig, PAITaskRole } from './paiYarnConfig'; import { PAI_LOG_PATH_FORMAT, PAI_TRIAL_COMMAND_FORMAT } from './paiYarnData'; -import { PAIBaseJobInfoCollector } from '../paiBaseJobInfoCollector'; +import { PAIJobInfoCollector } from '../paiJobInfoCollector'; import { PAIYarnJobRestServer, ParameterFileMeta } from './paiYarnJobRestServer'; -import { PAIBaseTrainingService } from '../paiBaseTrainingService'; -import { PAIBaseClusterConfig, PAIBaseTrialJobDetail } from '../paiBaseConfig'; +import { PAITrainingService } from '../paiTrainingService'; +import { PAIClusterConfig, PAITrialJobDetail } from '../paiConfig'; import * as WebHDFS from 'webhdfs'; @@ -37,7 +37,7 @@ import * as WebHDFS from 'webhdfs'; * Refer https://github.com/Microsoft/pai for more info about OpenPAI */ @component.Singleton -class PAIYarnTrainingService extends PAIBaseTrainingService { +class PAIYarnTrainingService extends PAITrainingService { private hdfsClient: any; private copyExpCodeDirPromise?: Promise; private copyAuthFilePromise?: Promise; @@ -47,26 +47,11 @@ class PAIYarnTrainingService extends PAIBaseTrainingService { super(); } - public async run(): Promise { - this.log.info('Run PAIYarn training service.'); - this.paiBaseJobRestServer = component.get(PAIYarnJobRestServer); - if (this.paiBaseJobRestServer === undefined) { - throw new Error('paiBaseJobRestServer not initialized!'); - } - await this.paiBaseJobRestServer.start(); - this.paiBaseJobRestServer.setEnableVersionCheck = this.versionCheck; - this.log.info(`PAI Training service rest server listening on: ${this.paiBaseJobRestServer.endPoint}`); - await Promise.all([ - this.statusCheckingLoop(), - this.submitJobLoop()]); - this.log.info('PAI training service exit.'); - } - public async submitTrialJob(form: TrialJobApplicationForm): Promise { - if (this.paiBaseClusterConfig === undefined) { + if (this.paiClusterConfig === undefined) { throw new Error(`paiBaseClusterConfig not initialized!`); } - const deferred: Deferred = new Deferred(); + const deferred: Deferred = new Deferred(); this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`); @@ -74,16 +59,16 @@ class PAIYarnTrainingService extends PAIBaseTrainingService { //TODO: use HDFS working folder instead const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId); const paiJobName: string = `nni_exp_${this.experimentId}_trial_${trialJobId}`; - const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiBaseClusterConfig.userName, trialJobId); + const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiClusterConfig.userName, trialJobId); const hdfsOutputDir: string = unixPathJoin(hdfsCodeDir, 'nnioutput'); const hdfsLogPath: string = String.Format( PAI_LOG_PATH_FORMAT, - this.paiBaseClusterConfig.host, + this.paiClusterConfig.host, hdfsOutputDir ); - const trialJobDetail: PAIBaseTrialJobDetail = new PAIBaseTrialJobDetail( + const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail( trialJobId, 'WAITING', paiJobName, @@ -109,20 +94,21 @@ class PAIYarnTrainingService extends PAIBaseTrainingService { break; case TrialConfigMetadataKey.PAI_YARN_CLUSTER_CONFIG: - this.paiBaseClusterConfig = JSON.parse(value); + this.paiJobRestServer = component.get(PAIYarnJobRestServer); + this.paiClusterConfig = JSON.parse(value); this.hdfsClient = WebHDFS.createClient({ - user: this.paiBaseClusterConfig.userName, + user: this.paiClusterConfig.userName, // Refer PAI document for Pylon mapping https://github.com/Microsoft/pai/tree/master/docs/pylon port: 80, path: '/webhdfs/api/v1', - host: this.paiBaseClusterConfig.host + host: this.paiClusterConfig.host }); - if(this.paiBaseClusterConfig.passWord) { + if(this.paiClusterConfig.passWord) { // Get PAI authentication token await this.updatePaiToken(); - } else if(this.paiBaseClusterConfig.token) { - this.paiToken = this.paiBaseClusterConfig.token; + } else if(this.paiClusterConfig.token) { + this.paiToken = this.paiClusterConfig.token; } else { deferred.reject(new Error('paiBase cluster config format error, please set password or token!')); } @@ -131,7 +117,7 @@ class PAIYarnTrainingService extends PAIBaseTrainingService { break; case TrialConfigMetadataKey.TRIAL_CONFIG: - if (this.paiBaseClusterConfig === undefined) { + if (this.paiClusterConfig === undefined) { this.log.error('pai cluster config is not initialized'); deferred.reject(new Error('pai cluster config is not initialized')); break; @@ -150,13 +136,13 @@ class PAIYarnTrainingService extends PAIBaseTrainingService { // Copy experiment files from local folder to HDFS this.copyExpCodeDirPromise = HDFSClientUtility.copyDirectoryToHdfs( this.paiTrialConfig.codeDir, - HDFSClientUtility.getHdfsExpCodeDir(this.paiBaseClusterConfig.userName), + HDFSClientUtility.getHdfsExpCodeDir(this.paiClusterConfig.userName), this.hdfsClient ); // Upload authFile to hdfs if (this.paiTrialConfig.authFile) { - this.authFileHdfsPath = unixPathJoin(HDFSClientUtility.hdfsExpRootDir(this.paiBaseClusterConfig.userName), 'authFile'); + this.authFileHdfsPath = unixPathJoin(HDFSClientUtility.hdfsExpRootDir(this.paiClusterConfig.userName), 'authFile'); this.copyAuthFilePromise = HDFSClientUtility.copyFileToHdfs(this.paiTrialConfig.authFile, this.authFileHdfsPath, this.hdfsClient); } @@ -181,13 +167,13 @@ class PAIYarnTrainingService extends PAIBaseTrainingService { protected async submitTrialJobToPAI(trialJobId: string): Promise { const deferred: Deferred = new Deferred(); - const trialJobDetail: PAIBaseTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); + const trialJobDetail: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); if (trialJobDetail === undefined) { throw new Error(`Failed to find PAITrialJobDetail for job ${trialJobId}`); } - if (this.paiBaseClusterConfig === undefined) { + if (this.paiClusterConfig === undefined) { throw new Error('PAI Cluster config is not initialized'); } if (this.paiTrialConfig === undefined) { @@ -228,7 +214,7 @@ class PAIYarnTrainingService extends PAIBaseTrainingService { trialJobDetail.form.hyperParameters.value, { encoding: 'utf8' } ); } - const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiBaseClusterConfig.userName, trialJobId); + const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiClusterConfig.userName, trialJobId); const hdfsOutputDir: string = unixPathJoin(hdfsCodeDir, 'nnioutput'); const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address(); const version: string = this.versionCheck ? await getVersion() : ''; @@ -245,9 +231,9 @@ class PAIYarnTrainingService extends PAIBaseTrainingService { nniManagerIp, this.paiRestServerPort, hdfsOutputDir, - this.paiBaseClusterConfig.host, - this.paiBaseClusterConfig.userName, - HDFSClientUtility.getHdfsExpCodeDir(this.paiBaseClusterConfig.userName), + this.paiClusterConfig.host, + this.paiClusterConfig.userName, + HDFSClientUtility.getHdfsExpCodeDir(this.paiClusterConfig.userName), version, this.logCollection ) @@ -303,7 +289,7 @@ class PAIYarnTrainingService extends PAIBaseTrainingService { // Step 3. Submit PAI job via Rest call // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API const submitJobRequest: request.Options = { - uri: `http://${this.paiBaseClusterConfig.host}/rest-server/api/v1/user/${this.paiBaseClusterConfig.userName}/jobs`, + uri: `http://${this.paiClusterConfig.host}/rest-server/api/v1/user/${this.paiClusterConfig.userName}/jobs`, method: 'POST', json: true, body: paiJobConfig, @@ -338,7 +324,7 @@ class PAIYarnTrainingService extends PAIBaseTrainingService { } protected async writeParameterFile(trialJobId: string, hyperParameters: HyperParameters): Promise { - if (this.paiBaseClusterConfig === undefined) { + if (this.paiClusterConfig === undefined) { throw new Error('PAI Cluster config is not initialized'); } if (this.paiTrialConfig === undefined) { @@ -349,7 +335,7 @@ class PAIYarnTrainingService extends PAIBaseTrainingService { const hpFileName: string = generateParamFileName(hyperParameters); const localFilepath: string = path.join(trialLocalTempFolder, hpFileName); await fs.promises.writeFile(localFilepath, hyperParameters.value, { encoding: 'utf8' }); - const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiBaseClusterConfig.userName, trialJobId); + const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiClusterConfig.userName, trialJobId); const hdfsHpFilePath: string = path.join(hdfsCodeDir, hpFileName); await HDFSClientUtility.copyFileToHdfs(localFilepath, hdfsHpFilePath, this.hdfsClient); @@ -363,11 +349,11 @@ class PAIYarnTrainingService extends PAIBaseTrainingService { protected postParameterFileMeta(parameterFileMeta: ParameterFileMeta): Promise { const deferred: Deferred = new Deferred(); - if (this.paiBaseJobRestServer === undefined) { + if (this.paiJobRestServer === undefined) { throw new Error('paiBaseJobRestServer not implemented!'); } const req: request.Options = { - uri: `${this.paiBaseJobRestServer.endPoint}${this.paiBaseJobRestServer.apiRootUrl}/parameter-file-meta`, + uri: `${this.paiJobRestServer.endPoint}${this.paiJobRestServer.apiRootUrl}/parameter-file-meta`, method: 'POST', json: true, body: parameterFileMeta diff --git a/src/nni_manager/training_service/pai_base/paiYarn/paiYarnTrialConfig.ts b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrialConfig.ts similarity index 100% rename from src/nni_manager/training_service/pai_base/paiYarn/paiYarnTrialConfig.ts rename to src/nni_manager/training_service/pai/paiYarn/paiYarnTrialConfig.ts diff --git a/src/nni_manager/training_service/test/hdfsClientUtility.test.ts b/src/nni_manager/training_service/test/hdfsClientUtility.test.ts index dfa6878016..2bc7fa4daa 100644 --- a/src/nni_manager/training_service/test/hdfsClientUtility.test.ts +++ b/src/nni_manager/training_service/test/hdfsClientUtility.test.ts @@ -9,7 +9,7 @@ import * as os from 'os'; import * as path from 'path'; import * as tmp from 'tmp'; import { cleanupUnitTest, prepareUnitTest, uniqueString } from '../../common/utils'; -import { HDFSClientUtility } from '../pai_base/paiYarn/hdfsClientUtility'; +import { HDFSClientUtility } from '../pai/paiYarn/hdfsClientUtility'; var WebHDFS = require('webhdfs'); var rmdir = require('rmdir'); diff --git a/src/nni_manager/training_service/test/paiYarnTrainingService.test.ts b/src/nni_manager/training_service/test/paiYarnTrainingService.test.ts index 5f9ef7c06e..d595b92f2f 100644 --- a/src/nni_manager/training_service/test/paiYarnTrainingService.test.ts +++ b/src/nni_manager/training_service/test/paiYarnTrainingService.test.ts @@ -11,7 +11,7 @@ import * as component from '../../common/component'; import { TrialJobApplicationForm } from '../../common/trainingService'; import { cleanupUnitTest, prepareUnitTest } from '../../common/utils'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; -import { PAIYarnTrainingService } from '../pai_base/paiYarn/paiYarnTrainingService'; +import { PAIYarnTrainingService } from '../pai/paiYarn/paiYarnTrainingService'; // TODO: copy mockedTrail.py to local folder const localCodeDir: string = tmp.dirSync().name From cc802b0864fccb84f2c6fa402754580d9c1059dc Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Fri, 20 Dec 2019 17:37:54 +0800 Subject: [PATCH 14/18] remove paiYarnJobRestserver and paiK8SJobRestserver --- .../training_service/pai/paiJobRestServer.ts | 5 ++- .../pai/paiK8S/paiK8SJobRestServer.ts | 31 ------------------- .../pai/paiK8S/paiK8STrainingService.ts | 4 +-- .../pai/paiYarn/paiYarnJobRestServer.ts | 31 ------------------- .../pai/paiYarn/paiYarnTrainingService.ts | 11 ++++--- 5 files changed, 10 insertions(+), 72 deletions(-) delete mode 100644 src/nni_manager/training_service/pai/paiK8S/paiK8SJobRestServer.ts delete mode 100644 src/nni_manager/training_service/pai/paiYarn/paiYarnJobRestServer.ts diff --git a/src/nni_manager/training_service/pai/paiJobRestServer.ts b/src/nni_manager/training_service/pai/paiJobRestServer.ts index 78ab123fbf..ba4809a9b2 100644 --- a/src/nni_manager/training_service/pai/paiJobRestServer.ts +++ b/src/nni_manager/training_service/pai/paiJobRestServer.ts @@ -23,15 +23,14 @@ export interface ParameterFileMeta { export class PAIJobRestServer extends ClusterJobRestServer { protected parameterFileMetaList: ParameterFileMeta[] = []; - @Inject protected readonly paiTrainingService: PAITrainingService; /** * constructor to provide NNIRestServer's own rest property, e.g. port */ - constructor (paiBaseTrainingService: PAITrainingService) { + constructor (paiTrainingService: PAITrainingService) { super(); - this.paiTrainingService = paiBaseTrainingService; + this.paiTrainingService = paiTrainingService; } protected handleTrialMetrics(jobId: string, metrics: any[]): void { diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8SJobRestServer.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8SJobRestServer.ts deleted file mode 100644 index 979570805b..0000000000 --- a/src/nni_manager/training_service/pai/paiK8S/paiK8SJobRestServer.ts +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -'use strict'; - -import { Request, Response, Router } from 'express'; -import { Inject } from 'typescript-ioc'; -import * as component from '../../../common/component'; -import { ClusterJobRestServer } from '../../common/clusterJobRestServer'; -import { PAIK8STrainingService } from './paiK8STrainingService'; -import { PAIJobRestServer } from '../paiJobRestServer'; - -export interface ParameterFileMeta { - readonly experimentId: string; - readonly trialId: string; - readonly filePath: string; -} - -/** - * PAI Training service Rest server, provides rest API to support pai job metrics update - * - */ -@component.Singleton -export class PAIK8SJobRestServer extends PAIJobRestServer { - /** - * constructor to provide NNIRestServer's own rest property, e.g. port - */ - constructor() { - super(component.get(PAIK8STrainingService)); - } -} diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts index 9af56a1fdb..d35abaa0d5 100644 --- a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts @@ -39,9 +39,9 @@ import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; import { execMkdir, validateCodeDir, execCopydir } from '../../common/util'; import { PAI_K8S_TRIAL_COMMAND_FORMAT } from './paiK8SData'; import { NNIPAIK8STrialConfig } from './paiK8SConfig'; -import { PAIK8SJobRestServer } from './paiK8SJobRestServer'; import { PAITrainingService } from '../paiTrainingService'; import { PAIClusterConfig, PAITrialJobDetail } from '../paiConfig'; +import { PAIJobRestServer } from '../paiJobRestServer'; const yaml = require('js-yaml'); @@ -67,7 +67,7 @@ class PAIK8STrainingService extends PAITrainingService { break; case TrialConfigMetadataKey.PAI_CLUSTER_CONFIG: - this.paiJobRestServer = component.get(PAIK8SJobRestServer); + this.paiJobRestServer = new PAIJobRestServer(component.get(PAIK8STrainingService)); this.paiClusterConfig = JSON.parse(value); if(this.paiClusterConfig.passWord) { diff --git a/src/nni_manager/training_service/pai/paiYarn/paiYarnJobRestServer.ts b/src/nni_manager/training_service/pai/paiYarn/paiYarnJobRestServer.ts deleted file mode 100644 index deee798717..0000000000 --- a/src/nni_manager/training_service/pai/paiYarn/paiYarnJobRestServer.ts +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -'use strict'; - -import { Request, Response, Router } from 'express'; -import { Inject } from 'typescript-ioc'; -import * as component from '../../../common/component'; -import { ClusterJobRestServer } from '../../common/clusterJobRestServer'; -import { PAIYarnTrainingService } from './paiYarnTrainingService'; -import { PAIJobRestServer } from '../paiJobRestServer'; - -export interface ParameterFileMeta { - readonly experimentId: string; - readonly trialId: string; - readonly filePath: string; -} - -/** - * PAI Training service Rest server, provides rest API to support pai job metrics update - * - */ -@component.Singleton -export class PAIYarnJobRestServer extends PAIJobRestServer { - /** - * constructor to provide NNIRestServer's own rest property, e.g. port - */ - constructor() { - super(component.get(PAIYarnTrainingService)); - } -} diff --git a/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts index 9c05a3c3e8..af1025f3a5 100644 --- a/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts @@ -26,11 +26,11 @@ import { HDFSClientUtility } from './hdfsClientUtility'; import { NNIPAITrialConfig, PAIJobConfig, PAITaskRole } from './paiYarnConfig'; import { PAI_LOG_PATH_FORMAT, PAI_TRIAL_COMMAND_FORMAT } from './paiYarnData'; import { PAIJobInfoCollector } from '../paiJobInfoCollector'; -import { PAIYarnJobRestServer, ParameterFileMeta } from './paiYarnJobRestServer'; import { PAITrainingService } from '../paiTrainingService'; import { PAIClusterConfig, PAITrialJobDetail } from '../paiConfig'; import * as WebHDFS from 'webhdfs'; +import { PAIJobRestServer, ParameterFileMeta } from '../paiJobRestServer'; /** * Training Service implementation for OpenPAI (Open Platform for AI) @@ -94,7 +94,7 @@ class PAIYarnTrainingService extends PAITrainingService { break; case TrialConfigMetadataKey.PAI_YARN_CLUSTER_CONFIG: - this.paiJobRestServer = component.get(PAIYarnJobRestServer); + this.paiJobRestServer = new PAIJobRestServer(component.get(PAIYarnTrainingService)); this.paiClusterConfig = JSON.parse(value); this.hdfsClient = WebHDFS.createClient({ @@ -183,11 +183,12 @@ class PAIYarnTrainingService extends PAITrainingService { throw new Error('PAI token is not initialized'); } - if (this.paiRestServerPort === undefined) { - const restServer: PAIYarnJobRestServer = component.get(PAIYarnJobRestServer); - this.paiRestServerPort = restServer.clusterRestServerPort; + if (this.paiJobRestServer === undefined) { + throw new Error('paiJobRestServer is not initialized'); } + this.paiRestServerPort = this.paiJobRestServer.clusterRestServerPort; + // Make sure experiment code files is copied from local to HDFS if (this.copyExpCodeDirPromise !== undefined) { await this.copyExpCodeDirPromise; From 14ba08a44c83074cc546b0660ae9c51b4e01036e Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Fri, 20 Dec 2019 17:39:02 +0800 Subject: [PATCH 15/18] remove singleton --- src/nni_manager/training_service/pai/paiJobRestServer.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/src/nni_manager/training_service/pai/paiJobRestServer.ts b/src/nni_manager/training_service/pai/paiJobRestServer.ts index ba4809a9b2..38087f574c 100644 --- a/src/nni_manager/training_service/pai/paiJobRestServer.ts +++ b/src/nni_manager/training_service/pai/paiJobRestServer.ts @@ -19,7 +19,6 @@ export interface ParameterFileMeta { * PAI Training service Rest server, provides rest API to support pai job metrics update * */ -@component.Singleton export class PAIJobRestServer extends ClusterJobRestServer { protected parameterFileMetaList: ParameterFileMeta[] = []; From 0bf5f692eea9e8c2a07b02a7f5258a5916a2202c Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Fri, 20 Dec 2019 19:53:46 +0800 Subject: [PATCH 16/18] add logPath --- .../pai/paiJobInfoCollector.ts | 60 ++++++++++--------- .../pai/paiK8S/paiK8STrainingService.ts | 5 +- 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts index 33e98afede..ce50d4cd57 100644 --- a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts +++ b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts @@ -31,19 +31,19 @@ export class PAIJobInfoCollector { } const updatePaiTrialJobs: Promise[] = []; - for (const [trialJobId, paiBaseTrialJob] of this.trialJobsMap) { - if (paiBaseTrialJob === undefined) { + for (const [trialJobId, paiTrialJob] of this.trialJobsMap) { + if (paiTrialJob === undefined) { throw new NNIError(NNIErrorNames.NOT_FOUND, `trial job id ${trialJobId} not found`); } - updatePaiTrialJobs.push(this.getSinglePAITrialJobInfo(paiBaseTrialJob, token, paiBaseClusterConfig)); + updatePaiTrialJobs.push(this.getSinglePAITrialJobInfo(paiTrialJob, token, paiBaseClusterConfig)); } await Promise.all(updatePaiTrialJobs); } - private getSinglePAITrialJobInfo(paiBaseTrialJob: PAITrialJobDetail, paiToken: string, paiBaseClusterConfig: PAIClusterConfig): Promise { + private getSinglePAITrialJobInfo(paiTrialJob: PAITrialJobDetail, paiToken: string, paiClusterConfig: PAIClusterConfig): Promise { const deferred: Deferred = new Deferred(); - if (!this.statusesNeedToCheck.includes(paiBaseTrialJob.status)) { + if (!this.statusesNeedToCheck.includes(paiTrialJob.status)) { deferred.resolve(); return deferred.promise; @@ -52,7 +52,7 @@ export class PAIJobInfoCollector { // Rest call to get PAI job info and update status // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API const getJobInfoRequest: request.Options = { - uri: `http://${paiBaseClusterConfig.host}/rest-server/api/v1/user/${paiBaseClusterConfig.userName}/jobs/${paiBaseTrialJob.paiJobName}`, + uri: `http://${paiClusterConfig.host}/rest-server/api/v1/user/${paiClusterConfig.userName}/jobs/${paiTrialJob.paiJobName}`, method: 'GET', json: true, headers: { @@ -64,57 +64,61 @@ export class PAIJobInfoCollector { //TODO : pass in request timeout param? request(getJobInfoRequest, (error: Error, response: request.Response, body: any) => { if ((error !== undefined && error !== null) || response.statusCode >= 500) { - this.log.error(`PAI Training service: get job info for trial ${paiBaseTrialJob.id} from PAI Cluster failed!`); + this.log.error(`PAI Training service: get job info for trial ${paiTrialJob.id} from PAI Cluster failed!`); // Queried PAI job info failed, set job status to UNKNOWN - if (paiBaseTrialJob.status === 'WAITING' || paiBaseTrialJob.status === 'RUNNING') { - paiBaseTrialJob.status = 'UNKNOWN'; + if (paiTrialJob.status === 'WAITING' || paiTrialJob.status === 'RUNNING') { + paiTrialJob.status = 'UNKNOWN'; } } else { if (response.body.jobStatus && response.body.jobStatus.state) { switch (response.body.jobStatus.state) { case 'WAITING': - paiBaseTrialJob.status = 'WAITING'; + paiTrialJob.status = 'WAITING'; break; case 'RUNNING': - paiBaseTrialJob.status = 'RUNNING'; - if (paiBaseTrialJob.startTime === undefined) { - paiBaseTrialJob.startTime = response.body.jobStatus.appLaunchedTime; + paiTrialJob.status = 'RUNNING'; + if (paiTrialJob.startTime === undefined) { + paiTrialJob.startTime = response.body.jobStatus.appLaunchedTime; } - if (paiBaseTrialJob.url === undefined) { - paiBaseTrialJob.url = response.body.jobStatus.appTrackingUrl; + if (paiTrialJob.url === undefined) { + paiTrialJob.url = response.body.jobStatus.appTrackingUrl; } break; case 'SUCCEEDED': - paiBaseTrialJob.status = 'SUCCEEDED'; + paiTrialJob.status = 'SUCCEEDED'; break; case 'STOPPED': - if (paiBaseTrialJob.isEarlyStopped !== undefined) { - paiBaseTrialJob.status = paiBaseTrialJob.isEarlyStopped === true ? + if (paiTrialJob.isEarlyStopped !== undefined) { + paiTrialJob.status = paiTrialJob.isEarlyStopped === true ? 'EARLY_STOPPED' : 'USER_CANCELED'; } else { /* if paiTrialJob's isEarlyStopped is undefined, that mean we didn't stop it via cancellation, * mark it as SYS_CANCELLED by PAI */ - paiBaseTrialJob.status = 'SYS_CANCELED'; + paiTrialJob.status = 'SYS_CANCELED'; } break; case 'FAILED': - paiBaseTrialJob.status = 'FAILED'; + paiTrialJob.status = 'FAILED'; break; default: - paiBaseTrialJob.status = 'UNKNOWN'; + paiTrialJob.status = 'UNKNOWN'; } // For final job statues, update startTime, endTime and url - if (this.finalStatuses.includes(paiBaseTrialJob.status)) { - if (paiBaseTrialJob.startTime === undefined) { - paiBaseTrialJob.startTime = response.body.jobStatus.appLaunchedTime; + if (this.finalStatuses.includes(paiTrialJob.status)) { + if (paiTrialJob.startTime === undefined) { + paiTrialJob.startTime = response.body.jobStatus.appLaunchedTime; } - if (paiBaseTrialJob.endTime === undefined) { - paiBaseTrialJob.endTime = response.body.jobStatus.completedTime; + if (paiTrialJob.endTime === undefined) { + paiTrialJob.endTime = response.body.jobStatus.completedTime; } // Set pai trial job's url to WebHDFS output path - if (paiBaseTrialJob.logPath !== undefined) { - paiBaseTrialJob.url += `,${paiBaseTrialJob.logPath}`; + if (paiTrialJob.logPath !== undefined) { + if (paiTrialJob.url) { + paiTrialJob.url += `,${paiTrialJob.logPath}`; + } else { + paiTrialJob.url = `${paiTrialJob.logPath}`; + } } } } diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts index d35abaa0d5..3fc849fd2f 100644 --- a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts @@ -130,6 +130,9 @@ class PAIK8STrainingService extends PAITrainingService { if (this.paiClusterConfig === undefined) { throw new Error(`paiClusterConfig not initialized!`); } + if (this.paiTrialConfig === undefined) { + throw new Error(`paiTrialConfig not initialized!`); + } const deferred: Deferred = new Deferred(); this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`); @@ -138,7 +141,7 @@ class PAIK8STrainingService extends PAITrainingService { //TODO: use HDFS working folder instead const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId); const paiJobName: string = `nni_exp_${this.experimentId}_trial_${trialJobId}`; - const logPath: string = ''; + const logPath: string = path.join(this.paiTrialConfig.nniManagerNFSMountPath, this.experimentId, trialJobId); const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail( trialJobId, 'WAITING', From d09769e8703ac1c7c35c69a65ce381792911d0db Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Fri, 20 Dec 2019 20:22:21 +0800 Subject: [PATCH 17/18] fix comments --- .../pai/paiK8S/paiK8STrainingService.ts | 31 ++----------------- .../pai/paiTrainingService.ts | 24 +++++--------- .../pai/paiYarn/paiYarnTrainingService.ts | 4 +-- 3 files changed, 12 insertions(+), 47 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts index 3fc849fd2f..4b0a0f81a2 100644 --- a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts @@ -58,47 +58,26 @@ class PAIK8STrainingService extends PAITrainingService { } public async setClusterMetadata(key: string, value: string): Promise { - const deferred: Deferred = new Deferred(); - switch (key) { - case TrialConfigMetadataKey.NNI_MANAGER_IP: - this.nniManagerIpConfig = JSON.parse(value); - deferred.resolve(); - break; - case TrialConfigMetadataKey.PAI_CLUSTER_CONFIG: this.paiJobRestServer = new PAIJobRestServer(component.get(PAIK8STrainingService)); this.paiClusterConfig = JSON.parse(value); - if(this.paiClusterConfig.passWord) { // Get PAI authentication token await this.updatePaiToken(); } else if(this.paiClusterConfig.token) { this.paiToken = this.paiClusterConfig.token; - } else { - deferred.reject(new Error('pai cluster config format error, please set password or token!')); } - - deferred.resolve(); break; case TrialConfigMetadataKey.TRIAL_CONFIG: if (this.paiClusterConfig === undefined) { this.log.error('pai cluster config is not initialized'); - deferred.reject(new Error('pai cluster config is not initialized')); break; } this.paiTrialConfig = JSON.parse(value); - // Validate to make sure codeDir doesn't have too many files - try { - await validateCodeDir(this.paiTrialConfig.codeDir); - } catch (error) { - this.log.error(error); - deferred.reject(new Error(error)); - break; - } - deferred.resolve(); + await validateCodeDir(this.paiTrialConfig.codeDir); break; case TrialConfigMetadataKey.VERSION_CHECK: this.versionCheck = (value === 'true' || value === 'True'); @@ -111,10 +90,8 @@ class PAIK8STrainingService extends PAITrainingService { break; default: //Reject for unknown keys - deferred.reject(new Error(`Uknown key: ${key}`)); + this.log.error(`Uknown key: ${key}`); } - - return deferred.promise; } //TODO: update trial parameters @@ -133,7 +110,6 @@ class PAIK8STrainingService extends PAITrainingService { if (this.paiTrialConfig === undefined) { throw new Error(`paiTrialConfig not initialized!`); } - const deferred: Deferred = new Deferred(); this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`); @@ -153,9 +129,8 @@ class PAIK8STrainingService extends PAITrainingService { this.trialJobsMap.set(trialJobId, trialJobDetail); this.jobQueue.push(trialJobId); - deferred.resolve(trialJobDetail); - return deferred.promise; + return trialJobDetail; } public generateJobConfigInYamlFormat(trialJobId: string, command: string) { diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index b0c8d08fe2..0edbc66966 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -119,7 +119,7 @@ abstract class PAITrainingService implements TrainingService { jobs.push(await this.getTrialJob(key)); } - return Promise.resolve(jobs); + return jobs; } public async getTrialJob(trialJobId: string): Promise { @@ -127,13 +127,13 @@ abstract class PAITrainingService implements TrainingService { throw new Error('PAI Cluster config is not initialized'); } - const paiBaseTrialJob: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); + const paiTrialJob: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); - if (paiBaseTrialJob === undefined) { - return Promise.reject(`trial job ${trialJobId} not found`); + if (paiTrialJob === undefined) { + throw new Error(`trial job ${trialJobId} not found`); } - return Promise.resolve(paiBaseTrialJob); + return paiTrialJob; } public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { @@ -193,31 +193,23 @@ abstract class PAITrainingService implements TrainingService { } public getClusterMetadata(key: string): Promise { - const deferred: Deferred = new Deferred(); - - deferred.resolve(); - - return deferred.promise; + throw new Error('Not implemented!'); } public async cleanUp(): Promise { this.log.info('Stopping PAI training service...'); this.stopping = true; + if (this.paiJobRestServer === undefined) { - throw new Error('paiBaseJobRestServer not initialized!'); + throw new Error('paiJobRestServer not initialized!'); } - const deferred: Deferred = new Deferred(); try { await this.paiJobRestServer.stop(); - deferred.resolve(); this.log.info('PAI Training service rest server stopped successfully.'); } catch (error) { this.log.error(`PAI Training service rest server stopped failed, error: ${error.message}`); - deferred.reject(error); } - - return deferred.promise; } public get MetricsEmitter(): EventEmitter { diff --git a/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts index af1025f3a5..5de0ffaae4 100644 --- a/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts @@ -282,9 +282,7 @@ class PAIYarnTrainingService extends PAITrainingService { } catch (error) { this.log.error(`PAI Training service: copy ${this.paiTrialConfig.codeDir} to HDFS ${hdfsCodeDir} failed, error is ${error}`); trialJobDetail.status = 'FAILED'; // eslint-disable-line require-atomic-updates - deferred.resolve(true); - - return deferred.promise; + return true; } // Step 3. Submit PAI job via Rest call From 2b57042d02041dd8ea1355c056acd2cdd539206b Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Sun, 22 Dec 2019 20:54:21 +0800 Subject: [PATCH 18/18] fix comments --- .../pai/paiYarn/paiYarnTrainingService.ts | 24 ++++--------------- tools/nni_cmd/launcher_utils.py | 2 +- 2 files changed, 5 insertions(+), 21 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts index 5de0ffaae4..2106cf145f 100644 --- a/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts @@ -51,7 +51,6 @@ class PAIYarnTrainingService extends PAITrainingService { if (this.paiClusterConfig === undefined) { throw new Error(`paiBaseClusterConfig not initialized!`); } - const deferred: Deferred = new Deferred(); this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`); @@ -79,18 +78,14 @@ class PAIYarnTrainingService extends PAITrainingService { this.trialJobsMap.set(trialJobId, trialJobDetail); this.jobQueue.push(trialJobId); - deferred.resolve(trialJobDetail); - return deferred.promise; + return trialJobDetail; } public async setClusterMetadata(key: string, value: string): Promise { - const deferred: Deferred = new Deferred(); - switch (key) { case TrialConfigMetadataKey.NNI_MANAGER_IP: this.nniManagerIpConfig = JSON.parse(value); - deferred.resolve(); break; case TrialConfigMetadataKey.PAI_YARN_CLUSTER_CONFIG: @@ -110,28 +105,20 @@ class PAIYarnTrainingService extends PAITrainingService { } else if(this.paiClusterConfig.token) { this.paiToken = this.paiClusterConfig.token; } else { - deferred.reject(new Error('paiBase cluster config format error, please set password or token!')); + throw new Error('pai cluster config format error, please set password or token!'); } - deferred.resolve(); break; case TrialConfigMetadataKey.TRIAL_CONFIG: if (this.paiClusterConfig === undefined) { this.log.error('pai cluster config is not initialized'); - deferred.reject(new Error('pai cluster config is not initialized')); break; } this.paiTrialConfig = JSON.parse(value); // Validate to make sure codeDir doesn't have too many files - try { - await validateCodeDir(this.paiTrialConfig.codeDir); - } catch (error) { - this.log.error(error); - deferred.reject(new Error(error)); - break; - } + await validateCodeDir(this.paiTrialConfig.codeDir); // Copy experiment files from local folder to HDFS this.copyExpCodeDirPromise = HDFSClientUtility.copyDirectoryToHdfs( @@ -146,7 +133,6 @@ class PAIYarnTrainingService extends PAITrainingService { this.copyAuthFilePromise = HDFSClientUtility.copyFileToHdfs(this.paiTrialConfig.authFile, this.authFileHdfsPath, this.hdfsClient); } - deferred.resolve(); break; case TrialConfigMetadataKey.VERSION_CHECK: this.versionCheck = (value === 'true' || value === 'True'); @@ -161,8 +147,6 @@ class PAIYarnTrainingService extends PAITrainingService { //Reject for unknown keys throw new Error(`Uknown key: ${key}`); } - - return deferred.promise; } protected async submitTrialJobToPAI(trialJobId: string): Promise { @@ -349,7 +333,7 @@ class PAIYarnTrainingService extends PAITrainingService { protected postParameterFileMeta(parameterFileMeta: ParameterFileMeta): Promise { const deferred: Deferred = new Deferred(); if (this.paiJobRestServer === undefined) { - throw new Error('paiBaseJobRestServer not implemented!'); + throw new Error('paiJobRestServer not implemented!'); } const req: request.Options = { uri: `${this.paiJobRestServer.endPoint}${this.paiJobRestServer.apiRootUrl}/parameter-file-meta`, diff --git a/tools/nni_cmd/launcher_utils.py b/tools/nni_cmd/launcher_utils.py index 5c1178c979..64d264dadd 100644 --- a/tools/nni_cmd/launcher_utils.py +++ b/tools/nni_cmd/launcher_utils.py @@ -262,7 +262,7 @@ def validate_machine_list(experiment_config): def validate_pai_trial_conifg(experiment_config): '''validate the trial config in pai platform''' - if experiment_config.get('trainingServicePlatform') in ['pai', 'PAIYarn']: + if experiment_config.get('trainingServicePlatform') in ['pai', 'paiYarn']: if experiment_config.get('trial').get('shmMB') and \ experiment_config['trial']['shmMB'] > experiment_config['trial']['memoryMB']: print_error('shmMB should be no more than memoryMB!')