From f7c6ef308553b919da31d16cb45d1fcaa32bc9a8 Mon Sep 17 00:00:00 2001 From: Junwei Sun <30487595+JunweiSUN@users.noreply.github.com> Date: Wed, 12 Aug 2020 19:58:51 +0800 Subject: [PATCH] update nnicli (#2713) (cherry picked from commit f82ef623c1b813e7676849f885c12bcdc98d2a8e) --- docs/en_US/Tutorial/Nnictl.md | 2 +- docs/en_US/conf.py | 1 + docs/en_US/nnicli_ref.md | 41 ++ docs/en_US/sdk_reference.rst | 3 +- src/sdk/pycli/nnicli/nni_client.py | 544 ++++++++++++++++++++++---- test/config/integration_tests.yml | 4 +- test/config/integration_tests_tf2.yml | 4 +- test/config/pr_tests.yml | 8 +- test/nni_test/nnitest/validators.py | 12 +- tools/nni_cmd/updater.py | 2 +- 10 files changed, 520 insertions(+), 101 deletions(-) create mode 100644 docs/en_US/nnicli_ref.md diff --git a/docs/en_US/Tutorial/Nnictl.md b/docs/en_US/Tutorial/Nnictl.md index 3c71ccf8a5..adcc4b91e3 100644 --- a/docs/en_US/Tutorial/Nnictl.md +++ b/docs/en_US/Tutorial/Nnictl.md @@ -262,7 +262,7 @@ Debug mode will disable version check function in Trialkeeper. |Name, shorthand|Required|Default|Description| |------|------|------ |------| |id| False| |ID of the experiment you want to set| - |--value, -v| True| |the experiment duration will be NUMBER seconds. SUFFIX may be 's' for seconds (the default), 'm' for minutes, 'h' for hours or 'd' for days.| + |--value, -v| True| | Strings like '1m' for one minute or '2h' for two hours. SUFFIX may be 's' for seconds, 'm' for minutes, 'h' for hours or 'd' for days.| * Example diff --git a/docs/en_US/conf.py b/docs/en_US/conf.py index 6962037bfb..2d6ce2fb75 100644 --- a/docs/en_US/conf.py +++ b/docs/en_US/conf.py @@ -17,6 +17,7 @@ import os import sys sys.path.insert(0, os.path.abspath('../../src/sdk/pynni')) +sys.path.insert(1, os.path.abspath('../../src/sdk/pycli')) # -- Project information --------------------------------------------------- diff --git a/docs/en_US/nnicli_ref.md b/docs/en_US/nnicli_ref.md new file mode 100644 index 0000000000..02c8cbbb30 --- /dev/null +++ b/docs/en_US/nnicli_ref.md @@ -0,0 +1,41 @@ +# NNI Client + +NNI client is a python API of `nnictl`, which implements the most commonly used commands. Users can use this API to control their experiments, collect experiment results and conduct advanced analyses based on experiment results in python code directly instead of using command line. Here is an example: + +``` +from nnicli import Experiment + +# create an experiment instance +exp = Experiment() + +# start an experiment, then connect the instance to this experiment +# you can also use `resume_experiment`, `view_experiment` or `connect_experiment` +# only one of them should be called in one instance +exp.start_experiment('nni/examples/trials/mnist-pytorch/config.yml', port=9090) + +# update the experiment's concurrency +exp.update_concurrency(3) + +# get some information about the experiment +print(exp.get_experiment_status()) +print(exp.get_job_statistics()) +print(exp.list_trial_jobs()) + +# stop the experiment, then disconnect the instance from the experiment. +exp.stop_experiment() +``` + +## References + +```eval_rst +.. autoclass:: nnicli.Experiment + :members: +.. autoclass:: nnicli.TrialJob + :members: +.. autoclass:: nnicli.TrialHyperParameters + :members: +.. autoclass:: nnicli.TrialMetricData + :members: +.. autoclass:: nnicli.TrialResult + :members: +``` diff --git a/docs/en_US/sdk_reference.rst b/docs/en_US/sdk_reference.rst index 2602e257b9..ca87bf7500 100644 --- a/docs/en_US/sdk_reference.rst +++ b/docs/en_US/sdk_reference.rst @@ -8,4 +8,5 @@ Python API Reference Auto Tune NAS - Compression Utilities \ No newline at end of file + Compression Utilities + NNI Client \ No newline at end of file diff --git a/src/sdk/pycli/nnicli/nni_client.py b/src/sdk/pycli/nnicli/nni_client.py index ca083c2c7f..571e2bd036 100644 --- a/src/sdk/pycli/nnicli/nni_client.py +++ b/src/sdk/pycli/nnicli/nni_client.py @@ -5,67 +5,47 @@ Example: -import nnicli as nc +from nnicli import Experiment -nc.start_nni('../../../../examples/trials/mnist/config.yml') +exp = Experiment() +exp.start_experiment('../../../../examples/trials/mnist-pytorch/config.yml') -nc.set_endpoint('http://localhost:8080') +exp.update_concurrency(3) -print(nc.version()) -print(nc.get_experiment_status()) +print(exp.get_experiment_status()) +print(exp.get_job_statistics()) +print(exp.list_trial_jobs()) -print(nc.get_job_statistics()) -print(nc.list_trial_jobs()) - -nc.stop_nni() +exp.stop_experiment() """ import sys import os import subprocess +import re +import json import requests __all__ = [ - 'start_nni', - 'stop_nni', - 'set_endpoint', - 'version', - 'get_experiment_status', - 'get_experiment_profile', - 'get_trial_job', - 'list_trial_jobs', - 'get_job_statistics', - 'get_job_metrics', - 'export_data' + 'Experiment', + 'TrialResult', + 'TrialMetricData', + 'TrialHyperParameters', + 'TrialJob' ] EXPERIMENT_PATH = 'experiment' -VERSION_PATH = 'version' STATUS_PATH = 'check-status' JOB_STATISTICS_PATH = 'job-statistics' TRIAL_JOBS_PATH = 'trial-jobs' METRICS_PATH = 'metric-data' EXPORT_DATA_PATH = 'export-data' - API_ROOT_PATH = 'api/v1/nni' -_api_endpoint = None - -def set_endpoint(endpoint): - """set endpoint of nni rest server for nnicli, for example: - http://localhost:8080 - """ - global _api_endpoint - _api_endpoint = endpoint - -def _check_endpoint(): - if _api_endpoint is None: - raise AssertionError("Please call set_endpoint to specify nni endpoint") - -def _nni_rest_get(api_path, response_type='json'): - _check_endpoint() - uri = '{}/{}/{}'.format(_api_endpoint, API_ROOT_PATH, api_path) +def _nni_rest_get(endpoint, api_path, response_type='json'): + _check_endpoint(endpoint) + uri = '{}/{}/{}'.format(endpoint.strip('/'), API_ROOT_PATH, api_path) res = requests.get(uri) if _http_succeed(res.status_code): if response_type == 'json': @@ -73,7 +53,7 @@ def _nni_rest_get(api_path, response_type='json'): elif response_type == 'text': return res.text else: - raise AssertionError('Incorrect response_type') + raise RuntimeError('Incorrect response_type') else: return None @@ -92,48 +72,444 @@ def _create_process(cmd): print(output.decode('utf-8').strip()) return process.returncode -def start_nni(config_file): - """start nni experiment with specified configuration file""" - cmd = 'nnictl create --config {}'.format(config_file).split(' ') - if _create_process(cmd) != 0: - raise RuntimeError('Failed to start nni.') - -def stop_nni(): - """stop nni experiment""" - cmd = 'nnictl stop'.split(' ') - if _create_process(cmd) != 0: - raise RuntimeError('Failed to stop nni.') - -def version(): - """return version of nni""" - return _nni_rest_get(VERSION_PATH, 'text') - -def get_experiment_status(): - """return experiment status as a dict""" - return _nni_rest_get(STATUS_PATH) - -def get_experiment_profile(): - """return experiment profile as a dict""" - return _nni_rest_get(EXPERIMENT_PATH) - -def get_trial_job(trial_job_id): - """return trial job information as a dict""" - assert trial_job_id is not None - return _nni_rest_get(os.path.join(TRIAL_JOBS_PATH, trial_job_id)) - -def list_trial_jobs(): - """return information for all trial jobs as a list""" - return _nni_rest_get(TRIAL_JOBS_PATH) - -def get_job_statistics(): - """return trial job statistics information as a dict""" - return _nni_rest_get(JOB_STATISTICS_PATH) - -def get_job_metrics(trial_job_id=None): - """return trial job metrics""" - api_path = METRICS_PATH if trial_job_id is None else os.path.join(METRICS_PATH, trial_job_id) - return _nni_rest_get(api_path) - -def export_data(): - """return exported information for all trial jobs""" - return _nni_rest_get(EXPORT_DATA_PATH) +def _check_endpoint(endpoint): + if endpoint is None: + raise RuntimeError("This instance hasn't been connect to an experiment.") + +class TrialResult: + """ + TrialResult stores the result information of a trial job. + + Parameters + ---------- + json_obj: dict + Json object that stores the result information. + + Attributes + ---------- + parameter: dict + Hyper parameters for this trial. + value: serializable object, usually a number, or a dict with key "default" and other extra keys + Final result. + trialJobId: str + Trial job id. + """ + def __init__(self, json_obj): + self.parameter = None + self.value = None + self.trialJobId = None + for key in json_obj.keys(): + if key == 'id': + setattr(self, 'trialJobId', json_obj[key]) + elif hasattr(self, key): + setattr(self, key, json_obj[key]) + self.value = json.loads(self.value) + + def __repr__(self): + return "TrialResult(parameter: {} value: {} trialJobId: {})".format(self.parameter, self.value, self.trialJobId) + +class TrialMetricData: + """ + TrialMetricData stores the metric data of a trial job. + A trial job may have both intermediate metric and final metric. + + Parameters + ---------- + json_obj: dict + Json object that stores the metric data. + + Attributes + ---------- + timestamp: int + Time stamp. + trialJobId: str + Trial job id. + parameterId: int + Parameter id. + type: str + Metric type, `PERIODICAL` for intermediate result and `FINAL` for final result. + sequence: int + Sequence number in this trial. + data: serializable object, usually a number, or a dict with key "default" and other extra keys + Metric data. + """ + def __init__(self, json_obj): + self.timestamp = None + self.trialJobId = None + self.parameterId = None + self.type = None + self.sequence = None + self.data = None + for key in json_obj.keys(): + setattr(self, key, json_obj[key]) + self.data = json.loads(json.loads(self.data)) + + def __repr__(self): + return "TrialMetricData(timestamp: {} trialJobId: {} parameterId: {} type: {} sequence: {} data: {})" \ + .format(self.timestamp, self.trialJobId, self.parameterId, self.type, self.sequence, self.data) + +class TrialHyperParameters: + """ + TrialHyperParameters stores the hyper parameters of a trial job. + + Parameters + ---------- + json_obj: dict + Json object that stores the hyper parameters. + + Attributes + ---------- + parameter_id: int + Parameter id. + parameter_source: str + Parameter source. + parameters: dict + Hyper parameters. + parameter_index: int + Parameter index. + """ + def __init__(self, json_obj): + self.parameter_id = None + self.parameter_source = None + self.parameters = None + self.parameter_index = None + for key in json_obj.keys(): + if hasattr(self, key): + setattr(self, key, json_obj[key]) + + def __repr__(self): + return "TrialHyperParameters(parameter_id: {} parameter_source: {} parameters: {} parameter_index: {})" \ + .format(self.parameter_id, self.parameter_source, self.parameters, self.parameter_index) + +class TrialJob: + """ + TrialJob stores the information of a trial job. + + Parameters + ---------- + json_obj: dict + json object that stores the hyper parameters + + Attributes + ---------- + trialJobId: str + Trial job id. + status: str + Job status. + hyperParameters: list of `nnicli.TrialHyperParameters` + See `nnicli.TrialHyperParameters`. + logPath: str + Log path. + startTime: int + Job start time (timestamp). + endTime: int + Job end time (timestamp). + finalMetricData: list of `nnicli.TrialMetricData` + See `nnicli.TrialMetricData`. + parameter_index: int + Parameter index. + """ + def __init__(self, json_obj): + self.trialJobId = None + self.status = None + self.hyperParameters = None + self.logPath = None + self.startTime = None + self.endTime = None + self.finalMetricData = None + self.stderrPath = None + for key in json_obj.keys(): + if key == 'id': + setattr(self, 'trialJobId', json_obj[key]) + elif hasattr(self, key): + setattr(self, key, json_obj[key]) + if self.hyperParameters: + self.hyperParameters = [TrialHyperParameters(json.loads(e)) for e in self.hyperParameters] + if self.finalMetricData: + self.finalMetricData = [TrialMetricData(e) for e in self.finalMetricData] + + def __repr__(self): + return ("TrialJob(trialJobId: {} status: {} hyperParameters: {} logPath: {} startTime: {} " + "endTime: {} finalMetricData: {} stderrPath: {})") \ + .format(self.trialJobId, self.status, self.hyperParameters, self.logPath, + self.startTime, self.endTime, self.finalMetricData, self.stderrPath) + +class Experiment: + def __init__(self): + self._endpoint = None + self._exp_id = None + self._port = None + + @property + def endpoint(self): + return self._endpoint + + @property + def exp_id(self): + return self._exp_id + + @property + def port(self): + return self._port + + def _exec_command(self, cmd, port=None): + if self._endpoint is not None: + raise RuntimeError('This instance has been connected to an experiment.') + if _create_process(cmd) != 0: + raise RuntimeError('Failed to establish experiment, please check your config.') + else: + if port: + self._port = port + else: + self._port = 8080 + self._endpoint = 'http://localhost:{}'.format(self._port) + self._exp_id = self.get_experiment_profile()['id'] + + def start_experiment(self, config_file, port=None, debug=False): + """ + Start an experiment with specified configuration file and connect to it. + + Parameters + ---------- + config_file: str + Path to the config file. + port: int + The port of restful server, bigger than 1024. + debug: boolean + Set debug mode. + """ + cmd = 'nnictl create --config {}'.format(config_file).split(' ') + if port: + cmd += '--port {}'.format(port).split(' ') + if debug: + cmd += ['--debug'] + self._exec_command(cmd, port) + + def resume_experiment(self, exp_id, port=None, debug=False): + """ + Resume a stopped experiment with specified experiment id + + Parameters + ---------- + exp_id: str + Experiment id. + port: int + The port of restful server, bigger than 1024. + debug: boolean + Set debug mode. + """ + cmd = 'nnictl resume {}'.format(exp_id).split(' ') + if port: + cmd += '--port {}'.format(port).split(' ') + if debug: + cmd += ['--debug'] + self._exec_command(cmd, port) + + def view_experiment(self, exp_id, port=None): + """ + View a stopped experiment with specified experiment id. + + Parameters + ---------- + exp_id: str + Experiment id. + port: int + The port of restful server, bigger than 1024. + """ + cmd = 'nnictl view {}'.format(exp_id).split(' ') + if port: + cmd += '--port {}'.format(port).split(' ') + self._exec_command(cmd, port) + + def connect_experiment(self, endpoint): + """ + Connect to an existing experiment. + + Parameters + ---------- + endpoint: str + The endpoint of nni rest server, i.e, the url of Web UI. Should be a format like `http://ip:port`. + """ + if self._endpoint is not None: + raise RuntimeError('This instance has been connected to an experiment.') + self._endpoint = endpoint + try: + self._exp_id = self.get_experiment_profile()['id'] + except TypeError: + raise RuntimeError('Invalid experiment endpoint.') + self._port = int(re.search(r':[0-9]+', self._endpoint).group().replace(':', '')) + + def stop_experiment(self): + """Stop the experiment. + """ + _check_endpoint(self._endpoint) + cmd = 'nnictl stop {}'.format(self._exp_id).split(' ') + if _create_process(cmd) != 0: + raise RuntimeError('Failed to stop experiment.') + self._endpoint = None + self._exp_id = None + self._port = None + + def update_searchspace(self, filename): + """ + Update the experiment's search space. + + Parameters + ---------- + filename: str + Path to the searchspace file. + """ + _check_endpoint(self._endpoint) + cmd = 'nnictl update searchspace {} --filename {}'.format(self._exp_id, filename).split(' ') + if _create_process(cmd) != 0: + raise RuntimeError('Failed to update searchspace.') + + def update_concurrency(self, value): + """ + Update an experiment's concurrency + + Parameters + ---------- + value: int + New concurrency value. + """ + _check_endpoint(self._endpoint) + cmd = 'nnictl update concurrency {} --value {}'.format(self._exp_id, value).split(' ') + if _create_process(cmd) != 0: + raise RuntimeError('Failed to update concurrency.') + + def update_duration(self, value): + """ + Update an experiment's duration + + Parameters + ---------- + value: str + Strings like '1m' for one minute or '2h' for two hours. + SUFFIX may be 's' for seconds, 'm' for minutes, 'h' for hours or 'd' for days. + """ + _check_endpoint(self._endpoint) + cmd = 'nnictl update duration {} --value {}'.format(self._exp_id, value).split(' ') + if _create_process(cmd) != 0: + raise RuntimeError('Failed to update duration.') + + def update_trailnum(self, value): + """ + Update an experiment's maxtrialnum + + Parameters + ---------- + value: int + New trailnum value. + """ + _check_endpoint(self._endpoint) + cmd = 'nnictl update trialnum {} --value {}'.format(self._exp_id, value).split(' ') + if _create_process(cmd) != 0: + raise RuntimeError('Failed to update trailnum.') + + def get_experiment_status(self): + """ + Return experiment status as a dict. + + Returns + ---------- + dict + Experiment status. + """ + _check_endpoint(self._endpoint) + return _nni_rest_get(self._endpoint, STATUS_PATH) + + def get_trial_job(self, trial_job_id): + """ + Return a trial job. + + Parameters + ---------- + trial_job_id: str + Trial job id. + + Returns + ---------- + nnicli.TrialJob + A `nnicli.TrialJob` instance corresponding to `trial_job_id`. + """ + _check_endpoint(self._endpoint) + assert trial_job_id is not None + trial_job = _nni_rest_get(self._endpoint, os.path.join(TRIAL_JOBS_PATH, trial_job_id)) + return TrialJob(trial_job) + + def list_trial_jobs(self): + """ + Return information for all trial jobs as a list. + + Returns + ---------- + list + List of `nnicli.TrialJob`. + """ + _check_endpoint(self._endpoint) + trial_jobs = _nni_rest_get(self._endpoint, TRIAL_JOBS_PATH) + return [TrialJob(e) for e in trial_jobs] + + def get_job_statistics(self): + """ + Return trial job statistics information as a dict. + + Returns + ---------- + list + Job statistics information. + """ + _check_endpoint(self._endpoint) + return _nni_rest_get(self._endpoint, JOB_STATISTICS_PATH) + + def get_job_metrics(self, trial_job_id=None): + """ + Return trial job metrics. + + Parameters + ---------- + trial_job_id: str + trial job id. if this parameter is None, all trail jobs' metrics will be returned. + + Returns + ---------- + dict + Each key is a trialJobId, the corresponding value is a list of `nnicli.TrialMetricData`. + """ + _check_endpoint(self._endpoint) + api_path = METRICS_PATH if trial_job_id is None else os.path.join(METRICS_PATH, trial_job_id) + output = {} + trail_metrics = _nni_rest_get(self._endpoint, api_path) + for metric in trail_metrics: + trial_id = metric["trialJobId"] + if trial_id not in output: + output[trial_id] = [TrialMetricData(metric)] + else: + output[trial_id].append(TrialMetricData(metric)) + return output + + def export_data(self): + """ + Return exported information for all trial jobs. + + Returns + ---------- + list + List of `nnicli.TrialResult`. + """ + _check_endpoint(self._endpoint) + trial_results = _nni_rest_get(self._endpoint, EXPORT_DATA_PATH) + return [TrialResult(e) for e in trial_results] + + def get_experiment_profile(self): + """ + Return experiment profile as a dict. + + Returns + ---------- + dict + The profile of the experiment. + """ + _check_endpoint(self._endpoint) + return _nni_rest_get(self._endpoint, EXPERIMENT_PATH) diff --git a/test/config/integration_tests.yml b/test/config/integration_tests.yml index b3802239da..4078321479 100644 --- a/test/config/integration_tests.yml +++ b/test/config/integration_tests.yml @@ -140,8 +140,8 @@ testCases: config: maxTrialNum: 4 trialConcurrency: 4 - launchCommand: python3 -c 'import nnicli as nc; nc.start_nni("$configFile")' - stopCommand: python3 -c 'import nnicli as nc; nc.stop_nni()' + launchCommand: python3 -c 'from nnicli import Experiment; exp = Experiment(); exp.start_experiment("$configFile")' + stopCommand: python3 -c 'from nnicli import Experiment; exp = Experiment(); exp.connect_experiment("http://localhost:8080/"); exp.stop_experiment()' validator: class: NnicliValidator platform: linux darwin diff --git a/test/config/integration_tests_tf2.yml b/test/config/integration_tests_tf2.yml index e060511289..2002f36367 100644 --- a/test/config/integration_tests_tf2.yml +++ b/test/config/integration_tests_tf2.yml @@ -110,8 +110,8 @@ testCases: config: maxTrialNum: 4 trialConcurrency: 4 - launchCommand: python3 -c 'import nnicli as nc; nc.start_nni("$configFile")' - stopCommand: python3 -c 'import nnicli as nc; nc.stop_nni()' + launchCommand: python3 -c 'from nnicli import Experiment; exp = Experiment(); exp.start_experiment("$configFile")' + stopCommand: python3 -c 'from nnicli import Experiment; exp = Experiment(); exp.connect_experiment("http://localhost:8080/"); exp.stop_experiment()' validator: class: NnicliValidator platform: linux darwin diff --git a/test/config/pr_tests.yml b/test/config/pr_tests.yml index d49bf9d7ec..f82143b836 100644 --- a/test/config/pr_tests.yml +++ b/test/config/pr_tests.yml @@ -45,10 +45,10 @@ testCases: - name: nnicli configFile: test/config/examples/sklearn-regression.yml config: - maxTrialNum: 2 - trialConcurrency: 2 - launchCommand: python3 -c 'import nnicli as nc; nc.start_nni("$configFile")' - stopCommand: python3 -c 'import nnicli as nc; nc.stop_nni()' + maxTrialNum: 4 + trialConcurrency: 4 + launchCommand: python3 -c 'from nnicli import Experiment; exp = Experiment(); exp.start_experiment("$configFile")' + stopCommand: python3 -c 'from nnicli import Experiment; exp = Experiment(); exp.connect_experiment("http://localhost:8080/"); exp.stop_experiment()' validator: class: NnicliValidator platform: linux darwin diff --git a/test/nni_test/nnitest/validators.py b/test/nni_test/nnitest/validators.py index 1cdadb8669..5ad9090c18 100644 --- a/test/nni_test/nnitest/validators.py +++ b/test/nni_test/nnitest/validators.py @@ -6,7 +6,7 @@ import subprocess import json import requests -import nnicli as nc +from nnicli import Experiment from utils import METRICS_URL @@ -80,8 +80,8 @@ def get_metric_results(self, metrics): class NnicliValidator(ITValidator): def __call__(self, rest_endpoint, experiment_dir, nni_source_dir, **kwargs): print(rest_endpoint) - nc.set_endpoint(rest_endpoint) - #print(nc.version()) - print(nc.get_job_statistics()) - print(nc.get_experiment_status()) - print(nc.list_trial_jobs()) + exp = Experiment() + exp.connect_experiment(rest_endpoint) + print(exp.get_job_statistics()) + print(exp.get_experiment_status()) + print(exp.list_trial_jobs()) diff --git a/tools/nni_cmd/updater.py b/tools/nni_cmd/updater.py index 13ee679c49..c9991b8bab 100644 --- a/tools/nni_cmd/updater.py +++ b/tools/nni_cmd/updater.py @@ -14,7 +14,7 @@ def validate_digit(value, start, end): '''validate if a digit is valid''' if not str(value).isdigit() or int(value) < start or int(value) > end: - raise ValueError('%s must be a digit from %s to %s' % (value, start, end)) + raise ValueError('value (%s) must be a digit from %s to %s' % (value, start, end)) def validate_file(path): '''validate if a file exist'''