Skip to content

Commit

Permalink
Add nightly builds for docker images (mars-project#2456)
Browse files Browse the repository at this point in the history
  • Loading branch information
wjsi committed Sep 16, 2021
1 parent f8bf144 commit 64c9559
Show file tree
Hide file tree
Showing 14 changed files with 76 additions and 35 deletions.
13 changes: 11 additions & 2 deletions .github/workflows/docker-cd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@ name: Mars CD for DockerHub

on:
push:
branches:
- master
- main
- 'v*'
tags:
- '*'

Expand All @@ -27,5 +31,10 @@ jobs:
if [[ "$DOCKER_ORG" == "marsuploader" ]]; then
export DOCKER_ORG="marsproject"
fi
bash bin/kube-image-tool.sh -o "$DOCKER_ORG" -t "$GIT_TAG" build
docker push "$DOCKER_ORG/mars:$GIT_TAG"
if [[ -n "$GIT_TAG" ]]; then
export IMAGE_TAG="$GIT_TAG"
else
export IMAGE_TAG="nightly-$GIT_BRANCH"
fi
bash bin/kube-image-tool.sh -o "$DOCKER_ORG" -t "$IMAGE_TAG" build
docker push "$DOCKER_ORG/mars:$IMAGE_TAG"
2 changes: 2 additions & 0 deletions ci/reload-env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ if [[ "$GITHUB_REF" =~ ^"refs/tags/" ]]; then
export GITHUB_TAG_REF="$GITHUB_REF"
unset CYTHON_TRACE
export GIT_TAG=$(echo "$GITHUB_REF" | sed -e "s/refs\/tags\///g")
else
export GIT_BRANCH=$(echo "$GITHUB_REF" | sed -e "s/refs\/heads\///g")
fi

if [[ $UNAME == "mingw"* ]] || [[ $UNAME == "msys"* ]]; then
Expand Down
4 changes: 2 additions & 2 deletions docs/source/installation/kubernetes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,10 @@ Arguments for workers:
+====================+================================================================+
| worker_num | Number of workers in the cluster, 1 by default |
+--------------------+----------------------------------------------------------------+
| worker_cpu | Number of CPUs for every worker |
| worker_cpu | Number of CPUs for every worker, required. |
+--------------------+----------------------------------------------------------------+
| worker_mem | Memory size for workers in the cluster, in bytes or size units |
| | like ``1g`` |
| | like ``1g``, required. |
+--------------------+----------------------------------------------------------------+
| worker_spill_paths | List of spill paths for worker pods on hosts |
+--------------------+----------------------------------------------------------------+
Expand Down
4 changes: 2 additions & 2 deletions docs/source/installation/yarn.rst
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,10 @@ Arguments for workers:
+====================+================================================================+
| worker_num | Number of workers in the cluster, 1 by default |
+--------------------+----------------------------------------------------------------+
| worker_cpu | Number of CPUs for every worker |
| worker_cpu | Number of CPUs for every worker, required. |
+--------------------+----------------------------------------------------------------+
| worker_mem | Memory size for workers in the cluster, in bytes or size units |
| | like ``1g`` |
| | like ``1g``, required. |
+--------------------+----------------------------------------------------------------+
| worker_spill_paths | List of spill paths for worker pods on hosts |
+--------------------+----------------------------------------------------------------+
Expand Down
14 changes: 9 additions & 5 deletions docs/source/locale/zh_CN/LC_MESSAGES/installation/kubernetes.po
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: mars 0.5.0a2\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2021-08-04 17:13+0800\n"
"POT-Creation-Date: 2021-09-16 16:05+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language-Team: LANGUAGE <LL@li.org>\n"
Expand Down Expand Up @@ -169,16 +169,20 @@ msgid "worker_cpu"
msgstr ""

#: ../../source/installation/kubernetes.rst:107
msgid "Number of CPUs for every worker"
msgstr "每个 Worker 的 CPU 数目"
msgid "Number of CPUs for every worker, required."
msgstr "每个 Worker 的 CPU 数目,此参数为必需"

#: ../../source/installation/kubernetes.rst:109
msgid "worker_mem"
msgstr ""

#: ../../source/installation/kubernetes.rst:109
msgid "Memory size for workers in the cluster, in bytes or size units like ``1g``"
msgstr "每个 Worker 的内存大小,可使用字节数或带单位的大小,例如 ``1g``"
msgid ""
"Memory size for workers in the cluster, in bytes or size units like "
"``1g``, required."
msgstr ""
"每个 Worker 的内存大小,可使用字节数或带单位的大小,例如 ``1g``,此参数为"
"必需"

#: ../../source/installation/kubernetes.rst:112
msgid "worker_spill_paths"
Expand Down
14 changes: 9 additions & 5 deletions docs/source/locale/zh_CN/LC_MESSAGES/installation/yarn.po
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: mars 0.7.0a2\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2021-08-04 17:36+0800\n"
"POT-Creation-Date: 2021-09-16 16:05+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language-Team: LANGUAGE <LL@li.org>\n"
Expand Down Expand Up @@ -185,16 +185,20 @@ msgid "worker_cpu"
msgstr ""

#: ../../source/installation/yarn.rst:125
msgid "Number of CPUs for every worker"
msgstr "每个 Worker 的 CPU 数目"
msgid "Number of CPUs for every worker, required."
msgstr "每个 Worker 的 CPU 数目,此参数为必需"

#: ../../source/installation/yarn.rst:127
msgid "worker_mem"
msgstr ""

#: ../../source/installation/yarn.rst:127
msgid "Memory size for workers in the cluster, in bytes or size units like ``1g``"
msgstr "每个 Worker 的内存大小,可使用字节数或带单位的大小,例如 ``1g``"
msgid ""
"Memory size for workers in the cluster, in bytes or size units like "
"``1g``, required."
msgstr ""
"每个 Worker 的内存大小,可使用字节数或带单位的大小,例如 ``1g``,此参数为"
"必需"

#: ../../source/installation/yarn.rst:130
msgid "worker_spill_paths"
Expand Down
5 changes: 4 additions & 1 deletion mars/deploy/kubernetes/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ class KubernetesCluster:
_default_web_port = 7104

def __init__(self, kube_api_client=None, image=None, namespace=None,
supervisor_num=1, supervisor_cpu=None, supervisor_mem=None,
supervisor_num=1, supervisor_cpu=1, supervisor_mem='4G',
supervisor_mem_limit_ratio=None,
worker_num=1, worker_cpu=None, worker_mem=None,
worker_spill_paths=None, worker_cache_mem=None, min_worker_num=None,
Expand All @@ -83,6 +83,9 @@ def __init__(self, kube_api_client=None, image=None, namespace=None,
timeout=None, **kwargs):
from kubernetes import client as kube_client

if worker_cpu is None or worker_mem is None: # pragma: no cover
raise TypeError('`worker_cpu` and `worker_mem` must be specified')

self._api_client = kube_api_client
self._core_api = kube_client.CoreV1Api(kube_api_client)

Expand Down
2 changes: 1 addition & 1 deletion mars/deploy/kubernetes/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,7 @@ def __init__(self, *args, **kwargs):
else:
size_limit = None

if mount_shm and size_limit:
if mount_shm:
self.add_volume(EmptyDirVolumeConfig(
'mars-shared', '/dev/shm', size_limit=size_limit
))
Expand Down
2 changes: 1 addition & 1 deletion mars/deploy/kubernetes/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def __init__(self, node_role=None, pool_address=None,
self._pool_address = pool_address
self._k8s_config = k8s_config

verify_ssl = bool(int(os.environ.get('KUBE_VERIFY_SSL', '1').strip('"')))
verify_ssl = bool(int(os.environ.get('KUBE_VERIFY_SSL', '1')))
if not verify_ssl:
c = client.Configuration()
c.verify_ssl = False
Expand Down
5 changes: 4 additions & 1 deletion mars/deploy/kubernetes/tests/test_kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,8 @@ def _start_kube_cluster(use_test_docker_file=True, **kwargs):
@pytest.mark.skipif(not kube_available, reason='Cannot run without kubernetes')
def test_run_in_kubernetes(use_test_docker_file):
with _start_kube_cluster(
worker_mem='1G', worker_cache_mem='128m',
supervisor_cpu=0.5, supervisor_mem='1G',
worker_cpu=0.5, worker_mem='1G', worker_cache_mem='64m',
extra_labels={'mars-test/group': 'test-label-name'},
extra_env={'MARS_K8S_GROUP_LABELS': 'mars-test/group'},
use_test_docker_file=use_test_docker_file):
Expand All @@ -186,6 +187,8 @@ def test_create_timeout():
extra_vol_config = HostPathVolumeConfig('mars-src-path', '/mnt/mars', MARS_ROOT)
with pytest.raises(TimeoutError):
cluster = new_cluster(api_client, image='pseudo_image',
supervisor_cpu=0.5, supervisor_mem='1G',
worker_cpu=0.5, worker_mem='1G',
extra_volumes=[extra_vol_config], timeout=1)
finally:
if cluster:
Expand Down
3 changes: 3 additions & 0 deletions mars/deploy/yarn/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ def _override_envs(src, updates):
ret.update(updates)
return ret

if worker_cpu is None or worker_mem is None: # pragma: no cover
raise TypeError('`worker_cpu` and `worker_mem` must be specified')

app_name = app_name or f'mars-app-{uuid.uuid4()}'
supervisor_mem = calc_size_by_str(supervisor_mem, None)
worker_mem = calc_size_by_str(worker_mem, None)
Expand Down
7 changes: 5 additions & 2 deletions mars/deploy/yarn/tests/test_yarn.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ def _run_yarn_test_with_env(env_path, timeout):
cmd_tmpl = '"{executable}" -m coverage run --source=%s/mars --rcfile=%s/setup.cfg' \
% (MARS_ROOT, MARS_ROOT)
extra_env = {'COVERAGE_FILE': coverage_result, 'COVERAGE_PROCESS_START': f'{MARS_ROOT}/setup.cfg'}
cluster = new_cluster(env_path, timeout=timeout, extra_env=extra_env, log_config=log_config_file,
cluster = new_cluster(env_path, timeout=timeout, worker_cpu=1, worker_mem='1G',
extra_env=extra_env, log_config=log_config_file,
extra_args=f'--config-file {MARS_ROOT}/mars/deploy/yarn/tests/test_yarn_config.yml',
log_when_fail=True, cmd_tmpl=cmd_tmpl)
assert cluster.endpoint is not None
Expand Down Expand Up @@ -139,7 +140,9 @@ def test_create_timeout():
log_config_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'yarn-logging.conf')

with pytest.raises(TimeoutError):
cluster = new_cluster(env_path, log_config=log_config_file, worker_cache_mem='64m',
cluster = new_cluster(env_path, log_config=log_config_file,
worker_cpu=1, worker_mem='1G',
worker_cache_mem='64m',
log_when_fail=True, timeout=1)
finally:
if cluster is not None:
Expand Down
29 changes: 16 additions & 13 deletions mars/resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# limitations under the License.

import logging
import math
import os
import subprocess # nosec
import sys
Expand All @@ -24,34 +25,36 @@
import psutil

from .lib import nvutils
from .utils import get_bool_environ

logger = logging.getLogger(__name__)

CGROUP_CPU_STAT_FILE = '/sys/fs/cgroup/cpuacct/cpuacct.stat'
CGROUP_CPU_STAT_FILE = '/sys/fs/cgroup/cpuacct/cpuacct.usage'
CGROUP_MEM_STAT_FILE = '/sys/fs/cgroup/memory/memory.stat'

_proc = psutil.Process()
_timer = getattr(time, 'monotonic', time.time)

_cpu_use_process_stat = bool(int(os.environ.get('MARS_CPU_USE_PROCESS_STAT', '0').strip('"')))
_cpu_use_cgroup_stat = bool(int(os.environ.get('MARS_CPU_USE_CGROUP_STAT', '0').strip('"')))
_mem_use_process_stat = bool(int(os.environ.get('MARS_MEM_USE_PROCESS_STAT', '0').strip('"')))
_mem_use_cgroup_stat = bool(int(os.environ.get('MARS_MEM_USE_CGROUP_STAT', '0').strip('"')))
_use_process_stat = get_bool_environ('MARS_USE_PROCESS_STAT')
_use_cgroup_stat = get_bool_environ('MARS_USE_CGROUP_STAT')
_cpu_use_process_stat = get_bool_environ('MARS_CPU_USE_PROCESS_STAT')
_cpu_use_cgroup_stat = get_bool_environ('MARS_CPU_USE_CGROUP_STAT')
_mem_use_process_stat = get_bool_environ('MARS_MEM_USE_PROCESS_STAT')
_mem_use_cgroup_stat = get_bool_environ('MARS_MEM_USE_CGROUP_STAT')

if 'MARS_USE_PROCESS_STAT' in os.environ:
_cpu_use_process_stat = _mem_use_process_stat = \
bool(int(os.environ['MARS_USE_PROCESS_STAT'].strip('"')))
if 'MARS_USE_CGROUP_STAT' in os.environ:
_cpu_use_cgroup_stat = _mem_use_cgroup_stat = \
bool(int(os.environ['MARS_USE_CGROUP_STAT'].strip('"')))
# if general config exists, overwrite individual ones
if _use_process_stat is not None:
_cpu_use_process_stat = _mem_use_process_stat = _use_process_stat
if _use_cgroup_stat is not None:
_cpu_use_cgroup_stat = _mem_use_cgroup_stat = _use_cgroup_stat

if 'MARS_CPU_TOTAL' in os.environ:
_cpu_total = int(os.environ['MARS_CPU_TOTAL'].strip('"'))
_cpu_total = int(math.ceil(float(os.environ['MARS_CPU_TOTAL'])))
else:
_cpu_total = psutil.cpu_count(logical=True)

if 'MARS_MEMORY_TOTAL' in os.environ:
_mem_total = int(os.environ['MARS_MEMORY_TOTAL'].strip('"'))
_mem_total = int(os.environ['MARS_MEMORY_TOTAL'])
else:
_mem_total = None

Expand Down
7 changes: 7 additions & 0 deletions mars/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,13 @@ def __getattr__(self, item):
f"'AttributeDict' object has no attribute {item}")


def get_bool_environ(var_name: str) -> Optional[bool]:
    """Read an environment variable as an integer-encoded boolean flag.

    Parameters
    ----------
    var_name : str
        Name of the environment variable to look up in ``os.environ``.

    Returns
    -------
    Optional[bool]
        ``None`` when the variable is unset or set to an empty string;
        otherwise ``True`` if its integer value is non-zero and ``False``
        if it is zero.

    Raises
    ------
    ValueError
        If the variable is set to text that ``int()`` cannot parse.
    """
    raw = os.environ.get(var_name)
    # An unset variable and an empty one are both treated as "not configured",
    # which lets callers distinguish "no setting" from an explicit 0 / 1.
    if raw is None or raw == '':
        return None
    return int(raw) != 0


def on_serialize_shape(shape: Tuple[int]):
if shape:
return tuple(s if not np.isnan(s) else -1 for s in shape)
Expand Down

0 comments on commit 64c9559

Please sign in to comment.