Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add nightly builds for docker images #2456

Merged
merged 3 commits into from
Sep 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions .github/workflows/docker-cd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@ name: Mars CD for DockerHub

on:
push:
branches:
- master
- main
- 'v*'
tags:
- '*'

Expand All @@ -27,5 +31,10 @@ jobs:
if [[ "$DOCKER_ORG" == "marsuploader" ]]; then
export DOCKER_ORG="marsproject"
fi
bash bin/kube-image-tool.sh -o "$DOCKER_ORG" -t "$GIT_TAG" build
docker push "$DOCKER_ORG/mars:$GIT_TAG"
if [[ -n "$GIT_TAG" ]]; then
export IMAGE_TAG="$GIT_TAG"
else
export IMAGE_TAG="nightly-$GIT_BRANCH"
fi
bash bin/kube-image-tool.sh -o "$DOCKER_ORG" -t "$IMAGE_TAG" build
docker push "$DOCKER_ORG/mars:$IMAGE_TAG"
2 changes: 2 additions & 0 deletions ci/reload-env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ if [[ "$GITHUB_REF" =~ ^"refs/tags/" ]]; then
export GITHUB_TAG_REF="$GITHUB_REF"
unset CYTHON_TRACE
export GIT_TAG=$(echo "$GITHUB_REF" | sed -e "s/refs\/tags\///g")
else
export GIT_BRANCH=$(echo "$GITHUB_REF" | sed -e "s/refs\/heads\///g")
fi

if [[ $UNAME == "mingw"* ]] || [[ $UNAME == "msys"* ]]; then
Expand Down
4 changes: 2 additions & 2 deletions docs/source/installation/kubernetes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,10 @@ Arguments for workers:
+====================+================================================================+
| worker_num | Number of workers in the cluster, 1 by default |
+--------------------+----------------------------------------------------------------+
| worker_cpu | Number of CPUs for every worker |
| worker_cpu | Number of CPUs for every worker, required. |
+--------------------+----------------------------------------------------------------+
| worker_mem | Memory size for workers in the cluster, in bytes or size units |
| | like ``1g`` |
| | like ``1g``, required. |
+--------------------+----------------------------------------------------------------+
| worker_spill_paths | List of spill paths for worker pods on hosts |
+--------------------+----------------------------------------------------------------+
Expand Down
4 changes: 2 additions & 2 deletions docs/source/installation/yarn.rst
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,10 @@ Arguments for workers:
+====================+================================================================+
| worker_num | Number of workers in the cluster, 1 by default |
+--------------------+----------------------------------------------------------------+
| worker_cpu | Number of CPUs for every worker |
| worker_cpu | Number of CPUs for every worker, required. |
+--------------------+----------------------------------------------------------------+
| worker_mem | Memory size for workers in the cluster, in bytes or size units |
| | like ``1g`` |
| | like ``1g``, required. |
+--------------------+----------------------------------------------------------------+
| worker_spill_paths | List of spill paths for worker pods on hosts |
+--------------------+----------------------------------------------------------------+
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: mars 0.5.0a2\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2021-08-04 17:13+0800\n"
"POT-Creation-Date: 2021-09-16 16:05+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language-Team: LANGUAGE <LL@li.org>\n"
Expand Down Expand Up @@ -169,16 +169,20 @@ msgid "worker_cpu"
msgstr ""

#: ../../source/installation/kubernetes.rst:107
msgid "Number of CPUs for every worker"
msgstr "每个 Worker 的 CPU 数目"
msgid "Number of CPUs for every worker, required."
msgstr "每个 Worker 的 CPU 数目,此参数为必需"

#: ../../source/installation/kubernetes.rst:109
msgid "worker_mem"
msgstr ""

#: ../../source/installation/kubernetes.rst:109
msgid "Memory size for workers in the cluster, in bytes or size units like ``1g``"
msgstr "每个 Worker 的内存大小,可使用字节数或带单位的大小,例如 ``1g``"
msgid ""
"Memory size for workers in the cluster, in bytes or size units like "
"``1g``, required."
msgstr ""
"每个 Worker 的内存大小,可使用字节数或带单位的大小,例如 ``1g``,此参数为"
"必需"

#: ../../source/installation/kubernetes.rst:112
msgid "worker_spill_paths"
Expand Down
14 changes: 9 additions & 5 deletions docs/source/locale/zh_CN/LC_MESSAGES/installation/yarn.po
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: mars 0.7.0a2\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2021-08-04 17:36+0800\n"
"POT-Creation-Date: 2021-09-16 16:05+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language-Team: LANGUAGE <LL@li.org>\n"
Expand Down Expand Up @@ -185,16 +185,20 @@ msgid "worker_cpu"
msgstr ""

#: ../../source/installation/yarn.rst:125
msgid "Number of CPUs for every worker"
msgstr "每个 Worker 的 CPU 数目"
msgid "Number of CPUs for every worker, required."
msgstr "每个 Worker 的 CPU 数目,此参数为必需"

#: ../../source/installation/yarn.rst:127
msgid "worker_mem"
msgstr ""

#: ../../source/installation/yarn.rst:127
msgid "Memory size for workers in the cluster, in bytes or size units like ``1g``"
msgstr "每个 Worker 的内存大小,可使用字节数或带单位的大小,例如 ``1g``"
msgid ""
"Memory size for workers in the cluster, in bytes or size units like "
"``1g``, required."
msgstr ""
"每个 Worker 的内存大小,可使用字节数或带单位的大小,例如 ``1g``,此参数为"
"必需"

#: ../../source/installation/yarn.rst:130
msgid "worker_spill_paths"
Expand Down
5 changes: 4 additions & 1 deletion mars/deploy/kubernetes/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ class KubernetesCluster:
_default_web_port = 7104

def __init__(self, kube_api_client=None, image=None, namespace=None,
supervisor_num=1, supervisor_cpu=None, supervisor_mem=None,
supervisor_num=1, supervisor_cpu=1, supervisor_mem='4G',
supervisor_mem_limit_ratio=None,
worker_num=1, worker_cpu=None, worker_mem=None,
worker_spill_paths=None, worker_cache_mem=None, min_worker_num=None,
Expand All @@ -83,6 +83,9 @@ def __init__(self, kube_api_client=None, image=None, namespace=None,
timeout=None, **kwargs):
from kubernetes import client as kube_client

if worker_cpu is None or worker_mem is None: # pragma: no cover
raise TypeError('`worker_cpu` and `worker_mem` must be specified')

self._api_client = kube_api_client
self._core_api = kube_client.CoreV1Api(kube_api_client)

Expand Down
2 changes: 1 addition & 1 deletion mars/deploy/kubernetes/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,7 @@ def __init__(self, *args, **kwargs):
else:
size_limit = None

if mount_shm and size_limit:
if mount_shm:
self.add_volume(EmptyDirVolumeConfig(
'mars-shared', '/dev/shm', size_limit=size_limit
))
Expand Down
2 changes: 1 addition & 1 deletion mars/deploy/kubernetes/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def __init__(self, node_role=None, pool_address=None,
self._pool_address = pool_address
self._k8s_config = k8s_config

verify_ssl = bool(int(os.environ.get('KUBE_VERIFY_SSL', '1').strip('"')))
verify_ssl = bool(int(os.environ.get('KUBE_VERIFY_SSL', '1')))
if not verify_ssl:
c = client.Configuration()
c.verify_ssl = False
Expand Down
5 changes: 4 additions & 1 deletion mars/deploy/kubernetes/tests/test_kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,8 @@ def _start_kube_cluster(use_test_docker_file=True, **kwargs):
@pytest.mark.skipif(not kube_available, reason='Cannot run without kubernetes')
def test_run_in_kubernetes(use_test_docker_file):
with _start_kube_cluster(
worker_mem='1G', worker_cache_mem='128m',
supervisor_cpu=0.5, supervisor_mem='1G',
worker_cpu=0.5, worker_mem='1G', worker_cache_mem='64m',
extra_labels={'mars-test/group': 'test-label-name'},
extra_env={'MARS_K8S_GROUP_LABELS': 'mars-test/group'},
use_test_docker_file=use_test_docker_file):
Expand All @@ -186,6 +187,8 @@ def test_create_timeout():
extra_vol_config = HostPathVolumeConfig('mars-src-path', '/mnt/mars', MARS_ROOT)
with pytest.raises(TimeoutError):
cluster = new_cluster(api_client, image='pseudo_image',
supervisor_cpu=0.5, supervisor_mem='1G',
worker_cpu=0.5, worker_mem='1G',
extra_volumes=[extra_vol_config], timeout=1)
finally:
if cluster:
Expand Down
3 changes: 3 additions & 0 deletions mars/deploy/yarn/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ def _override_envs(src, updates):
ret.update(updates)
return ret

if worker_cpu is None or worker_mem is None: # pragma: no cover
raise TypeError('`worker_cpu` and `worker_mem` must be specified')

app_name = app_name or f'mars-app-{uuid.uuid4()}'
supervisor_mem = calc_size_by_str(supervisor_mem, None)
worker_mem = calc_size_by_str(worker_mem, None)
Expand Down
7 changes: 5 additions & 2 deletions mars/deploy/yarn/tests/test_yarn.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ def _run_yarn_test_with_env(env_path, timeout):
cmd_tmpl = '"{executable}" -m coverage run --source=%s/mars --rcfile=%s/setup.cfg' \
% (MARS_ROOT, MARS_ROOT)
extra_env = {'COVERAGE_FILE': coverage_result, 'COVERAGE_PROCESS_START': f'{MARS_ROOT}/setup.cfg'}
cluster = new_cluster(env_path, timeout=timeout, extra_env=extra_env, log_config=log_config_file,
cluster = new_cluster(env_path, timeout=timeout, worker_cpu=1, worker_mem='1G',
extra_env=extra_env, log_config=log_config_file,
extra_args=f'--config-file {MARS_ROOT}/mars/deploy/yarn/tests/test_yarn_config.yml',
log_when_fail=True, cmd_tmpl=cmd_tmpl)
assert cluster.endpoint is not None
Expand Down Expand Up @@ -139,7 +140,9 @@ def test_create_timeout():
log_config_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'yarn-logging.conf')

with pytest.raises(TimeoutError):
cluster = new_cluster(env_path, log_config=log_config_file, worker_cache_mem='64m',
cluster = new_cluster(env_path, log_config=log_config_file,
worker_cpu=1, worker_mem='1G',
worker_cache_mem='64m',
log_when_fail=True, timeout=1)
finally:
if cluster is not None:
Expand Down
29 changes: 16 additions & 13 deletions mars/resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# limitations under the License.

import logging
import math
import os
import subprocess # nosec
import sys
Expand All @@ -24,34 +25,36 @@
import psutil

from .lib import nvutils
from .utils import get_bool_environ

logger = logging.getLogger(__name__)

CGROUP_CPU_STAT_FILE = '/sys/fs/cgroup/cpuacct/cpuacct.stat'
CGROUP_CPU_STAT_FILE = '/sys/fs/cgroup/cpuacct/cpuacct.usage'
CGROUP_MEM_STAT_FILE = '/sys/fs/cgroup/memory/memory.stat'

_proc = psutil.Process()
_timer = getattr(time, 'monotonic', time.time)

_cpu_use_process_stat = bool(int(os.environ.get('MARS_CPU_USE_PROCESS_STAT', '0').strip('"')))
_cpu_use_cgroup_stat = bool(int(os.environ.get('MARS_CPU_USE_CGROUP_STAT', '0').strip('"')))
_mem_use_process_stat = bool(int(os.environ.get('MARS_MEM_USE_PROCESS_STAT', '0').strip('"')))
_mem_use_cgroup_stat = bool(int(os.environ.get('MARS_MEM_USE_CGROUP_STAT', '0').strip('"')))
_use_process_stat = get_bool_environ('MARS_USE_PROCESS_STAT')
_use_cgroup_stat = get_bool_environ('MARS_USE_CGROUP_STAT')
_cpu_use_process_stat = get_bool_environ('MARS_CPU_USE_PROCESS_STAT')
_cpu_use_cgroup_stat = get_bool_environ('MARS_CPU_USE_CGROUP_STAT')
_mem_use_process_stat = get_bool_environ('MARS_MEM_USE_PROCESS_STAT')
_mem_use_cgroup_stat = get_bool_environ('MARS_MEM_USE_CGROUP_STAT')

if 'MARS_USE_PROCESS_STAT' in os.environ:
_cpu_use_process_stat = _mem_use_process_stat = \
bool(int(os.environ['MARS_USE_PROCESS_STAT'].strip('"')))
if 'MARS_USE_CGROUP_STAT' in os.environ:
_cpu_use_cgroup_stat = _mem_use_cgroup_stat = \
bool(int(os.environ['MARS_USE_CGROUP_STAT'].strip('"')))
# if general config exists, overwrite individual ones
if _use_process_stat is not None:
_cpu_use_process_stat = _mem_use_process_stat = _use_process_stat
if _use_cgroup_stat is not None:
_cpu_use_cgroup_stat = _mem_use_cgroup_stat = _use_cgroup_stat

if 'MARS_CPU_TOTAL' in os.environ:
_cpu_total = int(os.environ['MARS_CPU_TOTAL'].strip('"'))
_cpu_total = int(math.ceil(float(os.environ['MARS_CPU_TOTAL'])))
else:
_cpu_total = psutil.cpu_count(logical=True)

if 'MARS_MEMORY_TOTAL' in os.environ:
_mem_total = int(os.environ['MARS_MEMORY_TOTAL'].strip('"'))
_mem_total = int(os.environ['MARS_MEMORY_TOTAL'])
else:
_mem_total = None

Expand Down
7 changes: 7 additions & 0 deletions mars/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,13 @@ def __getattr__(self, item):
f"'AttributeDict' object has no attribute {item}")


def get_bool_environ(var_name: str) -> Optional[bool]:
    """Read an integer-encoded boolean flag from the environment.

    Parameters
    ----------
    var_name : str
        Name of the environment variable to read.

    Returns
    -------
    Optional[bool]
        ``None`` when the variable is unset or empty (including a value
        that is only whitespace or quotes); otherwise the truthiness of
        the value parsed as an integer (``"0"`` -> ``False``, any other
        integer -> ``True``).

    Raises
    ------
    ValueError
        If the value, after stripping, is not a valid integer literal.
    """
    var_value = os.environ.get(var_name)
    if not var_value:
        return None
    # Values injected through deployment templates may arrive wrapped in
    # double quotes (e.g. '"1"'); tolerate them instead of failing in int().
    var_value = var_value.strip().strip('"').strip()
    if not var_value:
        return None
    return bool(int(var_value))


def on_serialize_shape(shape: Tuple[int]):
if shape:
return tuple(s if not np.isnan(s) else -1 for s in shape)
Expand Down