Device memory spill support #35
Closed

Changes from all commits (47 commits)
1aa68a8  Add support for yaml configurations (pentschev)
7349974  Add DeviceHostFile class to handle memory-spilling in LRU fashion (pentschev)
377368a  Add CUDAWorker class to handle device memory (pentschev)
197e653  Use CUDAWorker by default, add --device-memory-limit parameter (pentschev)
de5c3e9  Rename configurations dask-cuda -> cuda (pentschev)
42c0686  Use CUDAWorker on LocalCUDACluster (pentschev)
c92bbc9  Add scheduler_ip argument to CUDAWorker (pentschev)
ce4ba37  Add test_device_spill (pentschev)
b3cbf2c  Add assert_device_host_file_size in utils_test module (pentschev)
855876c  Temporarily build with dask-distributed master (pentschev)
07fccfa  Install CuPy for CI builds (pentschev)
6d48805  Fix DeviceHostFile transfer between host/device dicts (pentschev)
1dd2c16  Add DeviceHostFile test (pentschev)
df6cb95  Update numpy and numba requirements (pentschev)
5a7ceef  Use Dask master and enable __array_function__ for CI builds (pentschev)
8659fb6  Fixes for GPU CI (pentschev)
2c24f03  Fix test_spill, extend it to multiple parameters (pentschev)
1d06c75  Minor device_host_file and utils_test fixes (pentschev)
deff18f  Add CUDANanny, wraps Nanny but defaults to CUDAWorker (pentschev)
2d9c150  Fix GPU indices for test_cuda_visible_devices (pentschev)
ce5c650  Add single-GPU test for visible devices (pentschev)
c5fbb6f  Improve documentation for CUDANanny, DeviceHostFile (pentschev)
dcc6a6a  Fix defaults for device_memory_limit (pentschev)
9bc21e7  Fix test_with_subset_of_cuda_visible_devices (pentschev)
9eb2dfd  Pass *args to CUDANanny and CUDAWorker (pentschev)
4324439  Increase required dask, distributed versions (pentschev)
388c677  Use yaml.safe_load (mrocklin)
c942438  Fix CUDA worker not pausing issue, print correct resume message (pentschev)
d968b0f  Support protocol= keyword in LocalCUDACluster (#39) (mrocklin)
1abb1eb  Merge branch 'branch-0.7' into device-memory-spill (mrocklin)
eb70191  Merge branch 'device-memory-spill' of github.com:pentschev/dask-cuda … (mrocklin)
806ac8b  Minor style changes (mrocklin)
f35826e  Change config namespace from cuda to distributed (mrocklin)
f308943  Rename files to remove cuda suffix (mrocklin)
1b9d38b  Fix style (pentschev)
6d5b714  Add device, host, disk LRU aliases to simplify user queries (pentschev)
358f194  Default pausing due to device memory usage to 0.0 (disable) (pentschev)
c81f763  Small doc fix in dask_cuda_worker.py (mrocklin)
f28cbd1  Avoid modifying DeviceHostFile internals in test_device_host_file (pentschev)
6163359  Merge remote-tracking branch 'origin/device-memory-spill' into device… (pentschev)
a3c89fb  Allow choosing worker-class when starting dask-cuda-worker (pentschev)
8f59141  Add back original test_cuda_visible_devices, using Worker class (pentschev)
1e3acce  Aliasing device, host and disk to dict instead of LRU (pentschev)
882bf31  Remove utils_test, use simple naming instead of gen_random_key() (pentschev)
1960b79  Use tmp_path instead of tempfile, fix flake8/black issues (pentschev)
eb86d5d  Add back 'loop' to test_cuda_visible_devices, despite flake8 complaints (pentschev)
8153b27  Add missing pause argument to test_spill (pentschev)
@@ -1 +1,2 @@
from .local_cuda_cluster import LocalCUDACluster
from . import config
@@ -0,0 +1,14 @@
import yaml
import os

import dask

config = dask.config.config


fn = os.path.join(os.path.dirname(__file__), "cuda.yaml")
dask.config.ensure_file(source=fn)
with open(fn) as f:
    dask_cuda_defaults = yaml.safe_load(f)

dask.config.update_defaults(dask_cuda_defaults)
@@ -0,0 +1,8 @@
distributed:
  worker:
    # Fractions of device memory at which we take action to avoid memory blowup
    # Set any of the lower three values to False to turn off the behavior entirely
    device-memory:
      target: 0.60  # target fraction to stay below
      spill: 0.70   # fraction at which we spill to host
      pause: 0.0    # fraction at which we pause worker threads; disabled (0.0) by default to prevent the worker from stalling when third-party memory managers are used and can't be disabled
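Once the package registers these defaults (via the config module shown earlier), the values can be read back through dask's configuration system. A minimal sketch, assuming the keys are addressed exactly as spelled in the YAML above:

import dask
import dask_cuda  # importing the package loads and registers cuda.yaml

# Fractions of device memory at which spilling from device to host is managed
target = dask.config.get("distributed.worker.device-memory.target")
spill = dask.config.get("distributed.worker.device-memory.spill")
print(target, spill)  # 0.6 0.7 unless overridden by user configuration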
@@ -0,0 +1,104 @@
from zict import Buffer, File, Func
from zict.common import ZictBase
from distributed.protocol import deserialize_bytes, serialize_bytes
from distributed.worker import weight

from functools import partial
import os


def _is_device_object(obj):
    """
    Check if obj is a device object, by checking if it has a
    __cuda_array_interface__ attribute
    """
    return hasattr(obj, "__cuda_array_interface__")


def _serialize_if_device(obj):
    """ Serialize an object if it's a device object """
    if _is_device_object(obj):
        return serialize_bytes(obj, on_error="raise")
    else:
        return obj


def _deserialize_if_device(obj):
    """ Deserialize an object if it's an instance of bytes """
    if isinstance(obj, bytes):
        return deserialize_bytes(obj)
    else:
        return obj


class DeviceHostFile(ZictBase):
    """ Manages serialization/deserialization of objects.

    Three LRU cache levels are controlled, for device, host and disk.
    Each level takes care of serializing objects once its limit has been
    reached and passing them to the subsequent level. Similarly, each cache
    may deserialize an object, storing it back in the appropriate cache,
    depending on the type of object being deserialized.

    Parameters
    ----------
    device_memory_limit: int
        Number of bytes of CUDA device memory for device LRU cache,
        spills to host cache once filled.
    memory_limit: int
        Number of bytes of host memory for host LRU cache, spills to
        disk once filled.
    local_dir: path
        Path where to store serialized objects on disk
    """

    def __init__(
        self, device_memory_limit=None, memory_limit=None, local_dir="dask-worker-space"
    ):
        path = os.path.join(local_dir, "storage")

        self.host_func = dict()
        self.disk_func = Func(
            partial(serialize_bytes, on_error="raise"), deserialize_bytes, File(path)
        )
        self.host_buffer = Buffer(
            self.host_func, self.disk_func, memory_limit, weight=weight
        )

        self.device_func = dict()
        self.device_host_func = Func(
            _serialize_if_device, _deserialize_if_device, self.host_buffer
        )
        self.device_buffer = Buffer(
            self.device_func, self.device_host_func, device_memory_limit, weight=weight
        )

        self.device = self.device_buffer.fast.d
        self.host = self.host_buffer.fast.d
        self.disk = self.host_buffer.slow.d

    def __setitem__(self, key, value):
        if _is_device_object(value):
            self.device_buffer[key] = value
        else:
            self.host_buffer[key] = value

    def __getitem__(self, key):
        if key in self.host_buffer:
            obj = self.host_buffer[key]
            del self.host_buffer[key]
            self.device_buffer[key] = _deserialize_if_device(obj)

        if key in self.device_buffer:
            return self.device_buffer[key]
        else:
            raise KeyError

    def __len__(self):
        return len(self.device_buffer)

    def __iter__(self):
        return iter(self.device_buffer)

    def __delitem__(self, i):
        del self.device_buffer[i]
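For illustration, a small usage sketch of the class above; the module path and limits are assumptions, and CuPy is only one example of an object exposing __cuda_array_interface__:

import numpy as np
from dask_cuda.device_host_file import DeviceHostFile  # module path assumed

dhf = DeviceHostFile(
    device_memory_limit=2e9,  # bytes of device memory before spilling to host
    memory_limit=8e9,         # bytes of host memory before spilling to disk
    local_dir="dask-worker-space",
)

dhf["x"] = np.arange(10)  # no __cuda_array_interface__, so it goes to the host buffer
# import cupy
# dhf["y"] = cupy.arange(10)  # has __cuda_array_interface__, so it goes to the device buffer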
@@ -0,0 +1,14 @@
from distributed import Nanny

from .worker import CUDAWorker


class CUDANanny(Nanny):
    """ A process to manage CUDAWorker processes

    This is a subclass of Nanny, with the only difference
    being worker_class=CUDAWorker.
    """

    def __init__(self, *args, worker_class=CUDAWorker, **kwargs):
        Nanny.__init__(self, *args, worker_class=worker_class, **kwargs)
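As the docstring says, the class only changes the default worker_class, so the two constructions below are equivalent; the scheduler address is a placeholder and the import paths are assumed:

from distributed import Nanny
from dask_cuda.nanny import CUDANanny
from dask_cuda.worker import CUDAWorker

n1 = CUDANanny("tcp://127.0.0.1:8786")
n2 = Nanny("tcp://127.0.0.1:8786", worker_class=CUDAWorker)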
I would prefer to avoid this until someone asks for it if possible. I'm somewhat against an API like this because it forces us to enumerate the possible options in code. If we were to do something like this I think that we would probably provide the full namespace of the class and then try to import it.
However there is enough uncertainty here that, for maintenance reasons, I'd prefer that we not promise anything until someone is asking us for this.
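A minimal sketch of the "provide the full namespace of the class and then try to import it" approach mentioned above; the helper name and example paths are purely illustrative:

import importlib

def import_worker_class(path):
    # e.g. "distributed.Worker" or a user-provided "mypackage.MyWorker"
    module_name, _, class_name = path.rpartition(".")
    return getattr(importlib.import_module(module_name), class_name)

worker_cls = import_worker_class("distributed.Worker")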
For context, the worker_class keyword exists because different groups have their own custom Worker classes that they use. Exposing a keyword like this but making no move to allow them to plug in seems against the spirit of the keyword.
That's not true. We solely disabled pausing the worker; to control spilling from the device, we still need to monitor the device memory. And this is why I needed to subclass it.

Unfortunately, this is necessary for us to reenable the old CUDA_VISIBLE_DEVICES test. Using CUDAWorker, which was hardcoded before, prevents us from launching a process on a single-GPU machine to mock test as if it had several and check for the ordering of GPUs of each process.
Why do we need to monitor device memory externally from the use of sizeof? If objects reliably tell us how much device memory they take up, then we may not need to track device memory separately.

Dask workers operated this way for a long time before we finally needed to give in and have a separate periodic callback that tracked system memory. I'm inclined to try the simpler approach first and see if it breaks down or not.
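One way objects can reliably report how much device memory they take up is dask's sizeof dispatch; a hedged sketch registering a handler for CuPy arrays (CuPy is only used here as an example of a device object):

from dask.sizeof import sizeof

@sizeof.register_lazy("cupy")
def register_cupy():
    import cupy

    @sizeof.register(cupy.ndarray)
    def sizeof_cupy_ndarray(x):
        # bytes of device memory held by the array
        return int(x.nbytes)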
Sure, they do track how much memory they take. However, tracking the device memory lets us decide when it's time to spill memory. Isn't that what memory_target_fraction and memory_spill_fraction (prepend device_ for the cases in this PR) are for?

The block https://github.com/rapidsai/dask-cuda/pull/35/files#diff-a77f0c6f19d8d34d59aede5e31455719R282 controls the spilling, and this is why we needed to subclass Worker.
Yes, something like that diff. You'll also want to add the data= keyword to LocalCUDACluster.
Thanks for the explanation on the memory monitor.

Yes, and I also want to create the object before. :)

But ok, I can probably have it quickly done by tomorrow. There are a few more things that need to be ported to allow it to work (like finding out how much memory the device has in total), and also some test(s), which shouldn't be too difficult now that there's already one that works with the monitoring mechanism and I can base it on that.
It's actually valid to pass just the class. Dask will construct it. I think that this is explained in the Worker docstring. This is better because you're using a Nanny and don't want to pass it through a process boundary.

I recommend that we start with just using the full memory or a config value by default and not mess with any user inputs (which will get messy).
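A sketch of the pattern described above, where either a ready-made mapping or a class can be passed and the class is only constructed on the worker side of the process boundary; the helper name is hypothetical:

def materialize_data(data, **kwargs):
    # `data` may be an already-built MutableMapping, or a class such as
    # DeviceHostFile; a class is instantiated here, inside the worker process.
    if isinstance(data, type):
        return data(**kwargs)
    return data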
Ok, I'll check that.
We need to identify how much memory there is available for the device, regardless. I can probably use the same numba code from before.
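A minimal sketch of querying total device memory with numba, which may be what "the same numba code from before" refers to; the function name is illustrative:

from numba import cuda

def get_device_total_memory():
    # (free, total) bytes for the device backing the current CUDA context
    free, total = cuda.current_context().get_memory_info()
    return total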
Right, mostly I want to say let's not add a new device_memory_foo= keyword yet if we can avoid it.