Device memory spill support #35
We could also keep this in the `distributed` namespace if we wanted to. It might be nice to have all of the worker config together.
I have no strong opinions/objections. I think we can keep it in the same namespace, but we need a different file to prevent dask-cuda and dask-distributed from overwriting one another's configurations. Would you like me to change it to the `distributed` namespace?
This looks great. I didn't know that Numba could do this.
Also, for general awareness, there is also this: https://github.com/gpuopenanalytics/pynvml/, though I haven't used it much myself.
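For reference, a minimal sketch of the kind of Numba query being referred to here (assuming `numba.cuda` and a visible GPU; the exact call used in the PR may differ):

```python
# Query free and total device memory through Numba's CUDA context.
# Requires numba and a CUDA-capable GPU to actually run.
from numba import cuda

free_bytes, total_bytes = cuda.current_context().get_memory_info()
print(f"free: {free_bytes / 1e9:.2f} GB, total: {total_bytes / 1e9:.2f} GB")
```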
Didn't know about that one. I don't have a strong opinion, but since Numba is more likely to already be installed and we only check memory for the time being, I think it makes more sense to just use Numba for now.
I have a question: what data is stored in `CUDAWorker.data.device.fast`? I was trying to persist a dataframe bigger than the GPU memory, with individual chunks that comfortably fit in device memory, but it seems to get paused and throws the following warning on both workers:

Also, is there a way to access the `CUDAWorker` object from the `client`/`cluster` object? I tried `client.cluster.workers[0].Worker`, but it does not seem to contain the `self.data` attribute and other attributes.
`CUDAWorker.data.device.fast` is a dict that stores `dask.array` device chunks. Similarly, `CUDAWorker.data.device.slow` (== `CUDAWorker.data.host.fast`) is a dict that stores `dask.array` host chunks or device chunks that have been spilled to host, and `CUDAWorker.data.host.slow` contains the path to all chunks on disk.

IMO, this is one of the trickiest parts. We can't guarantee that `CUDAWorker` has full control over device memory; depending on what you're using to create/load your data, it may have its own memory pool, for example. This is why I had to explicitly disable CuPy's memory pooling to write a reliable test in https://github.com/rapidsai/dask-cuda/pull/35/files#diff-fa83f19df929827ec523fd00a21599b8R38.

I think this isn't possible. Why would you like to access that directly? I don't know if this is by design or just something that was never implemented; @mrocklin, could you clarify this?
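For illustration, a minimal sketch of what disabling CuPy's memory pooling looks like (an assumption about the referenced test, not a quote of it):

```python
# Make CuPy allocate directly from the CUDA driver instead of its pooling
# allocators, so frees become visible to external device-memory checks.
import cupy

cupy.cuda.set_allocator(None)                # disable the device memory pool
cupy.cuda.set_pinned_memory_allocator(None)  # disable the pinned-host pool
```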
Correct. The workers are in separate processes, so there is no way to access them directly from Python. You can ask Dask to run functions on them to inspect state with `Client.run` (see the `run` docstring for more information).
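A hypothetical sketch of that kind of inspection (the function name and what it returns are illustrative, not from the PR):

```python
# Run a function on every worker; the special `dask_worker` keyword
# argument gives it access to the Worker (here CUDAWorker) instance.
from dask.distributed import Client

client = Client(cluster)  # assumes an existing LocalCUDACluster `cluster`

def inspect(dask_worker):
    # e.g. list the keys currently held in the worker's data store
    return list(dask_worker.data.keys())

print(client.run(inspect))  # {worker_address: [keys, ...], ...}
```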
You could also try the `start_ipython_workers` function with the keyword `qtconsole=True`, but I wouldn't be surprised if it has fallen out of use with recent Tornado library upgrades.
Note that when @pentschev says "`dask.array` device chunks" he also means any piece of GPU-allocated data, which could be a dask array device chunk as he's dealing with, or a cudf dataframe as you're dealing with.
Thanks a lot @mrocklin for the function to inspect state. I used it to debug what's happening. It seems that the data is being evicted from `data.device.fast`, but the GPU memory is still not being cleared.

Error:

Maybe the `del` here is not clearing memory, I don't know: https://github.com/rapidsai/dask-cuda/pull/35/files#diff-c87f0866b277f959dc7c5d1e4b0ff015R243

Will add a small minimal reproducible example here soon.
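A hedged illustration of how this can happen when a pooling allocator (e.g. CuPy's) is in play; whether the same applies to the allocator behind cudf here is an assumption:

```python
# Dropping the Python reference returns memory to the library's pool, not
# to the CUDA driver, so device-memory checks still report it as in use
# until the pool is explicitly drained.
import cupy

x = cupy.zeros(int(1e8))  # ~800 MB of float64 on the device
del x                     # freed into CuPy's memory pool, not the driver
cupy.get_default_memory_pool().free_all_blocks()  # now released to the driver
```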
I agree that this is a major issue; that's why I'm concerned with it. In particular, I think pausing is something that can't be enabled under these circumstances: if it is, then when Dask spills memory to host but can't really release it, the worker will get stuck.

I have two proposals (not necessarily mutually exclusive) until we come up with something else:

1. `CUDAWorker.device_memory_monitor` triggering memory managers' release of memory.
2. We can also disable pausing by default, which I'm inclined to think should be the default to prevent this sort of situation.
For 1, see dask/distributed#2453.

Disabling pausing by default seems fine to me. This is just a config value change at this point, yes?
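As an aside, a hedged sketch of what such a config change looks like; the exact key for the device-memory variant in this PR is an assumption, so only the standard host-memory key is shown:

```python
# Turn off the pause threshold for Dask workers via configuration.
import dask

dask.config.set({"distributed.worker.memory.pause": False})
```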
Yes, that would be something similar, if not exactly that (sorry, I can't understand all the details without diving in a bit deeper).

Any thoughts on item 2 as well?

Yes. I just don't know whether disabling pause has other consequences. On the host, I guess this is to prevent the host from running out of memory and eventually getting killed, is that right?
I can see how it would solve the problem. I guess I'm hoping that medium term it's not necessary. My inclination is to wait until we have a real-world problem that needs this before adding it. I won't be surprised if that problem occurs quickly, but I'd still rather put it off and get this in.
No objections from me. That said, I have no further changes to add; from my side, it's ready for more reviews or merging.
I would prefer to avoid this until someone asks for it, if possible. I'm somewhat against an API like this because it forces us to enumerate the possible options in code. If we were to do something like this, I think we would probably provide the full namespace of the class and then try to import it.

However, there is enough uncertainty here that, for maintenance reasons, I'd prefer that we not promise anything until someone is asking us for this.
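A sketch of the "full namespace" approach being described (the example path and helper name are hypothetical):

```python
# Accept a fully qualified class path and import the class at runtime.
import importlib

def import_worker_class(path):
    module_name, _, class_name = path.rpartition(".")
    return getattr(importlib.import_module(module_name), class_name)

# e.g. import_worker_class("dask_cuda.worker.CUDAWorker")  # hypothetical path
```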
For context, the `worker_class` keyword exists because different groups have their own custom Worker classes that they use. Exposing a keyword like this but making no move to allow them to plug in seems against the spirit of the keyword.
That's not true. We only disabled pausing the worker; to control spilling from the device, we still need to monitor the device memory, and this is why I needed to subclass it.

Unfortunately, this is necessary for us to re-enable the old `CUDA_VISIBLE_DEVICES` test. Using `CUDAWorker`, which was hardcoded before, prevents us from launching a process on a single-GPU machine to mock-test as if it had several GPUs and check the ordering of GPUs for each process.
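For reference, a hedged sketch of the kind of per-worker ordering that test checks (the helper is illustrative, not the PR's code):

```python
# Each worker gets CUDA_VISIBLE_DEVICES rotated so its own GPU comes first,
# e.g. worker 2 of 4 -> "2,3,0,1"; a test can then assert on the ordering
# without needing multiple physical GPUs.
def cuda_visible_devices(worker_index, n_gpus):
    return ",".join(str((worker_index + j) % n_gpus) for j in range(n_gpus))

assert cuda_visible_devices(2, 4) == "2,3,0,1"
```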
Why do we need to monitor device memory externally from the use of `sizeof`? If objects reliably tell us how much device memory they take up, then we may not need to track device memory separately.

Dask workers operated this way for a long time before we finally needed to give in and have a separate periodic callback that tracked system memory. I'm inclined to try the simpler approach first and see if it breaks down or not.
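To make the `sizeof` approach concrete, a minimal sketch of registering a size measure for a hypothetical device-backed type (the type is a stand-in, not from this PR):

```python
# Dask asks `sizeof` how large each stored object is and compares the total
# against its memory thresholds; per-object accounting of device memory
# would work the same way.
from dask.sizeof import sizeof

class MyDeviceArray:  # hypothetical stand-in for a GPU-backed object
    def __init__(self, nbytes):
        self.nbytes = nbytes

@sizeof.register(MyDeviceArray)
def sizeof_my_device_array(obj):
    return obj.nbytes
```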
Sure, they do track how much memory they take. However, tracking the device memory lets us decide when it's time to spill memory. Isn't that what `memory_target_fraction` and `memory_spill_fraction` (prepend `device_` for the cases in this PR) are for?

The block https://github.com/rapidsai/dask-cuda/pull/35/files#diff-a77f0c6f19d8d34d59aede5e31455719R282 controls the spilling, and this is why we needed to subclass `Worker`.
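Roughly, the block linked above implements logic along these lines (a sketch under assumptions; names, defaults, and the plain-dict stores are not the PR's code):

```python
# When device usage crosses the spill fraction, move device chunks into the
# host store until usage drops back under the target fraction.
def spill_device_to_host(device_fast, host_store, used, total,
                         device_memory_spill_fraction=0.7,
                         device_memory_target_fraction=0.6):
    """`device_fast` maps key -> (chunk, nbytes); `host_store` is any mapping."""
    if used / total <= device_memory_spill_fraction:
        return used
    for key in list(device_fast):
        if used / total <= device_memory_target_fraction:
            break
        chunk, nbytes = device_fast.pop(key)
        host_store[key] = chunk  # the real implementation moves/serializes the data
        used -= nbytes
    return used
```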
Yes, something like that diff. You'll also want to add the `data=` keyword to `LocalCUDACluster`.
Thanks for the explanation on the memory monitor.

Yes, and I also want to create the object before. :)

But ok, I can probably have it done quickly, by tomorrow. There are a few more things that need to be ported to allow it to work (like finding out how much memory the device has in total), and also some test(s), which shouldn't be too difficult now that there's already one that works with the monitoring mechanism and I can base it on that.
It's actually valid to pass just the class; Dask will construct it. I think this is explained in the `Worker` docstring. This is better because you're using a Nanny and don't want to pass the object through a process boundary.

I recommend that we start with just using the full memory or a config value by default and not mess with any user inputs (which will get messy).
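A minimal sketch of what "pass just the class" means for the `data=` keyword (the scheduler address is a placeholder and actually running this requires a live scheduler):

```python
# Passing a MutableMapping *class* (here plain `dict`) instead of an instance
# lets the worker build its data store on its own side of the Nanny's process
# boundary; in this PR the class would be the device/host/disk spilling mapping.
from dask.distributed import Worker

async def start_worker(scheduler_address="tcp://127.0.0.1:8786"):
    worker = await Worker(scheduler_address, data=dict)
    return worker
```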
Ok, I'll check that.

We need to identify how much memory is available on the device regardless. I can probably use the same Numba code from before.
Right, mostly I want to say let's not add a new `device_memory_foo=` keyword yet if we can avoid it.