Device memory spill support #35
@@ -1 +1,2 @@
from .local_cuda_cluster import LocalCUDACluster
from . import config
@@ -0,0 +1,13 @@
import yaml
import os

import dask

config = dask.config.config


fn = os.path.join(os.path.dirname(__file__), 'cuda.yaml')
with open(fn) as f:
    dask_cuda_defaults = yaml.load(f)

dask.config.update_defaults(dask_cuda_defaults)
@@ -0,0 +1,8 @@
cuda:
  worker:
    # Fractions of device memory at which we take action to avoid memory blowup
    # Set any of the lower three values to False to turn off the behavior entirely
    device-memory:
      target: 0.60  # target fraction to stay below
      spill: 0.70   # fraction at which we spill to host
      pause: 0.80   # fraction at which we pause worker threads
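These defaults can be overridden like any other Dask configuration value. As a minimal sketch (only the cuda.worker.device-memory keys introduced above are assumed), a user could adjust the thresholds for a single session:

import dask

# Temporarily raise the spill threshold and turn pausing off entirely
# (sketch; the keys mirror the cuda.yaml defaults above).
with dask.config.set({'cuda.worker.device-memory.spill': 0.75,
                      'cuda.worker.device-memory.pause': False}):
    ...  # start CUDA workers here

Setting any of the three values to False disables the corresponding behavior, matching the comment in the defaults file.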
@@ -0,0 +1,14 @@
from distributed import Nanny

from .cuda_worker import CUDAWorker


class CUDANanny(Nanny):
    """ A process to manage CUDAWorker processes

    This is a subclass of Nanny, with the only difference
    being worker_class=CUDAWorker.
    """
    def __init__(self, *args, **kwargs):
        Nanny.__init__(self, *args,
                       worker_class=CUDAWorker, **kwargs)
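As a usage sketch (the scheduler address is hypothetical and the dask_cuda.cuda_nanny module path is assumed from this diff), a CUDANanny is constructed exactly like a plain Nanny; the only difference is the worker process it supervises:

from dask_cuda.cuda_nanny import CUDANanny  # module path assumed

# Accepts the same arguments as distributed.Nanny; once started on an event
# loop it spawns a CUDAWorker subprocess instead of a regular Worker.
nanny = CUDANanny('tcp://127.0.0.1:8786', ncores=1)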
@@ -0,0 +1,271 @@
from tornado import gen
from numba import cuda

import dask
from distributed import Worker
from distributed.worker import logger
from distributed.compatibility import unicode
from distributed.utils import format_bytes, ignoring, parse_bytes, PeriodicCallback

from .device_host_file import DeviceHostFile


def get_device_total_memory():
    """ Return total memory of CUDA device from current context """
    return cuda.current_context().get_memory_info()[1]  # (free, total)


def get_device_used_memory():
    """ Return used memory of CUDA device from current context """
    memory_info = cuda.current_context().get_memory_info()  # (free, total)
    return memory_info[1] - memory_info[0]
Review comment: This looks great. I didn't know that Numba could do this. Also, for general awareness, there is also https://github.com/gpuopenanalytics/pynvml/, though I haven't used it much myself.

Reply: Didn't know about that one. I don't have a strong opinion, but since Numba is a package more likely to be already installed, and we only check memory for the time being, I think it makes more sense to just use Numba for now.
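For comparison only, the same numbers could be read through pynvml; this is a sketch assuming the pynvml package and device index 0, not something this PR uses:

import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # first GPU; adjust as needed
info = pynvml.nvmlDeviceGetMemoryInfo(handle)
total_bytes = info.total  # corresponds to get_device_total_memory()
used_bytes = info.used    # corresponds to get_device_used_memory()
pynvml.nvmlShutdown()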

def parse_device_memory_limit(memory_limit, ncores):
    """ Parse device memory limit input """
    if memory_limit is None or memory_limit == 0 or memory_limit == 'auto':
        memory_limit = int(get_device_total_memory())
    with ignoring(ValueError, TypeError):
        x = float(memory_limit)
        if isinstance(x, float) and x <= 1:
            return int(x * get_device_total_memory())

    if isinstance(memory_limit, (unicode, str)):
        return parse_bytes(memory_limit)
    else:
        return int(memory_limit)
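To illustrate the inputs parse_device_memory_limit accepts (a sketch; the fraction and 'auto' cases depend on the GPU in the current context):

# Fractions of total device memory, plain byte counts, and human-readable
# strings are all accepted; ncores is currently unused by the function.
parse_device_memory_limit(0.5, ncores=1)     # half of total device memory
parse_device_memory_limit(5e9, ncores=1)     # 5000000000
parse_device_memory_limit('5GB', ncores=1)   # parse_bytes('5GB') == 5000000000
parse_device_memory_limit('auto', ncores=1)  # total device memory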
class CUDAWorker(Worker):
    """ CUDA Worker node in a Dask distributed cluster

    Parameters
    ----------
    device_memory_limit: int, float, string
        Number of bytes of CUDA device memory that this worker should use.
        Set to zero for no limit or 'auto' to use 100% of device memory.
        Use strings or numbers like 5GB or 5e9.
    device_memory_target_fraction: float
        Fraction of CUDA device memory to try to stay beneath
    device_memory_spill_fraction: float
        Fraction of CUDA device memory at which we start spilling to host
    device_memory_pause_fraction: float
        Fraction of CUDA device memory at which we stop running new tasks

    Note: CUDAWorker is a subclass of distributed.Worker; only parameters
    specific to CUDAWorker are listed here. For a complete list of
    parameters, refer to the distributed.Worker documentation.
    """

    def __init__(self, *args, **kwargs):
        self.device_memory_limit = kwargs.pop('device_memory_limit',
                                              get_device_total_memory())

        if 'device_memory_target_fraction' in kwargs:
            self.device_memory_target_fraction = kwargs.pop(
                'device_memory_target_fraction')
        else:
            self.device_memory_target_fraction = dask.config.get(
                'cuda.worker.device-memory.target')
        if 'device_memory_spill_fraction' in kwargs:
            self.device_memory_spill_fraction = kwargs.pop(
                'device_memory_spill_fraction')
        else:
            self.device_memory_spill_fraction = dask.config.get(
                'cuda.worker.device-memory.spill')
        if 'device_memory_pause_fraction' in kwargs:
            self.device_memory_pause_fraction = kwargs.pop(
                'device_memory_pause_fraction')
        else:
            self.device_memory_pause_fraction = dask.config.get(
                'cuda.worker.device-memory.pause')

        super().__init__(*args, **kwargs)

        self.device_memory_limit = parse_device_memory_limit(
            self.device_memory_limit, self.ncores)

        self.data = DeviceHostFile(device_memory_limit=self.device_memory_limit,
                                   memory_limit=self.memory_limit,
                                   local_dir=self.local_dir)

        self._paused = False
        self._device_paused = False

        if self.device_memory_limit:
            self._device_memory_monitoring = False
            pc = PeriodicCallback(
                self.device_memory_monitor,
                self.memory_monitor_interval * 1000,
                io_loop=self.io_loop
            )
            self.periodic_callbacks["device_memory"] = pc

    def _start(self, addr_on_port=0):
        super()._start(addr_on_port)
        if self.device_memory_limit:
            logger.info(' Device Memory: %26s',
                        format_bytes(self.device_memory_limit))
            logger.info('-' * 49)
    def _check_for_pause(self, fraction, pause_fraction, used_memory, memory_limit,
                         paused, free_func, worker_description):
        if pause_fraction and fraction > pause_fraction:
            # Try to free some memory while in paused state
            if free_func:
                free_func()
            # Log only on the transition into the paused state
            if not paused:
                logger.warning("%s is at %d%% memory usage. Pausing worker. "
                               "Process memory: %s -- Worker memory limit: %s",
                               worker_description,
                               int(fraction * 100),
                               format_bytes(used_memory),
                               format_bytes(memory_limit))
            return True
        return False

    def _resume_message(self, fraction, used_memory, memory_limit,
                        worker_description):
        logger.warning("%s is at %d%% memory usage. Resuming worker. "
                       "Process memory: %s -- Worker memory limit: %s",
                       worker_description,
                       int(fraction * 100),
                       format_bytes(used_memory),
                       format_bytes(memory_limit))

    def _resume_worker(self):
        if self.paused and not (self._paused or self._device_paused):
            self.paused = False
            self.ensure_computing()
    @gen.coroutine
    def memory_monitor(self):
        """ Track this process's memory usage and act accordingly

        If we rise above (memory_spill_fraction * memory_limit) of
        memory use, start dumping data to disk. The default value for
        memory_spill_fraction is 0.7, defined via the configuration key
        'distributed.worker.memory.spill'.

        If we rise above (memory_pause_fraction * memory_limit) of
        memory use, stop execution of new tasks. The default value
        for memory_pause_fraction is 0.8, defined via the configuration key
        'distributed.worker.memory.pause'.
        """
        if self._memory_monitoring:
            return
        self._memory_monitoring = True
        total = 0

        proc = self.monitor.proc
        memory = proc.memory_info().rss
        frac = memory / self.memory_limit

        # Pause worker threads if process memory use is above
        # (self.memory_pause_fraction * 100)%
        old_pause_state = self._paused
        worker_description = 'Worker'
        # Pass the bound collect method so GC can run while paused
        self._paused = self._check_for_pause(frac, self.memory_pause_fraction, memory,
                                             self.memory_limit, self._paused,
                                             self._throttled_gc.collect,
                                             worker_description)
        if old_pause_state and not self._paused:
            self._resume_message(frac, memory, self.memory_limit,
                                 worker_description)
            self._resume_worker()

        # Dump data to disk if memory use is above
        # (self.memory_spill_fraction * 100)%
        if self.memory_spill_fraction and frac > self.memory_spill_fraction:
            target = self.memory_limit * self.memory_target_fraction
            count = 0
            need = memory - target
            while memory > target:
                if not self.data.host.fast:
                    logger.warning("Memory use is high but worker has no data "
                                   "to store to disk. Perhaps some other process "
                                   "is leaking memory? Process memory: %s -- "
                                   "Worker memory limit: %s",
                                   format_bytes(proc.memory_info().rss),
                                   format_bytes(self.memory_limit))
                    break
                k, v, weight = self.data.host.fast.evict()
                del k, v
                total += weight
                count += 1
                yield gen.moment
                memory = proc.memory_info().rss
                if total > need and memory > target:
                    # Issue a GC to ensure that the evicted data is actually
                    # freed from memory and taken into account by the monitor
                    # before trying to evict even more data.
                    self._throttled_gc.collect()
                    memory = proc.memory_info().rss
            if count:
                logger.debug("Moved %d pieces of data and %s bytes to disk",
                             count, format_bytes(total))

        self._memory_monitoring = False
        raise gen.Return(total)
    @gen.coroutine
    def device_memory_monitor(self):
        """ Track this worker's CUDA device memory usage and act accordingly

        If we rise above (device_memory_spill_fraction * device_memory_limit)
        of device memory use, start dumping data to host. The default value
        for device_memory_spill_fraction is 0.7, defined via the configuration
        key 'cuda.worker.device-memory.spill'.

        If we rise above (device_memory_pause_fraction * device_memory_limit)
        of device memory use, stop execution of new tasks. The default value
        for device_memory_pause_fraction is 0.8, defined via the configuration
        key 'cuda.worker.device-memory.pause'.
        """
        if self._device_memory_monitoring:
            return
        self._device_memory_monitoring = True
        total = 0
        memory = get_device_used_memory()
        frac = memory / self.device_memory_limit

        # Pause worker threads if device memory use is above
        # (self.device_memory_pause_fraction * 100)%
        old_pause_state = self._device_paused
        worker_description = "Worker's CUDA device"
        self._device_paused = self._check_for_pause(
            frac, self.device_memory_pause_fraction, memory,
            self.device_memory_limit, self._device_paused, None,
            worker_description)
        if old_pause_state and not self._device_paused:
            self._resume_message(frac, memory, self.device_memory_limit,
                                 worker_description)
            self._resume_worker()

        # Dump device data to host if device memory use is above
        # (self.device_memory_spill_fraction * 100)%
        if (self.device_memory_spill_fraction
                and frac > self.device_memory_spill_fraction):
            target = self.device_memory_limit * self.device_memory_target_fraction
            count = 0
            while memory > target:
                if not self.data.device.fast:
Review comment: I have a question regarding this line: what is the data that is stored in data.device.fast? I was trying to persist a dataframe bigger than the GPU memory, with individual chunks that comfortably fit in device memory, but both workers get paused and throw the following warning:

distributed.worker - WARNING - CUDA device memory use is high but worker has no data to store to host. Perhaps some other process is leaking memory? Process memory: 13.76 GB -- Worker memory limit: 17.07 GB

Also, is there a way to access data.device.fast directly?

Reply: IMO, this is one of the trickiest parts. We can't guarantee that ...

I think this isn't possible; why would you like to access that directly? I don't know if this is by design or is just something that was never implemented, @mrocklin could you clarify this?

Reply: Correct. The workers are in separate processes, so there is no way to access them from Python. You can ask Dask to run functions on them to inspect state if you like:

def f(dask_worker):
    return len(dask_worker.data.device.fast)

client.run(f)

(See the run docstring for more information.) You could also try the ...

Reply: Note that when @pentschev says "dask.array device chunks" he also means any piece of GPU-allocated data, which could be a dask array device chunk as he's dealing with, or a cudf dataframe as you're dealing with.

Reply: Thanks a lot @mrocklin for the function to inspect state. I used it to debug what's happening. It seems that the data is being evicted from data.device.fast to data.device.slow:

('tcp://172.17.0.2:44914', {'data.device.fast': 14, 'data.device.slow': 0})
('tcp://172.17.0.2:44914', {'data.device.fast': 15, 'data.device.slow': 0})
('tcp://172.17.0.2:44914', {'data.device.fast': 3, 'data.device.slow': 14})
('tcp://172.17.0.2:44914', {'data.device.fast': 3, 'data.device.slow': 14})
('tcp://172.17.0.2:44914', {'data.device.fast': 0, 'data.device.slow': 18})
('tcp://172.17.0.2:44914', {'data.device.fast': 0, 'data.device.slow': 18})

Error: ...

Maybe the del here is not clearing memory, I don't know: https://github.com/rapidsai/dask-cuda/pull/35/files#diff-c87f0866b277f959dc7c5d1e4b0ff015R243. Will add a small minimal reproducible example here soon.

Reply: I agree that this is a major issue, that's why I'm concerned with it. In particular, I think pausing is something that can't be enabled under these circumstances: if it is, then when Dask spills memory to host but can't really release it, the worker will get stuck. I have two proposals (not necessarily mutually exclusive) until we come up with something else: ...

We can also disable pausing by default, which I'm inclined to think should be the default to prevent this sort of situation.

Reply: For 1 see dask/distributed#2453. Disabling pausing by default seems fine to me. This is just a config value change at this point, yes?

Reply: Yes, that would be something similar, if not exactly that (sorry, I can't understand all the details without diving in a bit deeper). Any thoughts on item 2 as well?

Yes. I just don't know whether disabling pause has other consequences. On the host, I guess it is there to prevent the host from running out of memory and eventually getting killed, is that right?

Reply: I can see how it would solve the problem. I guess I'm hoping that medium term it's not necessary. My inclination is to wait until we have a real-world problem that needs this before adding it. I won't be surprised if that problem occurs quickly, but I'd still rather put it off and get this in.

Reply: No objections from me. That said, I have no further changes to add from my side; it's ready for more reviews or merging.
                    logger.warning("CUDA device memory use is high but worker has "
                                   "no data to store to host. Perhaps some other "
                                   "process is leaking memory? Process memory: "
                                   "%s -- Worker memory limit: %s",
                                   format_bytes(get_device_used_memory()),
                                   format_bytes(self.device_memory_limit))
                    break
                k, v, weight = self.data.device.fast.evict()
                del k, v
                total += weight
                count += 1
                yield gen.moment
                memory = get_device_used_memory()
            if count:
                logger.debug("Moved %d pieces of data and %s bytes to host memory",
                             count, format_bytes(total))

        self._device_memory_monitoring = False
        raise gen.Return(total)
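Following up on the pausing discussion in the thread above: disabling pausing by default would amount to changing the pause entry in cuda.yaml to False, or doing the equivalent per session. A minimal sketch using the configuration keys added in this PR:

import dask

# Keep device-memory spilling active but never pause worker threads
# (sketch of the config-value change discussed above).
dask.config.set({'cuda.worker.device-memory.pause': False})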
Review comment: We could also keep this in the distributed namespace if we wanted to. It might be nice to have all of the worker config together.

Reply: I have no strong opinions/objections. I think we can keep it in the same namespace, but we would need a different file to prevent dask-cuda and dask-distributed from overwriting one another's configurations. Would you like me to change it to the distributed namespace?