yarn: use prebuilt conda environment in nodemanagers
aabadie committed Jun 6, 2017
1 parent cf3963c commit b80e6b5
Showing 7 changed files with 63 additions and 6 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -1,4 +1,5 @@
include *.txt *.py
include joblibhadoop/resources/*.yml
recursive-include joblib *.rst *.py
graft doc
graft doc/_static
2 changes: 1 addition & 1 deletion README.rst
@@ -157,7 +157,7 @@ Your containers should be in the state *Up*.

.. code-block:: bash
-   $ docker-compose run --rm -e JOBLIB_HDFS_NAMENODE=namenode nodemanager pytest
+   $ docker-compose run --rm -e JOBLIB_HDFS_NAMENODE=namenode joblib-hadoop-client make docker-pytest
or locally:

12 changes: 12 additions & 0 deletions joblibhadoop/resources/__init__.py
@@ -0,0 +1,12 @@
"""Joblibhadoop resources module."""

import os.path

ENVIRONMENT_YML_FILENAME = 'joblib-hadoop-environment.yml'


def conda_environment_filename():
"""Return the conda environment filename."""
return os.path.join(
os.path.dirname(os.path.abspath(__file__)), ENVIRONMENT_YML_FILENAME)

12 changes: 12 additions & 0 deletions joblibhadoop/resources/joblib-hadoop-environment.yml
@@ -0,0 +1,12 @@
channels:
  - conda-forge
dependencies:
  - python=3.5
  - pip
  - setuptools
  - libhdfs3
  - numpy
  - pip:
    - git+https://github.com/dask/hdfs3.git@master#egg=hdfs3
    - git+https://github.com/aabadie/joblib.git@storage_backend_2#egg=joblib
    - git+https://github.com/joblib/joblib-hadoop@master#egg=joblibhadoop
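
For orientation, building the environment described by this file and packaging it for YARN amounts to the sketch below (paths such as /tmp/conda_env are illustrative only; create_conda_env() in joblibhadoop/yarn/pool.py further down automates the same steps):

import os
import shutil

from joblibhadoop.resources import conda_environment_filename

env_prefix = '/tmp/conda_env'  # illustrative target prefix

# Build the environment from the YAML file shipped with the package...
os.system('conda env create -p {} --file={}'.format(
    env_prefix, conda_environment_filename()))

# ...then zip it so it can be shipped to the YARN containers.
shutil.make_archive(env_prefix, 'zip', root_dir='/tmp', base_dir='conda_env')
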
36 changes: 32 additions & 4 deletions joblibhadoop/yarn/pool.py
@@ -1,13 +1,40 @@
"""Yarn pool module."""

import os
import os.path
import shutil
import socket
import tempfile
from threading import Thread
from time import sleep
from multiprocessing.util import debug
from knit import Knit
from .remotepool import RemotePool, RemoteWorker
from ..resources import conda_environment_filename

JOBLIB_YARN_WORKER = 'joblib-yarn-worker'
JOBLIB_YARN_CONDA_ENV = 'conda_env'

TEMP_DIR = os.environ.get('JOBLIB_TEMP_FOLDER', tempfile.gettempdir())
CONDA_ENV_CREATE_COMMAND = 'conda env create -p {} --file={}'
CONDA_ENV_INSTALL_COMMAND = 'conda install -p /tmp/{} {}'


def create_conda_env(*extra_packages):
"""Create a conda environment to pass to Knit"""
# Create conda environment
if os.path.isfile(os.path.join(TEMP_DIR, JOBLIB_YARN_CONDA_ENV + '.zip')):
return

os.system(CONDA_ENV_CREATE_COMMAND.format(
os.path.join(TEMP_DIR, JOBLIB_YARN_CONDA_ENV),
conda_environment_filename()))
if len(*extra_packages):
os.system(CONDA_ENV_INSTALL_COMMAND.format(' '.join(*extra_packages)))
# Archive conda environment
shutil.make_archive(os.path.join(TEMP_DIR, JOBLIB_YARN_CONDA_ENV), 'zip',
root_dir=TEMP_DIR,
base_dir=JOBLIB_YARN_CONDA_ENV)


class YarnPool(RemotePool):
@@ -20,14 +47,15 @@ def __init__(self, processes=None, port=0, authkey=None):
            workerscript=JOBLIB_YARN_WORKER)
        self.stopping = False
        self.knit = Knit(autodetect=True)

-       cmd = ('{} --host {} --port {} --key {}'
+       create_conda_env([])
+       cmd = ('$PYTHON_BIN $CONDA_PREFIX/bin/{} --host {} --port {} --key {}'
               .format(JOBLIB_YARN_WORKER,
                       socket.gethostname(),
                       self.server.address[1],
                       self.authkey))
-       self.app_id = self.knit.start(cmd,
-                                     num_containers=self._processes)
+       self.app_id = self.knit.start(
+           cmd, num_containers=self._processes,
+           env='{}.zip'.format(os.path.join(TEMP_DIR, JOBLIB_YARN_CONDA_ENV)))
        self.thread = Thread(target=self._monitor_appid)
        self.thread.deamon = True
        self.thread.start()
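Stripped of the pool bookkeeping, the Knit interaction introduced above reduces to roughly the following sketch (hostname, port, key and container count are placeholders; the env argument points at the zip produced by create_conda_env()):

from knit import Knit

knit = Knit(autodetect=True)

# $PYTHON_BIN and $CONDA_PREFIX refer to the Python interpreter of the
# shipped conda environment once Knit has unpacked the zip in the container.
cmd = ('$PYTHON_BIN $CONDA_PREFIX/bin/joblib-yarn-worker '
       '--host myhost --port 5000 --key secret')
app_id = knit.start(cmd, num_containers=2, env='/tmp/conda_env.zip')
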
2 changes: 1 addition & 1 deletion joblibhadoop/yarn/remotepool.py
@@ -112,7 +112,7 @@ def _start_remote_worker(self, pid):
        remote_worker = RemoteWorker(pid)
        debug('starting remote worker %d', pid)

-       args = [self.workerscript,
+       args = ['$PYTHON_BIN $CONDA_PREFIX/bin/' + self.workerscript,
                '--host', socket.gethostname(),
                '--port', str(self.server.address[1]),
                '--workerid', str(pid),
4 changes: 4 additions & 0 deletions setup.py
@@ -15,6 +15,10 @@
    license='BSD',
    platforms='any',
    packages=find_packages(),
    package_data={
        '': ['joblibhadoop/resources/joblib-hadoop-environment.yml']
    },
    include_package_data=True,
    scripts=[pjoin('bin', 'joblib-yarn-worker')],
    install_requires=[
        'joblib>=0.10',
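As a quick sanity check of the packaging changes (the MANIFEST.in entry, package_data and include_package_data together), the YAML resource should resolve to a real file once the package is installed; a minimal sketch:

import os.path

from joblibhadoop.resources import conda_environment_filename

# The environment file must ship with the distribution for the YARN pool
# to be able to build the prebuilt worker environment.
assert os.path.isfile(conda_environment_filename())
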
