Commit 30101e0

initial commit: copy store backend from joblib-store project

aabadie committed May 10, 2017 (0 parents)

Showing 19 changed files with 734 additions and 0 deletions.
38 changes: 38 additions & 0 deletions .gitignore
@@ -0,0 +1,38 @@
*.py[oc]
*.so
# setup.py working directory
build
# setup.py dist directory
dist
# Editor temporary/working/backup files
*$
.*.sw[nop]
.sw[nop]
*~
[#]*#
.#*
*.bak
*.tmp
*.tgz
*.rej
*.org
.project
*.diff
.settings/
*.svn/
# Egg metadata
*.egg-info
# The shelf plugin uses this dir
./.shelf
# Some IDEs add this directory
.idea
# Mac droppings
.DS_Store
doc/documentation.zip
doc/generated
doc/CHANGES.rst
doc/README.rst
# Coverage report
.coverage
# Pytest
.cache
2 changes: 2 additions & 0 deletions .pylintrc
@@ -0,0 +1,2 @@
[pylint]
disable=invalid-name,too-many-arguments,duplicate-code,arguments-differ,super-init-not-called
34 changes: 34 additions & 0 deletions LICENSE.rst
@@ -0,0 +1,34 @@
Joblib-hadoop is **BSD-licensed** (3-clause):

This software is OSI Certified Open Source Software.
OSI Certified is a certification mark of the Open Source Initiative.

Copyright (c) 2009-2011, joblib developers
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

* Neither the name of Gael Varoquaux nor the names of other joblib
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

**This software is provided by the copyright holders and contributors
"as is" and any express or implied warranties, including, but not
limited to, the implied warranties of merchantability and fitness for
a particular purpose are disclaimed. In no event shall the copyright
owner or contributors be liable for any direct, indirect, incidental,
special, exemplary, or consequential damages (including, but not
limited to, procurement of substitute goods or services; loss of use,
data, or profits; or business interruption) however caused and on any
theory of liability, whether in contract, strict liability, or tort
(including negligence or otherwise) arising in any way out of the use
of this software, even if advised of the possibility of such
damage.**
6 changes: 6 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,6 @@
include *.txt *.py
recursive-include joblib *.rst *.py
graft doc
graft doc/_static
graft doc/_templates
global-exclude *~ *.swp
192 changes: 192 additions & 0 deletions README.rst
@@ -0,0 +1,192 @@
Joblib-hadoop
=============

|Travis| |Codecov|

.. |Travis| image:: https://travis-ci.org/joblib/joblib-hadoop.svg?branch=master
:target: https://travis-ci.org/joblib/joblib-hadoop

.. |Codecov| image:: https://codecov.io/gh/joblib/joblib-hadoop/branch/master/graph/badge.svg
:target: https://codecov.io/gh/joblib/joblib-hadoop

This package provides parallel and store backends for joblib that can be used
on a Hadoop cluster.

If you are not familiar with joblib yet, its user documentation is available at
https://pythonhosted.org/joblib

Joblib-hadoop supports Python 2.7, 3.4 and 3.5.

Getting the latest code
=======================

To get the latest code use git::

git clone git://github.com/joblib/joblib-hadoop.git

Installing joblib-hadoop
========================

We recommend using
`Python Anaconda 3 distribution <https://www.continuum.io/Downloads>`_ for
full support of the HDFS store backends.

1. Create an Anaconda environment (use python 2.7, 3.4 or 3.5) and activate it:

   .. code-block:: bash

      $ conda create -n joblibhadoop-env python==3.5 s3fs libhdfs3 -c conda-forge
      $ . activate joblibhadoop-env

2. From the `joblibhadoop-env` environment, perform installation using pip:

   .. code-block:: bash

      $ cd joblib-hadoop
      $ pip install -r requirements.txt .
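
To quickly check that the installation worked, a minimal sketch is to import and
register the HDFS store backend from Python (this only verifies that the package
is importable, not that a Hadoop cluster is reachable):

.. code-block:: python

   # Sanity check: the import fails if joblib-hadoop is not installed;
   # registering makes the 'hdfs' backend available to joblib.Memory.
   from joblibhadoop.hdfs import register_hdfs_store_backend

   register_hdfs_store_backend()
   print('HDFS store backend registered')
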
Using joblib-hadoop on a Hadoop cluster
=======================================

TODO: add parallel backend

.. code-block:: python

   import numpy as np
   from joblib import Memory
   from joblibhadoop.hdfs import register_hdfs_store_backend

   if __name__ == '__main__':
       register_hdfs_store_backend()

       mem = Memory(location='joblib_cache_hdfs',
                    backend='hdfs', host='localhost', port=8020, user='test',
                    verbose=100, compress=True)
       multiply = mem.cache(np.multiply)

       array1 = np.arange(10000)
       array2 = np.arange(10000)

       result = multiply(array1, array2)

       # Second call should return the cached result
       result = multiply(array1, array2)
       print(result)

All examples are available in the `examples <examples>`_ directory.
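
Cached results stored on HDFS can be managed through the usual ``joblib.Memory``
API. As a minimal sketch (reusing the connection parameters from the example
above), the whole cache can be wiped with ``Memory.clear``:

.. code-block:: python

   from joblib import Memory
   from joblibhadoop.hdfs import register_hdfs_store_backend

   register_hdfs_store_backend()
   mem = Memory(location='joblib_cache_hdfs',
                backend='hdfs', host='localhost', port=8020, user='test')

   # Remove all results cached by this Memory instance on HDFS.
   mem.clear(warn=False)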

Developing in joblib-hadoop
===========================

Prerequisites
-------------

In order to run the test suite, you need to set up a local Hadoop cluster. This
can be done easily using the Docker and docker-compose recipes provided in the
`docker <docker>`_ directory:

1. `Install docker-engine <https://docs.docker.com/engine/installation/>`_:

   You have to be able to run the hello-world container:

   .. code-block:: bash

      $ docker run hello-world

2. Install docker-compose with pip:

   .. code-block:: bash

      $ pip install docker-compose

3. Build the Hadoop cluster with docker-compose and format the HDFS namenode:

   .. code-block:: bash

      $ cd joblib-hadoop/docker
      $ docker-compose run namenode hdfs namenode -format

Running the test suite
----------------------

1. Start your Hadoop cluster:

   .. code-block:: bash

      $ cd joblib-hadoop/docker
      $ docker-compose up

2. In another terminal, activate your `joblibhadoop-env` conda environment:

   .. code-block:: bash

      $ . activate joblibhadoop-env

3. Run pytest:

   .. code-block:: bash

      $ pytest

Installing the hdfs3 package by hand
====================================

For the moment hdfs3 cannot be installed directly using pip: it depends on a
C++ library (libhdfs3) that is not packaged in most Linux distributions and
therefore needs to be built by hand first.

The following notes are specific to Ubuntu 16.04 but can also be adapted to
Fedora (package names are slightly different).

1. Clone libhdfs3 from github:

   .. code-block:: bash

      $ sudo mkdir /opt/hdfs3
      $ sudo chown <login>:<login> /opt/hdfs3
      $ cd /opt/hdfs3
      $ git clone [email protected]:Pivotal-Data-Attic/pivotalrd-libhdfs3.git libhdfs3

2. Install the required packages:

   .. code-block:: bash

      $ sudo apt-get install cmake cmake-curses-gui libxml2-dev libprotobuf-dev \
        libkrb5-dev uuid-dev libgsasl7-dev protobuf-compiler protobuf-c-compiler \
        build-essential -y

3. Use CMake to configure and build:

   .. code-block:: bash

      $ cd /opt/hdfs3/libhdfs3
      $ mkdir build
      $ cd build
      $ ../bootstrap
      $ make
      $ make install

4. Add the following to your **~/.bashrc** environment file:

   ::

      export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hdfs3/libhdfs3/dist

5. Reload your environment:

   .. code-block:: bash

      $ source ~/.bashrc

6. Use **pip** to install *hdfs3* (use `sudo` if needed):

   .. code-block:: bash

      $ pip install hdfs3
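
Once installed, a quick way to verify that *hdfs3* can talk to a running cluster
is to list the HDFS root directory. This is only a sketch: it assumes the
docker-compose cluster described above is up and that the namenode is reachable
on ``localhost:8020``:

.. code-block:: python

   # Connect to the namenode and list the HDFS root directory.
   from hdfs3 import HDFileSystem

   hdfs = HDFileSystem(host='localhost', port=8020)
   print(hdfs.ls('/'))
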
32 changes: 32 additions & 0 deletions docker/Dockerfile
@@ -0,0 +1,32 @@
FROM badele/debian-openjdk:7
MAINTAINER Bruno Adelé "[email protected]"

USER root
ENV DEBIAN_FRONTEND noninteractive

# Programs version
ENV HADOOP_VERSION 2.6.0

# Hadoop variable
ENV HADOOP_PREFIX /opt/hadoop
ENV HADOOP_CONF_DIR $HADOOP_PREFIX/conf
ENV PATH $PATH:$HADOOP_PREFIX/bin
ENV PATH $PATH:$HADOOP_PREFIX/sbin

# Install dev tools
RUN apt-get update
RUN apt-get install -y apt-utils
RUN apt-get install -y wget tar sudo openssh-server openssh-client rsync

# Install Hadoop
RUN wget http://archive.apache.org/dist/hadoop/core/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz && \
tar -zxf /hadoop-$HADOOP_VERSION.tar.gz && \
rm /hadoop-$HADOOP_VERSION.tar.gz && \
mv hadoop-$HADOOP_VERSION $HADOOP_PREFIX && \
mkdir -p $HADOOP_PREFIX/logs

VOLUME /shared

ADD core-site.xml $HADOOP_CONF_DIR/core-site.xml
ADD hdfs-site.xml $HADOOP_CONF_DIR/hdfs-site.xml

24 changes: 24 additions & 0 deletions docker/core-site.xml
@@ -0,0 +1,24 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://namenode:8020</value>
</property>
</configuration>
43 changes: 43 additions & 0 deletions docker/docker-compose.yml
@@ -0,0 +1,43 @@
namenode:
build: .
hostname: namenode
volumes:
- /docker_shared/debian-hadoop/namenode:/shared
ports:
- "8020:8020"
- "50070:50070"
command: hdfs namenode


datanode1:
build: .
hostname: datanode1
volumes:
- /docker_shared/debian-hadoop/datanode1:/shared
links:
- namenode
ports:
- "5001:50075"
command: hdfs datanode

datanode2:
build: .
hostname: datanode2
volumes:
- /docker_shared/debian-hadoop/datanode2:/shared
links:
- namenode
ports:
- "5002:50075"
command: hdfs datanode

datanode3:
build: .
hostname: datanode3
volumes:
- /docker_shared/debian-hadoop/datanode3:/shared
links:
- namenode
ports:
- "5003:50075"
command: hdfs datanode