Add daemon to pull kernel images to each node #641

Merged (2 commits) on May 7, 2019
6 changes: 3 additions & 3 deletions Makefile
@@ -4,7 +4,7 @@
.PHONY: help build clean nuke dev dev-http docs install sdist test release clean-images clean-enterprise-gateway \
clean-nb2kg clean-demo-base clean-kernel-images clean-enterprise-gateway \
clean-kernel-py clean-kernel-spark-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala clean-kernel-tf-py \
clean-kernel-tf-gpu-py publish-images
clean-kernel-tf-gpu-py clean-kernel-image-puller publish-images

SA:=source activate
ENV:=enterprise-gateway-dev
@@ -110,14 +110,14 @@ docker-images: ## Build docker images (includes kernel-based images)
kernel-images: ## Build kernel-based docker images

# Actual working targets...
docker-images enterprise-gateway-demo demo-base nb2kg kernel-images enterprise-gateway kernel-py kernel-spark-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py:
docker-images enterprise-gateway-demo demo-base nb2kg kernel-images enterprise-gateway kernel-py kernel-spark-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py kernel-image-puller:
	make WHEEL_FILE=$(WHEEL_FILE) VERSION=$(VERSION) TAG=$(TAG) -C etc $@

# Here for doc purposes
clean-images: ## Remove docker images (includes kernel-based images)
clean-kernel-images: ## Remove kernel-based images

clean-images clean-enterprise-gateway-demo clean-nb2kg clean-demo-base clean-kernel-images clean-enterprise-gateway clean-kernel-py clean-kernel-spark-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala clean-kernel-tf-py clean-kernel-tf-gpu-py:
clean-images clean-enterprise-gateway-demo clean-nb2kg clean-demo-base clean-kernel-images clean-enterprise-gateway clean-kernel-py clean-kernel-spark-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala clean-kernel-tf-py clean-kernel-tf-gpu-py clean-kernel-image-puller:
	make WHEEL_FILE=$(WHEEL_FILE) VERSION=$(VERSION) TAG=$(TAG) -C etc $@

publish-images: ## Push docker images to docker hub
62 changes: 50 additions & 12 deletions docs/source/getting-started-kubernetes.md
@@ -211,6 +211,48 @@ roleRef:
name: kernel-controller
apiGroup: rbac.authorization.k8s.io
```
#### Kernel Image Puller
Because kernels now reside within containers, and it's typical for the first reference to a container image to trigger its pull from a Docker repository, kernel startup requests can easily time out when a kernel image is first accessed on a given node. To mitigate this issue, the Enterprise Gateway deployment includes a DaemonSet object named `kernel-image-puller`, or KIP. This object is responsible for polling Enterprise Gateway for the current set of configured kernelspecs, picking out any configured image name references, and pulling those images to the node on which KIP is running. Because it is a DaemonSet, it also covers the case where new nodes are added to the cluster.
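For reference, the image references that KIP extracts come from the kernelspec's `metadata.process_proxy.config` stanza. Here's an abbreviated, hypothetical `kernel.json` illustrating where those values live (the image names are placeholders):
```json
{
  "display_name": "Python on Kubernetes",
  "metadata": {
    "process_proxy": {
      "config": {
        "image_name": "elyra/kernel-py:VERSION",
        "executor_image_name": "elyra/kernel-spark-py:VERSION"
      }
    }
  }
}
```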

The Kernel Image Puller can be configured for the interval at which it checks for new kernelspecs (`KIP_INTERVAL`), the number of puller threads it will utilize per node (`KIP_NUM_PULLERS`), the number of retries it will attempt for a given image (`KIP_NUM_RETRIES`), and the pull policy (`KIP_PULL_POLICY`), which dictates whether it will attempt to pull images it has already encountered (`Always`) or only pull an image it hasn't seen yet (`IfNotPresent`).

Here's what the Kernel Image Puller looks like in the deployment YAML:
```yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: kernel-image-puller
  namespace: enterprise-gateway
spec:
  selector:
    matchLabels:
      name: kernel-image-puller
  template:
    metadata:
      labels:
        name: kernel-image-puller
        app: enterprise-gateway
        component: kernel-image-puller
    spec:
      containers:
      - name: kernel-image-puller
        image: elyra/kernel-image-puller:dev
        env:
        - name: KIP_GATEWAY_HOST
          value: "http://enterprise-gateway.enterprise-gateway:8888"
        - name: KIP_INTERVAL
          value: "300"
        - name: KIP_PULL_POLICY
          value: "IfNotPresent"
        volumeMounts:
        - name: dockersock
          mountPath: "/var/run/docker.sock"
      volumes:
      - name: dockersock
        hostPath:
          path: /var/run/docker.sock
```
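Once the DaemonSet is deployed, a quick way to confirm that KIP is running on each node and pulling images (assuming the namespace and labels shown above) is:
```
kubectl get daemonset kernel-image-puller -n enterprise-gateway
kubectl logs -n enterprise-gateway -l name=kernel-image-puller
```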


#### Kernelspec Modifications

@@ -397,16 +439,7 @@ For these invocations, the `argv:` is nearly identical to non-kubernetes configu
```

### Deploying Enterprise Gateway on Kubernetes
Once the Kubernetes cluster is configured and `kubectl` is demonstrated to be working on the master node, pull the Enterprise Gateway and Kernel images on each worker node:

```
docker pull elyra/enterprise-gateway:VERSION
docker pull elyra/kernel-py:VERSION
```

**Note:** It is important to pre-seed the worker nodes with **all** kernel images, otherwise the automatic download time will count against the kernel's launch timeout. Although this will likely only impact the first launch of a given kernel on a given worker node, when multiplied against the number of kernels and worker nodes, it will prove to be a frustrating user experience.

If it is not possible to pre-seed the nodes, you will likely need to adjust the `EG_KERNEL_LAUNCH_TIMEOUT` value in the `enterprise-gateway.yaml` file as well as the `KG_REQUEST_TIMEOUT` parameter used by the `NB2KG` extension of the Notebook client when issuing kernel start requests.
Once the Kubernetes cluster is configured and `kubectl` is demonstrated to be working on the master node, it is time to deploy Enterprise Gateway. There are a couple of different deployment options: `kubectl` or Helm.

#### Option 1: Deploying with kubectl

@@ -464,8 +497,13 @@ can override them with Helm's `--set` or `--values` options.
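For example, a hypothetical invocation that overrides the Kernel Image Puller image and pull policy at install time might look like the following sketch (the chart path and Helm 2-style `--name` flag are assumptions; adjust for your environment):
```
helm install --name enterprise-gateway --namespace enterprise-gateway \
  etc/kubernetes/helm/enterprise-gateway \
  --set kip_image=elyra/kernel-image-puller:VERSION \
  --set kip_pull_policy=Always
```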

| **Parameter** | **Description** | **Default** |
| ------------- | --------------- | ----------- |
| `image` | Enterprise Gateway image name and tag to use. Ensure the tag is updated to the version of Enterprise Gateway you wish to run. | `elyra/enterprise-gateway:VERSION`, where `VERSION` is the release being used |
| `image_pull_policy` | Enterprise Gateway image pull policy. Use `IfNotPresent` policy so that dev-based systems don't automatically update. This provides more control. Since formal tags will be release-specific this policy should be sufficient for them as well. | `IfNotPresent` |
| `eg_image` | Enterprise Gateway image name and tag to use. Ensure the tag is updated to the version of Enterprise Gateway you wish to run. | `elyra/enterprise-gateway:VERSION`, where `VERSION` is the release being used |
| `eg_image_pull_policy` | Enterprise Gateway image pull policy. Use `IfNotPresent` policy so that dev-based systems don't automatically update. This provides more control. Since formal tags will be release-specific this policy should be sufficient for them as well. | `IfNotPresent` |
| `eg_port` | The primary port on which Enterprise Gateway is servicing requests. | `8888` |
| `kip_image` | Kernel Image Puller image name and tag to use. Ensure the tag is updated to the version of the Enterprise Gateway release you wish to run. | `elyra/kernel-image-puller:VERSION`, where `VERSION` is the release being used |
| `kip_image_pull_policy` | Kernel Image Puller image pull policy. Use `IfNotPresent` policy so that dev-based systems don't automatically update. This provides more control. Since formal tags will be release-specific this policy should be sufficient for them as well. | `IfNotPresent` |
| `kip_interval` | The interval (in seconds) at which the Kernel Image Puller fetches kernelspecs to pull kernel images. | `300` |
| `kip_pull_policy` | Determines whether the Kernel Image Puller will pull kernel images it has previously pulled (`Always`) or only those it hasn't yet pulled (`IfNotPresent`). | `IfNotPresent` |
| `kernelspecs_image` | Optional custom data image containing kernelspecs to use. Cannot be used with NFS enabled. | `nil` |
| `kernelspecs_image_pull_policy` | Kernelspecs image pull policy. | `Always` |
| `replicas` | Update to deploy multiple replicas of EG. | `1` |
13 changes: 8 additions & 5 deletions etc/Makefile
Expand Up @@ -3,7 +3,7 @@

.PHONY: help clean clean-images clean-enterprise-gateway clean-enterprise-gateway-demo clean-nb2kg clean-demo-base \
clean-kernel-images clean-py clean-tf-py clean-tf-gpu-py clean-r clean-spark-r clean-scala toree-launcher \
kernelspecs_all kernelspecs_yarn kernelspecs_conductor kernelspecs_kubernetes kernelspecs_docker
kernelspecs_all kernelspecs_yarn kernelspecs_conductor kernelspecs_kubernetes kernelspecs_docker clean-kernel-image-puller

SA:=source activate
ENV:=enterprise-gateway-dev
@@ -100,34 +100,37 @@ kernel_image_files: ../build/kernel_image_files
#

KERNEL_IMAGES := kernel-py kernel-spark-py kernel-r kernel-spark-r kernel-scala kernel-tf-py kernel-tf-gpu-py
DOCKER_IMAGES := demo-base enterprise-gateway-demo nb2kg enterprise-gateway $(KERNEL_IMAGES)
PUBLISHED_IMAGES := enterprise-gateway-demo nb2kg enterprise-gateway $(KERNEL_IMAGES)
DOCKER_IMAGES := demo-base enterprise-gateway-demo nb2kg enterprise-gateway kernel-image-puller $(KERNEL_IMAGES)
PUBLISHED_IMAGES := enterprise-gateway-demo nb2kg enterprise-gateway kernel-image-puller $(KERNEL_IMAGES)

docker-images: $(DOCKER_IMAGES)
kernel-images: $(KERNEL_IMAGES)

publish-images: publish-enterprise-gateway-demo publish-nb2kg publish-enterprise-gateway publish-kernel-py publish-kernel-spark-py publish-kernel-tf-py publish-kernel-r publish-kernel-spark-r publish-kernel-scala
publish-images: publish-enterprise-gateway-demo publish-nb2kg publish-enterprise-gateway publish-kernel-py publish-kernel-spark-py publish-kernel-tf-py publish-kernel-r publish-kernel-spark-r publish-kernel-scala publish-kernel-image-puller

clean-images: clean-enterprise-gateway-demo clean-nb2kg clean-demo-base clean-enterprise-gateway clean-kernel-images
clean-images: clean-enterprise-gateway-demo clean-nb2kg clean-demo-base clean-enterprise-gateway clean-kernel-image-puller clean-kernel-images
clean-kernel-images: clean-kernel-py clean-kernel-spark-py clean-kernel-tf-py clean-kernel-tf-gpu-py clean-kernel-r clean-kernel-spark-r clean-kernel-scala

# Extra dependencies for each docker image...
DEPENDS_nb2kg:
DEPENDS_demo-base:
DEPENDS_enterprise-gateway-demo: demo-base
DEPENDS_enterprise-gateway: $(FILE_kernelspecs_all)
DEPENDS_kernel-image-puller:
DEPENDS_kernel-py DEPENDS_kernel-spark-py DEPENDS_kernel-r DEPENDS_kernel-spark-r DEPENDS_kernel-scala DEPENDS_kernel-tf-py DEPENDS_kernel-tf-gpu-py: $(FILE_kernelspecs_kubernetes) $(FILE_kernelspecs_docker)

# Extra targets for each docker image...
TARGETS_nb2kg:
TARGETS_demo-base:
TARGETS_kernel-image-puller:
TARGETS_enterprise-gateway TARGETS_enterprise-gateway-demo: kernelspecs
	@make -C .. bdist
TARGETS_kernel-py TARGETS_kernel-spark-py TARGETS_kernel-r TARGETS_kernel-spark-r TARGETS_kernel-scala TARGETS_kernel-tf-py TARGETS_kernel-tf-gpu-py: kernelspecs

# Extra files for each docker image...
FILES_nb2kg :=
FILES_demo-base :=
FILES_kernel-image-puller :=
FILES_enterprise-gateway-demo := ../dist/jupyter_enterprise_gateway_kernelspecs-* ../dist/jupyter_enterprise_gateway*.whl
FILES_enterprise-gateway := ../dist/jupyter_enterprise_gateway_kernel_image_files* ../dist/jupyter_enterprise_gateway_kernelspecs-* ../dist/jupyter_enterprise_gateway*.whl
FILES_kernel-py := ../dist/jupyter_enterprise_gateway_kernel_image_files*
18 changes: 18 additions & 0 deletions etc/docker/kernel-image-puller/Dockerfile
@@ -0,0 +1,18 @@
FROM python:3

WORKDIR /usr/src/app

COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

COPY kernel_image_puller.py ./

# The following environment variables are supported - defaults provided. Override as needed.
ENV KIP_GATEWAY_HOST http://localhost:8888
ENV KIP_INTERVAL 300
ENV KIP_LOG_LEVEL INFO
ENV KIP_NUM_PULLERS 2
ENV KIP_NUM_RETRIES 3
ENV KIP_PULL_POLICY 'IfNotPresent'

CMD [ "python", "./kernel_image_puller.py" ]
20 changes: 20 additions & 0 deletions etc/docker/kernel-image-puller/README.md
@@ -0,0 +1,20 @@
This image is responsible for contacting the configured [Jupyter Enterprise Gateway](http://jupyter-enterprise-gateway.readthedocs.io/en/latest/) instance within a Kubernetes or Docker Swarm cluster and pulling the set of kernel-based images to the node on which it is running.

# What it Gives You
* The ability to add new nodes and have kernel images on those nodes automatically populated.
* The ability to configure new kernelspecs that use different images and have those images pulled to all cluster nodes.

# Basic Use
Deploy [enterprise-gateway](https://hub.docker.com/r/elyra/enterprise-gateway/) per its instructions, configured for the appropriate environment.

As part of that deployment, Kernel Image Puller (KIP) will be launched on each node. On Kubernetes, this will be accomplished via a DaemonSet. On Docker Swarm, it will be via a global service. KIP will then contact the configured Enterprise Gateway instance, fetch the set of in-use kernelspecs, parse out the image names and pull those images.
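On Docker Swarm, a global service equivalent might look like the following sketch (the service name, gateway address, and tag are illustrative):
```
docker service create --name kernel-image-puller --mode global \
  --mount type=bind,src=/var/run/docker.sock,dst=/var/run/docker.sock \
  -e KIP_GATEWAY_HOST=http://<gateway-host>:8888 \
  elyra/kernel-image-puller:dev
```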

There are a few points of configuration listed below - all of which are environment variables (defaults in parentheses).
* `KIP_GATEWAY_HOST` (`http://localhost:8888`)
* `KIP_INTERVAL` (`300`)
* `KIP_LOG_LEVEL` (`INFO`)
* `KIP_NUM_PULLERS` (`2`)
* `KIP_NUM_RETRIES` (`3`)
* `KIP_PULL_POLICY` (`IfNotPresent`)

For more information, check our [repo](https://github.com/jupyter/enterprise_gateway) and [docs](http://jupyter-enterprise-gateway.readthedocs.io/en/latest/).
168 changes: 168 additions & 0 deletions etc/docker/kernel-image-puller/kernel_image_puller.py
@@ -0,0 +1,168 @@
import os
import logging
import time
import queue
import requests
from threading import Thread
from docker.client import DockerClient
from docker.errors import NotFound, APIError

gateway_host = os.getenv("KIP_GATEWAY_HOST", "http://localhost:8888")
num_pullers = int(os.getenv("KIP_NUM_PULLERS", "2"))
num_retries = int(os.getenv("KIP_NUM_RETRIES", "3"))
interval = int(os.getenv("KIP_INTERVAL", "300"))
log_level = os.getenv("KIP_LOG_LEVEL", "INFO")

POLICY_IF_NOT_PRESENT = "IfNotPresent"
POLICY_ALWAYS = "Always"
policies = (POLICY_IF_NOT_PRESENT, POLICY_ALWAYS)

policy = os.getenv("KIP_PULL_POLICY", POLICY_IF_NOT_PRESENT)

docker_client = DockerClient.from_env()

logging.basicConfig(format='[%(levelname)1.1s %(asctime)s %(name)s.%(threadName)s] %(message)s')


def get_kernelspecs():
    """Fetches the set of kernelspecs from the gateway, returning a dict of configured kernel specs"""
    end_point = '{}/api/kernelspecs'.format(gateway_host)
    logger.info("Fetching kernelspecs from '{}' ...".format(end_point))
    resp = requests.get(end_point)
    if not resp.ok:
        raise requests.exceptions.HTTPError('Gateway server response: {}'.format(resp.status_code))
    return resp.json()


def fetch_image_names():
    """
    Fetches the image names by hitting the /api/kernelspecs endpoint of the Gateway.

    For process-proxy kernelspecs, the image names are contained in the config stanza - which
    resides in the process-proxy stanza located in the metadata.
    """

    kspecs = None
    try:
        kspecs_response = get_kernelspecs()
        kspecs = kspecs_response.get('kernelspecs')
    except Exception as ex:
        logger.error("Got exception attempting to retrieve kernelspecs - retrying. Exception was: {}".format(ex))
    finally:
        if kspecs is None:
            return False

    # Locate the configured images within the kernelspecs and add to set for duplicate management
    images = set()
    for key in kspecs.keys():
        metadata = kspecs.get(key).get('spec').get('metadata')
        if metadata is not None:
            process_proxy = metadata.get('process_proxy')
            if process_proxy is not None:
                config = process_proxy.get('config')
                if config is not None:
                    image_name = config.get('image_name')
                    if image_name is not None:
                        images.add(image_name)
                    executor_image_name = config.get('executor_image_name')
                    if executor_image_name is not None:
                        images.add(executor_image_name)

    # Add the image names to the name queue
    for image_name in images:
        name_queue.put_nowait(image_name)

    return True


def pull_image(image_name):
    """
    Pulls the image. If the policy is `IfNotPresent` the set of pulled image names is
    checked and, if present, the method returns. Otherwise, the pull attempt is made
    and the set of pulled images is updated, when successful.

    Since NotFound exceptions are tolerated, we trap for only that exception and let
    the caller handle others.
    """
    if policy == POLICY_IF_NOT_PRESENT:
        if image_name in pulled_images:
            logger.info("Image '{}' already pulled and policy is '{}'.".format(image_name, policy))
            return

    logger.debug("Pulling image '{}'...".format(image_name))
    try:
        t1 = time.time()
        docker_client.images.pull(image_name)
        t2 = time.time()
        pulled_images.add(image_name)
        logger.info("Pulled image '{}' in {:.3f} secs.".format(image_name, t2 - t1))
    except NotFound:
        logger.warning("Image '{}' was not found!".format(image_name))


def puller():
    """
    Thread-based puller. Gets image name from the queue and attempts to pull the image.
    Any issues, except for NotFound, are retried up to num_retries times.
    Once the image has been pulled, it's not found or the retries have been exceeded,
    the queue task is marked as done.
    """
    while True:
        image_name = name_queue.get()
        if image_name is None:
            break

        i = 0
        while i < num_retries:
            try:
                pull_image(image_name)
                break
            except APIError as ex:
                i += 1
                if i < num_retries:
                    logger.warning("Attempt {} to pull image '{}' encountered exception - retrying. Exception was: {}".
                                   format(i, image_name, ex))
                else:
                    logger.error("Attempt {} to pull image '{}' failed with exception: {}".
                                 format(i, image_name, ex))
        name_queue.task_done()


if __name__ == "__main__":

    logger = logging.getLogger('kernel_image_puller')
    logger.setLevel(log_level)

    # Determine pull policy.
    pulled_images = set()
    if policy not in policies:
        logger.warning("Invalid pull policy detected in KIP_PULL_POLICY: '{}'. Using policy '{}'.".
                       format(policy, POLICY_IF_NOT_PRESENT))
        policy = POLICY_IF_NOT_PRESENT

    logger.info("Starting Kernel Image Puller with the following parameters:")
    logger.info("KIP_GATEWAY_HOST: {}".format(gateway_host))
    logger.info("KIP_INTERVAL: {} secs".format(interval))
    logger.info("KIP_NUM_PULLERS: {}".format(num_pullers))
    logger.info("KIP_NUM_RETRIES: {}".format(num_retries))
    logger.info("KIP_PULL_POLICY: {}".format(policy))
    logger.info("KIP_LOG_LEVEL: {}\n".format(log_level))

    # Create an empty queue and start the puller threads. The number of puller threads is configurable.
    name_queue = queue.Queue()
    threads = []
    for i in range(num_pullers):
        t = Thread(target=puller, name="t{}".format(i + 1))
        t.start()
        threads.append(t)

    # Fetch the image names, then wait for name queue to drain. Once drained, or if there were issues
    # fetching the image names, wait the interval number of seconds and perform the operation again.
    while True:
        fetched = fetch_image_names()
        if fetched:
            name_queue.join()
            logger.info("Images pulled. Sleeping {} seconds...\n".format(interval))
        else:
            logger.info("Sleeping {} seconds to fetch image names...\n".format(interval))
        time.sleep(interval)
2 changes: 2 additions & 0 deletions etc/docker/kernel-image-puller/requirements.txt
@@ -0,0 +1,2 @@
docker>=3.7.2
requests>=2.7,<3.0