Merge pull request #4036 from Hyaxia/metrics_terminal

Added metrics for currently running terminals and labeled by type kernels
8 years ago · dfcea5b3f7
parent fa7b40be27 4c1d62f3b2
commit dfcea5b3f7
7 changed files with 69 additions and 17 deletions
--- a/appveyor.yml
+++ b/appveyor.yml
@ -17,7 +17,10 @@ install:
  - cmd: conda config --set show_channel_urls true
  - cmd: conda config --add channels conda-forge
  #- cmd: conda update --yes --quiet conda
-  - cmd: conda install -y pyzmq tornado jupyter_client nbformat nbconvert ipykernel pip nodejs nose
+  - cmd: conda install -y pyzmq tornado jupyter_client nbformat ipykernel pip nodejs nose
+  # not using `conda install -y` for the nbconvert package because there is
+  # currently a bug in the version that conda installs, so we install it with pip instead
+  - cmd: pip install nbconvert
  - cmd: python setup.py build
  - cmd: pip install .[test]

--- a/notebook/log.py
+++ b/notebook/log.py
@ -7,7 +7,8 @@

 import json
 from tornado.log import access_log
-from .metrics import prometheus_log_method
+from .prometheus.log_functions import prometheus_log_method
+

 def log_request(handler):
    """log a bit more information about each request than tornado's default
--- a/notebook/prometheus/init.py
+++ b/notebook/prometheus/init.py
@ -0,0 +1,4 @@
+"""
+A package containing all the functionality and
+configuration connected to the prometheus metrics
+"""
--- a/notebook/prometheus/log_functions.py
+++ b/notebook/prometheus/log_functions.py
@ -1,18 +1,5 @@
-"""
-Prometheus metrics exported by Jupyter Notebook Server
+from ..prometheus.metrics import HTTP_REQUEST_DURATION_SECONDS

-Read https://prometheus.io/docs/practices/naming/ for naming
-conventions for metrics & labels.
-"""
-
-from prometheus_client import Histogram
-
-# This is a fairly standard name for HTTP duration latency reporting
-HTTP_REQUEST_DURATION_SECONDS = Histogram(
-    'http_request_duration_seconds',
-    'duration in seconds for all HTTP requests',
-    ['method', 'handler', 'status_code'],
-)

 def prometheus_log_method(handler):
    """
--- a/notebook/prometheus/metrics.py
+++ b/notebook/prometheus/metrics.py
@ -0,0 +1,27 @@
+"""
+Prometheus metrics exported by Jupyter Notebook Server
+
+Read https://prometheus.io/docs/practices/naming/ for naming
+conventions for metrics & labels.
+"""
+
+
+from prometheus_client import Histogram, Gauge
+
+
+HTTP_REQUEST_DURATION_SECONDS = Histogram(
+    'http_request_duration_seconds',
+    'duration in seconds for all HTTP requests',
+    ['method', 'handler', 'status_code'],
+)
+
+TERMINAL_CURRENTLY_RUNNING_TOTAL = Gauge(
+    'terminal_currently_running_total',
+    'counter for how many terminals are running',
+)
+
+KERNEL_CURRENTLY_RUNNING_TOTAL = Gauge(
+    'kernel_currently_running_total',
+    'counter for how many kernels are running labeled by type',
+    ['type']
+)
--- a/notebook/services/kernels/kernelmanager.py
+++ b/notebook/services/kernels/kernelmanager.py
@ -26,6 +26,8 @@ from notebook.utils import to_os_path, exists
 from notebook._tz import utcnow, isoformat
 from ipython_genutils.py3compat import getcwd

+from notebook.prometheus.metrics import KERNEL_CURRENTLY_RUNNING_TOTAL
+

 class MappingKernelManager(MultiKernelManager):
    """A KernelManager that handles notebook mapping and HTTP error handling"""
@ -168,6 +170,13 @@ class MappingKernelManager(MultiKernelManager):
                lambda : self._handle_kernel_died(kernel_id),
                'dead',
            )
+
+            # Increase the metric of number of kernels running
+            # for the relevant kernel type by 1
+            KERNEL_CURRENTLY_RUNNING_TOTAL.labels(
+                type=self._kernels[kernel_id].kernel_name
+            ).inc()
+
        else:
            self._check_kernel_id(kernel_id)
            self.log.info("Using existing kernel: %s" % kernel_id)
@ -278,6 +287,13 @@ class MappingKernelManager(MultiKernelManager):
        self.stop_buffering(kernel_id)
        self._kernel_connections.pop(kernel_id, None)
        self.last_kernel_activity = utcnow()
+
+        # Decrease the metric of number of kernels
+        # running for the relevant kernel type by 1
+        KERNEL_CURRENTLY_RUNNING_TOTAL.labels(
+            type=self._kernels[kernel_id].kernel_name
+        ).dec()
+
        return super(MappingKernelManager, self).shutdown_kernel(kernel_id, now=now)

    def restart_kernel(self, kernel_id):
--- a/notebook/terminal/api_handlers.py
+++ b/notebook/terminal/api_handlers.py
@ -1,7 +1,8 @@
 import json
 from tornado import web, gen
 from ..base.handlers import APIHandler
-from ..utils import url_path_join
+from ..prometheus.metrics import TERMINAL_CURRENTLY_RUNNING_TOTAL
+

 class TerminalRootHandler(APIHandler):
    @web.authenticated
@ -10,12 +11,20 @@ class TerminalRootHandler(APIHandler):
        terms = [{'name': name} for name in tm.terminals]
        self.finish(json.dumps(terms))

+        # Update the metric below to the length of the list 'terms'
+        TERMINAL_CURRENTLY_RUNNING_TOTAL.set(
+            len(terms)
+        )
+
    @web.authenticated
    def post(self):
        """POST /terminals creates a new terminal and redirects to it"""
        name, _ = self.terminal_manager.new_named_terminal()
        self.finish(json.dumps({'name': name}))

+        # Increase the metric by one because a new terminal was created
+        TERMINAL_CURRENTLY_RUNNING_TOTAL.inc()
+

 class TerminalHandler(APIHandler):
    SUPPORTED_METHODS = ('GET', 'DELETE')
@ -36,5 +45,10 @@ class TerminalHandler(APIHandler):
            yield tm.terminate(name, force=True)
            self.set_status(204)
            self.finish()
+
+            # Decrease the metric below by one
+            # because a terminal has been shut down
+            TERMINAL_CURRENTLY_RUNNING_TOTAL.dec()
+
        else:
            raise web.HTTPError(404, "Terminal not found: %r" % name)