Report the memory usage metrics as Prometheus metrics #22

Merged 2 commits on Feb 15, 2020

102 changes: 11 additions & 91 deletions nbresuse/__init__.py
@@ -1,100 +1,17 @@
import os
import json
import psutil

from tornado import ioloop
from traitlets import Bool, Float, Int, Union, default
from traitlets.config import Configurable
from notebook.utils import url_path_join
from notebook.base.handlers import IPythonHandler
from tornado import web

from nbresuse.prometheus import PrometheusHandler
Contributor:

After merging this, I might want to make Prometheus optional: report the metrics this way when prometheus_client is available, and fall back to the old "prometheus-stupid" approach when it is not.

Some people I have talked to said that they don't want to install Prometheus. I don't really understand why, since, to be honest, I don't know anything about Prometheus.

If you could implement this logic yourself (see e.g. this StackOverflow answer: https://stackoverflow.com/a/24640526), it would be very helpful and save me some time after merging this PR. That said, if it's not clear what I'm requesting, I can spend some hours implementing it myself.
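A minimal sketch of the optional-dependency gating this comment asks for, not part of this PR; the helper name start_prometheus_reporting is hypothetical:

```python
# Sketch only, not part of this PR: detect whether prometheus_client is
# importable and only wire up the PrometheusHandler callback when it is.
try:
    import prometheus_client  # noqa: F401
    PROMETHEUS_AVAILABLE = True
except ImportError:
    PROMETHEUS_AVAILABLE = False


def start_prometheus_reporting(nbapp):
    """Hypothetical helper, to be called from load_jupyter_server_extension."""
    if not PROMETHEUS_AVAILABLE:
        nbapp.log.warning("prometheus_client is not installed; "
                          "resource metrics will not be reported")
        return None
    from tornado import ioloop
    from nbresuse.prometheus import PrometheusHandler
    callback = ioloop.PeriodicCallback(PrometheusHandler(nbapp), 1000)
    callback.start()
    return callback
```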


try:
    # Traitlets >= 4.3.3
    from traitlets import Callable
except ImportError:
    from .utils import Callable

from concurrent.futures import ThreadPoolExecutor
from tornado.concurrent import run_on_executor

class MetricsHandler(IPythonHandler):
    def initialize(self):
        super().initialize()
        self.cpu_percent = 0

        # https://www.tornadoweb.org/en/stable/concurrent.html#tornado.concurrent.run_on_executor
        self.executor = ThreadPoolExecutor(max_workers=10)

        self.cpu_count = psutil.cpu_count()

    @run_on_executor
    def update_cpu_percent(self, all_processes):

        def get_cpu_percent(p):
            try:
                return p.cpu_percent(interval=0.05)
            # Avoid littering logs with stack traces complaining
            # about dead processes having no CPU usage
            except:
                return 0

        return sum([get_cpu_percent(p) for p in all_processes])

    @web.authenticated
    async def get(self):
        """
        Calculate and return current resource usage metrics
        """
        config = self.settings['nbresuse_display_config']
        cur_process = psutil.Process()
        all_processes = [cur_process] + cur_process.children(recursive=True)
        limits = {}

        # Get memory information
        rss = sum([p.memory_info().rss for p in all_processes])

        if callable(config.mem_limit):
            mem_limit = config.mem_limit(rss=rss)
        else: # mem_limit is an Int
            mem_limit = config.mem_limit

        # A better approach would use cpu_affinity to account for the
        # fact that the number of logical CPUs in the system is not
        # necessarily the same as the number of CPUs the process
        # can actually use. But cpu_affinity isn't available for OS X.
        cpu_count = psutil.cpu_count()

        if config.track_cpu_percent:
            self.cpu_percent = await self.update_cpu_percent(all_processes)

        if config.mem_limit != 0:
            limits['memory'] = {
                'rss': mem_limit
            }
            if config.mem_warning_threshold != 0:
                limits['memory']['warn'] = (mem_limit - rss) < (mem_limit * config.mem_warning_threshold)

        # Optionally get CPU information
        if config.track_cpu_percent:
            self.cpu_percent = await self.update_cpu_percent(all_processes)

            if config.cpu_limit != 0:
                limits['cpu'] = {
                    'cpu': config.cpu_limit
                }
                if config.cpu_warning_threshold != 0:
                    limits['cpu']['warn'] = (config.cpu_limit - self.cpu_percent) < (config.cpu_limit * config.cpu_warning_threshold)

        metrics = {
            'rss': rss,
            'limits': limits,
        }
        if config.track_cpu_percent:
            metrics.update(cpu_percent=self.cpu_percent,
                           cpu_count=self.cpu_count)

        self.log.debug("NBResuse metrics: %s", metrics)
        self.write(json.dumps(metrics))
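For reference, the JSON written by this (now removed) handler has roughly the following shape; the values below are illustrative only:

```python
# Illustrative only: approximate shape of the old /metrics JSON response
# produced by MetricsHandler above (numbers are made up).
example_metrics = {
    "rss": 157286400,            # bytes used by the server and its children
    "limits": {
        "memory": {
            "rss": 1073741824,   # the configured mem_limit, in bytes
            "warn": False,       # True once usage crosses the warning threshold
        },
    },
    # Present only when track_cpu_percent is enabled:
    "cpu_percent": 12.5,
    "cpu_count": 4,
}
```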


def _jupyter_server_extension_paths():
    """
@@ -104,6 +21,7 @@ def _jupyter_server_extension_paths():
        'module': 'nbresuse',
    }]


def _jupyter_nbextension_paths():
    """
    Set up the notebook extension for displaying metrics
@@ -115,6 +33,7 @@ def _jupyter_nbextension_paths():
        "require": "nbresuse/main"
    }]


class ResourceUseDisplay(Configurable):
    """
    Holds server-side configuration for nbresuse
@@ -142,7 +61,7 @@ class ResourceUseDisplay(Configurable):
        Note that this does not actually limit the user's memory usage!

        Defaults to reading from the `MEM_LIMIT` environment variable. If
        set to 0, no memory limit is displayed.
        set to 0, the max memory available is displayed.
        """
    ).tag(config=True)

@@ -178,19 +97,20 @@ def _mem_limit_default(self):
        Note that this does not actually limit the user's CPU usage!

        Defaults to reading from the `CPU_LIMIT` environment variable. If
        set to 0, no CPU usage limit is displayed.
        set to 0, the total CPU count available is displayed.
        """
    ).tag(config=True)

    @default('cpu_limit')
    def _cpu_limit_default(self):
        return float(os.environ.get('CPU_LIMIT', 0))


def load_jupyter_server_extension(nbapp):
    """
    Called during notebook start
    """
    resuseconfig = ResourceUseDisplay(parent=nbapp)
    nbapp.web_app.settings['nbresuse_display_config'] = resuseconfig
    route_pattern = url_path_join(nbapp.web_app.settings['base_url'], '/metrics')
    nbapp.web_app.add_handlers('.*', [(route_pattern, MetricsHandler)])
    callback = ioloop.PeriodicCallback(PrometheusHandler(nbapp), 1000)
    callback.start()
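Since ResourceUseDisplay is a Configurable, the limits and CPU tracking can be set from the notebook configuration file; a minimal sketch using the trait names that appear in this diff, with illustrative values:

```python
# jupyter_notebook_config.py -- sketch only; trait names taken from this diff.
c = get_config()  # provided by Jupyter when this file is loaded

# Report a 1 GiB memory limit (in bytes) instead of the machine's total memory.
c.ResourceUseDisplay.mem_limit = 1024 * 1024 * 1024

# Also collect and report CPU usage.
c.ResourceUseDisplay.track_cpu_percent = True
```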
47 changes: 47 additions & 0 deletions nbresuse/metrics.py
@@ -0,0 +1,47 @@
from typing import NamedTuple

import psutil


class MemoryMetrics(NamedTuple):
    current_memory: int
    max_memory: int


class CPUMetrics(NamedTuple):
    cpu_max: float
    cpu_usage: float


def memory_metrics() -> MemoryMetrics:
    cur_process = psutil.Process()
    all_processes = [cur_process] + cur_process.children(recursive=True)

    rss = sum([p.memory_info().rss for p in all_processes])
    virtual_memory = psutil.virtual_memory()

    return MemoryMetrics(
        rss,
        virtual_memory.total
    )


def cpu_metrics() -> CPUMetrics:
    cur_process = psutil.Process()
    all_processes = [cur_process] + cur_process.children(recursive=True)

    cpu_count = psutil.cpu_count()

    def get_cpu_percent(p):
        try:
            return p.cpu_percent(interval=0.05)
        # Avoid littering logs with stack traces complaining
        # about dead processes having no CPU usage
        except:
            return 0
    cpu_percent = sum([get_cpu_percent(p) for p in all_processes])

    return CPUMetrics(
        cpu_count * 100.0,
        cpu_percent
    )
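A quick way to exercise these helpers on their own (a sketch, assuming nbresuse with this PR is importable in the current environment):

```python
# Minimal sketch: call the new helpers directly and print their fields.
from nbresuse.metrics import cpu_metrics, memory_metrics

mem = memory_metrics()
print(f"rss: {mem.current_memory} bytes of {mem.max_memory} bytes total")

cpu = cpu_metrics()
# cpu_max is cpu_count * 100.0, i.e. 100% per logical CPU
print(f"cpu: {cpu.cpu_usage:.1f}% of {cpu.cpu_max:.0f}% available")
```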
64 changes: 64 additions & 0 deletions nbresuse/prometheus.py
@@ -0,0 +1,64 @@
from notebook.notebookapp import NotebookApp
Contributor:

I appreciate you using the same Callable here that was used elsewhere. Ultimately it probably doesn't matter from a functional point of view (i.e. no substantive difference), but I imagine that future maintainers will disproportionately be people (like myself) who have more experience with the "Jupyter-y" tools like traitlets than with other implementations of the same features, so sticking with those might enhance the code's long-term maintainability.

So anyway, in the interests of long-term maintainability, I am glad that you did that.

I also like the more sophisticated file structure (e.g. instead of squeezing everything into __init__.py); that sort of separation of concerns will also improve NBResuse's long-term maintainability, which would likely be important if it were ever to be merged into JupyterLab.

from prometheus_client import Gauge
from tornado import gen

from nbresuse.metrics import CPUMetrics, MemoryMetrics, cpu_metrics, memory_metrics

try:
    # Traitlets >= 4.3.3
    from traitlets import Callable
except ImportError:
    from .utils import Callable

TOTAL_MEMORY_USAGE = Gauge(
    'total_memory_usage',
    'counter for total memory usage',
    []
)

MAX_MEMORY_USAGE = Gauge(
    'max_memory_usage',
    'counter for max memory usage',
    []
)

TOTAL_CPU_USAGE = Gauge(
    'total_cpu_usage',
    'counter for total cpu usage',
    []
)

MAX_CPU_USAGE = Gauge(
    'max_cpu_usage',
    'counter for max cpu usage',
    []
)


class PrometheusHandler(Callable):
    def __init__(self, nbapp: NotebookApp):
        super().__init__()
        self.config = nbapp.web_app.settings['nbresuse_display_config']
        self.session_manager = nbapp.session_manager

    @gen.coroutine
    def __call__(self, *args, **kwargs):
        metrics = self.apply_memory_limits(memory_metrics())
        TOTAL_MEMORY_USAGE.set(metrics.current_memory)
        MAX_MEMORY_USAGE.set(metrics.max_memory)
        if self.config.track_cpu_percent:
            metrics = self.apply_cpu_limits(cpu_metrics())
            TOTAL_CPU_USAGE.set(metrics.cpu_usage)
            MAX_CPU_USAGE.set(metrics.cpu_max)

    def apply_memory_limits(self, metrics: MemoryMetrics) -> MemoryMetrics:
        # MemoryMetrics is a NamedTuple, so its fields cannot be assigned in
        # place; build an updated copy with _replace instead. The callable
        # form of mem_limit expects the current RSS, as in the old handler.
        if callable(self.config.mem_limit):
            metrics = metrics._replace(
                max_memory=self.config.mem_limit(rss=metrics.current_memory))
        elif self.config.mem_limit > 0:  # mem_limit is an Int
            metrics = metrics._replace(max_memory=self.config.mem_limit)
        return metrics

    def apply_cpu_limits(self, metrics: CPUMetrics) -> CPUMetrics:
        if self.config.cpu_limit > 0:
            metrics = metrics._replace(cpu_max=self.config.cpu_limit)
        return metrics
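Once the periodic callback registered in load_jupyter_server_extension has run, these gauge values live in prometheus_client's default registry, the same registry behind the /metrics endpoint that main.js polls. A small sketch for checking them in-process, assuming the extension is loaded:

```python
# Sketch: read the nbresuse gauges back from prometheus_client's default
# registry after PrometheusHandler has run at least once.
from prometheus_client import REGISTRY

def current_nbresuse_gauges():
    names = ("total_memory_usage", "max_memory_usage",
             "total_cpu_usage", "max_cpu_usage")
    # get_sample_value returns None for gauges that were never registered.
    return {name: REGISTRY.get_sample_value(name) for name in names}

print(current_nbresuse_gauges())
```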
63 changes: 44 additions & 19 deletions nbresuse/static/main.js
@@ -1,4 +1,7 @@
define(['jquery', 'base/js/utils'], function ($, utils) {
define([
    'jquery',
    'base/js/utils'
], function ($, utils) {
    function setupDOM() {
        $('#maintoolbar-container').append(
            $('<div>').attr('id', 'nbresuse-display')
@@ -20,32 +23,54 @@ define(['jquery', 'base/js/utils'], function ($, utils) {
        );
    }

    function humanFileSize(size) {
        var i = Math.floor( Math.log(size) / Math.log(1024) );
        return ( size / Math.pow(1024, i) ).toFixed(1) * 1 + ' ' + ['B', 'kB', 'MB', 'GB', 'TB'][i];
    }


Contributor:

So is the idea of this function to automatically parse the JSON emitted by the backend NBResuse server extension and format it correctly?

It does seem to substantially simplify the displayMetrics function, so it seems like a good idea to me. I guess I'm just somewhat unclear about its purpose based on the function name only.
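For context: the response body that metric() scans is the Prometheus text exposition format ("name{labels} value" per line) rather than JSON. A rough Python equivalent of the extraction, with an illustrative sample; not part of this PR:

```python
import re

# Illustrative only: a fragment of the text served at /metrics, and a rough
# Python equivalent of the regex extraction done by metric() in main.js.
sample = """\
# HELP total_memory_usage counter for total memory usage
# TYPE total_memory_usage gauge
total_memory_usage 1.572864e+08
max_memory_usage 8.589934592e+09
"""

def metric(name, text):
    # Match "name{labels} value" or "name value" at the start of a line.
    match = re.search(r"^%s\{?([^ \}]*)\}? (.*)$" % name, text, re.MULTILINE)
    return float(match.group(2)) if match else None

print(metric("total_memory_usage", sample))  # 157286400.0
```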

    function metric(metric_name, text, multiple=false) {
        var regex = new RegExp("^" + metric_name + "\{?([^ \}]*)\}? (.*)$", "gm");
        var matches = [];
        var match;

        do {
            match = regex.exec(text);
            if (match) {
                matches.push(match);
            }
        } while (match);

        if (!multiple) {
            if (matches.length > 0)
                return matches[0];
            return null;
        } else
            return matches;
    }

    var displayMetrics = function() {
        if (document.hidden) {
            // Don't poll when nobody is looking
            return;
        }
Contributor:

Would this code be easily modifiable to also report CPU usage information when that is available?

If not, don't worry about it; that would just be a nice extra. I realize that since we can't assume or guarantee that the CPU information will be available for the client to parse, the logic required to implement this would probably be substantially more complex.

        $.getJSON(utils.get_body_data('baseUrl') + 'metrics', function(data) {
            // FIXME: Proper setups for MB and GB. MB should have 0 things
            // after the ., but GB should have 2.
            var display = Math.round(data['rss'] / (1024 * 1024));
        $.ajax({
            url: utils.get_body_data('baseUrl') + 'metrics',
            success: function(data) {
                let totalMemoryUsage = metric("total_memory_usage", data);
                let maxMemoryUsage = metric("max_memory_usage", data);

            var limits = data['limits'];
            if ('memory' in limits) {
                if ('rss' in limits['memory']) {
                    display += " / " + Math.round(limits['memory']['rss'] / (1024 * 1024));
                }
                if (limits['memory']['warn']) {
                    $('#nbresuse-display').addClass('nbresuse-warn');
                } else {
                    $('#nbresuse-display').removeClass('nbresuse-warn');
                }
            }
                if (data['limits']['memory'] !== null) {
                    if (!totalMemoryUsage || !maxMemoryUsage)
                        return;
                    totalMemoryUsage = humanFileSize(parseFloat(totalMemoryUsage[2]));
                    maxMemoryUsage = humanFileSize(parseFloat(maxMemoryUsage[2]));

                    var display = totalMemoryUsage + "/" + maxMemoryUsage;
                    $('#nbresuse-mem').text(display);
                }
            $('#nbresuse-mem').text(display + ' MB');
        });
            }
    };

    var load_ipython_extension = function () {
        setupDOM();