-
Notifications
You must be signed in to change notification settings - Fork 102
report the memory usage metrics as prometheus metrics #22
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
from typing import NamedTuple | ||
krinsman marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
import psutil | ||
|
||
|
||
class MemoryMetrics(NamedTuple): | ||
current_memory: int | ||
max_memory: int | ||
|
||
|
||
class CPUMetrics(NamedTuple): | ||
cpu_max: float | ||
cpu_usage: float | ||
|
||
|
||
def memory_metrics() -> MemoryMetrics: | ||
cur_process = psutil.Process() | ||
all_processes = [cur_process] + cur_process.children(recursive=True) | ||
|
||
rss = sum([p.memory_info().rss for p in all_processes]) | ||
virtual_memory = psutil.virtual_memory() | ||
|
||
return MemoryMetrics( | ||
rss, | ||
virtual_memory.total | ||
) | ||
|
||
|
||
def cpu_metrics() -> CPUMetrics: | ||
cur_process = psutil.Process() | ||
all_processes = [cur_process] + cur_process.children(recursive=True) | ||
|
||
cpu_count = psutil.cpu_count() | ||
|
||
def get_cpu_percent(p): | ||
try: | ||
return p.cpu_percent(interval=0.05) | ||
# Avoid littering logs with stack traces complaining | ||
# about dead processes having no CPU usage | ||
except: | ||
return 0 | ||
cpu_percent = sum([get_cpu_percent(p) for p in all_processes]) | ||
|
||
return CPUMetrics( | ||
cpu_count * 100.0, | ||
cpu_percent | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
from notebook.notebookapp import NotebookApp | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I appreciate you using the same So anyway in the interests of long-term maintainability I am glad that you did that. I also like the more sophisticated file structure (e.g. instead of squeezing everything into |
||
from prometheus_client import Gauge | ||
from tornado import gen | ||
|
||
from nbresuse.metrics import CPUMetrics, MemoryMetrics, cpu_metrics, memory_metrics | ||
|
||
krinsman marked this conversation as resolved.
Show resolved
Hide resolved
|
||
try: | ||
# Traitlets >= 4.3.3 | ||
from traitlets import Callable | ||
except ImportError: | ||
from .utils import Callable | ||
|
||
TOTAL_MEMORY_USAGE = Gauge( | ||
'total_memory_usage', | ||
'counter for total memory usage', | ||
[] | ||
) | ||
|
||
MAX_MEMORY_USAGE = Gauge( | ||
'max_memory_usage', | ||
'counter for max memory usage', | ||
[] | ||
) | ||
|
||
TOTAL_CPU_USAGE = Gauge( | ||
'total_cpu_usage', | ||
'counter for total cpu usage', | ||
[] | ||
) | ||
|
||
MAX_CPU_USAGE = Gauge( | ||
'max_cpu_usage', | ||
'counter for max cpu usage', | ||
[] | ||
) | ||
|
||
|
||
class PrometheusHandler(Callable): | ||
def __init__(self, nbapp: NotebookApp): | ||
super().__init__() | ||
self.config = nbapp.web_app.settings['nbresuse_display_config'] | ||
self.session_manager = nbapp.session_manager | ||
|
||
@gen.coroutine | ||
def __call__(self, *args, **kwargs): | ||
metrics = self.apply_memory_limits(memory_metrics()) | ||
TOTAL_MEMORY_USAGE.set(metrics.current_memory) | ||
MAX_MEMORY_USAGE.set(metrics.max_memory) | ||
if self.config.track_cpu_percent: | ||
metrics = self.apply_cpu_limits(cpu_metrics()) | ||
TOTAL_CPU_USAGE.set(metrics.cpu_usage) | ||
MAX_CPU_USAGE.set(metrics.cpu_max) | ||
|
||
def apply_memory_limits(self, metrics: MemoryMetrics) -> MemoryMetrics: | ||
if callable(self.config.mem_limit): | ||
metrics.max_memory = self.config.mem_limit(rss=metrics.max_memory) | ||
elif self.config.mem_limit > 0: # mem_limit is an Int | ||
metrics.max_memory = self.config.mem_limit | ||
return metrics | ||
|
||
def apply_cpu_limits(self, metrics: CPUMetrics) -> CPUMetrics: | ||
if self.config.cpu_limit > 0: | ||
metrics.cpu_max = self.config.cpu_limit | ||
return metrics |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,7 @@ | ||
define(['jquery', 'base/js/utils'], function ($, utils) { | ||
define([ | ||
'jquery', | ||
'base/js/utils' | ||
], function ($, utils) { | ||
function setupDOM() { | ||
$('#maintoolbar-container').append( | ||
$('<div>').attr('id', 'nbresuse-display') | ||
|
@@ -20,32 +23,54 @@ define(['jquery', 'base/js/utils'], function ($, utils) { | |
); | ||
} | ||
|
||
function humanFileSize(size) { | ||
witzatom marked this conversation as resolved.
Show resolved
Hide resolved
|
||
var i = Math.floor( Math.log(size) / Math.log(1024) ); | ||
return ( size / Math.pow(1024, i) ).toFixed(1) * 1 + ' ' + ['B', 'kB', 'MB', 'GB', 'TB'][i]; | ||
} | ||
|
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So is the idea of this function to automatically parse the JSON emitted by the backend NBResuse server extension and format it correctly? It does seem to substantially simplify the |
||
function metric(metric_name, text, multiple=false) { | ||
var regex = new RegExp("^" + metric_name + "\{?([^ \}]*)\}? (.*)$", "gm"); | ||
var matches = []; | ||
var match; | ||
|
||
do{ | ||
match = regex.exec(text); | ||
if (match){ | ||
matches.push(match) | ||
} | ||
} | ||
while (match); | ||
|
||
if (!multiple) { | ||
if (matches.length > 0) | ||
return matches[0]; | ||
return null; | ||
}else | ||
return matches; | ||
} | ||
|
||
var displayMetrics = function() { | ||
if (document.hidden) { | ||
// Don't poll when nobody is looking | ||
return; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would this code be easily modifiable to also report CPU usage information if that is also available? If not, don't worry about it, that would just be a nice extra plus, and I realize the fact that we can't assume/guarantee that the CPU information will be available for the client to parse probably increases the complexity of the logic required to implement this substantially. |
||
$.getJSON(utils.get_body_data('baseUrl') + 'metrics', function(data) { | ||
// FIXME: Proper setups for MB and GB. MB should have 0 things | ||
// after the ., but GB should have 2. | ||
var display = Math.round(data['rss'] / (1024 * 1024)); | ||
$.ajax({ | ||
url: utils.get_body_data('baseUrl') + 'metrics', | ||
success: function(data) { | ||
let totalMemoryUsage = metric("total_memory_usage", data); | ||
let maxMemoryUsage = metric("max_memory_usage", data); | ||
|
||
var limits = data['limits']; | ||
if ('memory' in limits) { | ||
if ('rss' in limits['memory']) { | ||
display += " / " + Math.round(limits['memory']['rss'] / (1024 * 1024)); | ||
} | ||
if (limits['memory']['warn']) { | ||
$('#nbresuse-display').addClass('nbresuse-warn'); | ||
} else { | ||
$('#nbresuse-display').removeClass('nbresuse-warn'); | ||
} | ||
} | ||
if (data['limits']['memory'] !== null) { | ||
if (!totalMemoryUsage || !maxMemoryUsage) | ||
return; | ||
totalMemoryUsage = humanFileSize(parseFloat(totalMemoryUsage[2])); | ||
maxMemoryUsage = humanFileSize(parseFloat(maxMemoryUsage[2])); | ||
|
||
var display = totalMemoryUsage + "/" + maxMemoryUsage; | ||
$('#nbresuse-mem').text(display); | ||
} | ||
$('#nbresuse-mem').text(display + ' MB'); | ||
}); | ||
} | ||
}; | ||
|
||
var load_ipython_extension = function () { | ||
setupDOM(); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After merging this, I might want to make prometheus optional by having the metrics reporting done this way if
prometheus_client
is available, and have it done with the old "prometheus-stupid" if not available.Some people I have talked to said that they don't want to install Prometheus, I don't really understand why since I don't know anything about Prometheus to be honest.
If you could maybe implement this logic yourself (see e.g. this StackOverflow answer https://stackoverflow.com/a/24640526 ) it would be very helpful for me and save me some time after merging this PR. That being said if it's not clear what I'm requesting I can spend some hours myself implementing it.