optimize openmetrics text parsing (~4x perf) (prometheus#402)

ahmed-mez · brian-brazil · commit 67402133513b · 2019-05-17T10:38:53.000+01:00
Signed-off-by: Ahmed Mezghani <ahmed.mezghani@outlook.com>
diff --git a/prometheus_client/openmetrics/parser.py b/prometheus_client/openmetrics/parser.py
@@ -3,6 +3,7 @@
 from __future__ import unicode_literals
 
 import math
+import re
 
 from ..metrics_core import Metric, METRIC_LABEL_NAME_RE
 from ..samples import Exemplar, Sample, Timestamp
@@ -24,6 +25,24 @@ def text_string_to_metric_families(text):
         yield metric_family
 
 
+ESCAPE_SEQUENCES = {
+    '\\\\': '\\',
+    '\\n': '\n',
+    '\\"': '"',
+}
+
+
+def _replace_escape_sequence(match):
+    return ESCAPE_SEQUENCES[match.group(0)]
+
+
+ESCAPING_RE = re.compile(r'\\[\\n"]')
+
+
+def _replace_escaping(s):
+    return ESCAPING_RE.sub(_replace_escape_sequence, s)
+
+
 def _unescape_help(text):
     result = []
     slash = False
@@ -83,14 +102,23 @@ def _parse_timestamp(timestamp):
             return ts
 
 
-def _parse_labels(it, text):
+def _is_character_escaped(s, charpos):
+    num_bslashes = 0
+    while (charpos > num_bslashes and
+           s[charpos - 1 - num_bslashes] == '\\'):
+        num_bslashes += 1
+    return num_bslashes % 2 == 1
+
+
+def _parse_labels_with_state_machine(text):
     # The { has already been parsed.
     state = 'startoflabelname'
     labelname = []
     labelvalue = []
     labels = {}
+    labels_len = 0
 
-    for char in it:
+    for char in text:
         if state == 'startoflabelname':
             if char == '}':
                 state = 'endoflabels'
@@ -141,37 +169,123 @@ def _parse_labels(it, text):
                 break
             else:
                 raise ValueError("Invalid line: " + text)
-    return labels
+        labels_len += 1
+    return labels, labels_len
+
+
+def _parse_labels(text):
+    labels = {}
+
+    # Raise error if we don't have valid labels
+    if text and "=" not in text:
+        raise ValueError
+
+    # Copy original labels
+    sub_labels = text
+    try:
+        # Process one label at a time
+        while sub_labels:
+            # The label name is before the equal
+            value_start = sub_labels.index("=")
+            label_name = sub_labels[:value_start]
+            sub_labels = sub_labels[value_start + 1:]
+
+            # Check for missing quotes 
+            if not sub_labels or sub_labels[0] != '"':
+                raise ValueError
+
+            # The first quote is guaranteed to be after the equal
+            value_substr = sub_labels[1:]
+
+            # Check for extra commas
+            if not label_name or label_name[0] == ',':
+                raise ValueError
+            if not value_substr or value_substr[-1] == ',':
+                raise ValueError
+
+            # Find the first unescaped quote, i.e. the one that closes the value
+            i = 0
+            while i < len(value_substr):
+                i = value_substr.index('"', i)
+                if not _is_character_escaped(value_substr[:i], i):
+                    break
+                i += 1
+
+            # The label value is in between the opening and closing quotes
+            quote_end = i + 1
+            label_value = sub_labels[1:quote_end]
+            # Replace escaping if needed
+            if "\\" in label_value:
+                label_value = _replace_escaping(label_value)
+            labels[label_name] = label_value
+
+            # Remove the processed label from the sub-slice for next iteration
+            sub_labels = sub_labels[quote_end + 1:]
+            if sub_labels.startswith(","):
+                next_comma = 1
+            else:
+                next_comma = 0
+            sub_labels = sub_labels[next_comma:]
+
+            # Check for missing commas
+            if sub_labels and next_comma == 0:
+                raise ValueError
+            
+        return labels
+
+    except ValueError:
+        raise ValueError("Invalid labels: " + text)
 
 
 def _parse_sample(text):
-    name = []
-    value = []
+    # Detect the labels in the text
+    label_start = text.find("{")
+    if label_start == -1:
+        # We don't have labels
+        name_end = text.index(" ")
+        name = text[:name_end]
+        # Parse the remaining text after the name
+        remaining_text = text[name_end + 1:]
+        value, timestamp, exemplar = _parse_remaining_text(remaining_text)
+        return Sample(name, {}, value, timestamp, exemplar)
+    # The name is before the labels
+    name = text[:label_start]
+    seperator = " # "
+    if text.count(seperator) == 0:
+        # Line doesn't contain an exemplar
+        # We can use `rindex` to find `label_end`
+        label_end = text.rindex("}")
+        label = text[label_start + 1:label_end]
+        labels = _parse_labels(label)
+    else:
+        # Line potentially contains an exemplar
+        # Fallback to parsing labels with a state machine
+        labels, labels_len = _parse_labels_with_state_machine(text[label_start + 1:])
+        label_end = labels_len + len(name)      
+    # Parsing labels succeeded, continue parsing the remaining text
+    remaining_text = text[label_end + 2:]
+    value, timestamp, exemplar = _parse_remaining_text(remaining_text)
+    return Sample(name, labels, value, timestamp, exemplar)
+
+
+def _parse_remaining_text(text):
+    split_text = text.split(" ", 1)
+    val = _parse_value(split_text[0])
+    if len(split_text) == 1:
+        # We don't have timestamp or exemplar
+        return val, None, None  
+
     timestamp = []
-    labels = {}
     exemplar_value = []
     exemplar_timestamp = []
     exemplar_labels = None
 
-    state = 'name'
+    state = 'timestamp'
+    text = split_text[1]
 
     it = iter(text)
     for char in it:
-        if state == 'name':
-            if char == '{':
-                labels = _parse_labels(it, text)
-                # Space has already been parsed.
-                state = 'value'
-            elif char == ' ':
-                state = 'value'
-            else:
-                name.append(char)
-        elif state == 'value':
-            if char == ' ':
-                state = 'timestamp'
-            else:
-                value.append(char)
-        elif state == 'timestamp':
+        if state == 'timestamp':
             if char == '#' and not timestamp:
                 state = 'exemplarspace'
             elif char == ' ':
@@ -190,13 +304,23 @@ def _parse_sample(text):
                 raise ValueError("Invalid line: " + text)
         elif state == 'exemplarstartoflabels':
             if char == '{':
-                exemplar_labels = _parse_labels(it, text)
-                # Space has already been parsed.
+                label_start, label_end = text.index("{"), text.rindex("}")
+                exemplar_labels = _parse_labels(text[label_start + 1:label_end])
+                state = 'exemplarparsedlabels'
+            else:
+                raise ValueError("Invalid line: " + text)
+        elif state == 'exemplarparsedlabels':
+            if char == '}':
+                state = 'exemplarvaluespace'
+        elif state == 'exemplarvaluespace':
+            if char == ' ':
                 state = 'exemplarvalue'
             else:
                 raise ValueError("Invalid line: " + text)
         elif state == 'exemplarvalue':
-            if char == ' ':
+            if char == ' ' and not exemplar_value:
+                raise ValueError("Invalid line: " + text)
+            elif char == ' ':
                 state = 'exemplartimestamp'
             else:
                 exemplar_value.append(char)
@@ -212,13 +336,9 @@ def _parse_sample(text):
         raise ValueError("Invalid line: " + text)
 
     # Incomplete exemplar.
-    if state in ['exemplarhash', 'exemplarspace', 'exemplarstartoflabels']:
+    if state in ['exemplarhash', 'exemplarspace', 'exemplarstartoflabels', 'exemplarparsedlabels']:
         raise ValueError("Invalid line: " + text)
 
-    if not value:
-        raise ValueError("Invalid line: " + text)
-    value = ''.join(value)
-    val = _parse_value(value)
     ts = _parse_timestamp(timestamp)
     exemplar = None
     if exemplar_labels is not None:
@@ -231,7 +351,7 @@ def _parse_sample(text):
             _parse_timestamp(exemplar_timestamp),
         )
 
-    return Sample(''.join(name), labels, val, ts, exemplar)
+    return val, ts, exemplar
 
 
 def _group_for_sample(sample, name, typ):
diff --git a/tests/openmetrics/test_parser.py b/tests/openmetrics/test_parser.py
@@ -374,6 +374,93 @@ def test_timestamps(self):
         b.add_metric([], 2, timestamp=Timestamp(1234567890, 0))
         self.assertEqual([a, b], list(families))
 
+    def test_hash_in_label_value(self):
+        families = text_string_to_metric_families("""# TYPE a counter
+# HELP a help
+a_total{foo="foo # bar"} 1
+a_total{foo="} foo # bar # "} 1
+# EOF
+""")
+        a = CounterMetricFamily("a", "help", labels=["foo"])
+        a.add_metric(["foo # bar"], 1)
+        a.add_metric(["} foo # bar # "], 1)
+        self.assertEqual([a], list(families))
+
+    def test_exemplars_with_hash_in_label_values(self):
+        families = text_string_to_metric_families("""# TYPE a histogram
+# HELP a help
+a_bucket{le="1.0",foo="bar # "} 0 # {a="b",foo="bar # bar"} 0.5
+a_bucket{le="2.0",foo="bar # "} 2 # {a="c",foo="bar # bar"} 0.5
+a_bucket{le="+Inf",foo="bar # "} 3 # {a="d",foo="bar # bar"} 4
+# EOF
+""")
+        hfm = HistogramMetricFamily("a", "help")
+        hfm.add_sample("a_bucket", {"le": "1.0", "foo": "bar # "}, 0.0, None, Exemplar({"a": "b", "foo": "bar # bar"}, 0.5))
+        hfm.add_sample("a_bucket", {"le": "2.0", "foo": "bar # "}, 2.0, None, Exemplar({"a": "c", "foo": "bar # bar"}, 0.5))
+        hfm.add_sample("a_bucket", {"le": "+Inf", "foo": "bar # "}, 3.0, None, Exemplar({"a": "d", "foo": "bar # bar"}, 4))
+        self.assertEqual([hfm], list(families))
+
+    @unittest.skipIf(sys.version_info < (3, 3), "Test requires Python 3.3+.")
+    def test_fallback_to_state_machine_label_parsing(self):
+        from unittest.mock import patch
+        from prometheus_client.openmetrics.parser import _parse_sample
+
+        parse_sample_function = "prometheus_client.openmetrics.parser._parse_sample"
+        parse_labels_function = "prometheus_client.openmetrics.parser._parse_labels"
+        parse_remaining_function = "prometheus_client.openmetrics.parser._parse_remaining_text"
+        state_machine_function = "prometheus_client.openmetrics.parser._parse_labels_with_state_machine"
+
+        parse_sample_return_value = Sample("a_total", {"foo": "foo # bar"}, 1)
+        with patch(parse_sample_function, return_value=parse_sample_return_value) as mock:
+            families = text_string_to_metric_families("""# TYPE a counter
+# HELP a help
+a_total{foo="foo # bar"} 1
+# EOF
+""")
+            a = CounterMetricFamily("a", "help", labels=["foo"])
+            a.add_metric(["foo # bar"], 1)
+            self.assertEqual([a], list(families))
+            mock.assert_called_once_with('a_total{foo="foo # bar"} 1')
+
+        # First fallback case
+        state_machine_return_values = [{"foo": "foo # bar"}, len('foo="foo # bar"}')]
+        parse_remaining_values = [1, None, None]
+        with patch(parse_labels_function) as mock1:
+            with patch(state_machine_function, return_value=state_machine_return_values) as mock2:
+                with patch(parse_remaining_function, return_value=parse_remaining_values) as mock3:
+                    sample = _parse_sample('a_total{foo="foo # bar"} 1')
+                    s = Sample("a_total", {"foo": "foo # bar"}, 1)
+                    self.assertEqual(s, sample)
+                    mock1.assert_not_called()
+                    mock2.assert_called_once_with('foo="foo # bar"} 1')
+                    mock3.assert_called_once_with('1')
+
+        # Second fallback case
+        state_machine_return_values = [{"le": "1.0"}, len('le="1.0"}')]
+        parse_remaining_values = [0.0, Timestamp(123, 0), Exemplar({"a": "b"}, 0.5)]
+        with patch(parse_labels_function) as mock1:
+            with patch(state_machine_function, return_value=state_machine_return_values) as mock2:
+                with patch(parse_remaining_function, return_value=parse_remaining_values) as mock3:
+                    sample = _parse_sample('a_bucket{le="1.0"} 0 123 # {a="b"} 0.5')
+                    s = Sample("a_bucket", {"le": "1.0"}, 0.0, Timestamp(123, 0), Exemplar({"a": "b"}, 0.5))
+                    self.assertEqual(s, sample)
+                    mock1.assert_not_called()
+                    mock2.assert_called_once_with('le="1.0"} 0 123 # {a="b"} 0.5')
+                    mock3.assert_called_once_with('0 123 # {a="b"} 0.5')
+
+        # No need to fallback case
+        parse_labels_return_values = {"foo": "foo#bar"}
+        parse_remaining_values = [1, None, None]
+        with patch(parse_labels_function, return_value=parse_labels_return_values) as mock1:
+            with patch(state_machine_function) as mock2:
+                with patch(parse_remaining_function, return_value=parse_remaining_values) as mock3:
+                    sample = _parse_sample('a_total{foo="foo#bar"} 1')
+                    s = Sample("a_total", {"foo": "foo#bar"}, 1)
+                    self.assertEqual(s, sample)
+                    mock1.assert_called_once_with('foo="foo#bar"')
+                    mock2.assert_not_called()
+                    mock3.assert_called_once_with('1')
+
     @unittest.skipIf(sys.version_info < (2, 7), "Test requires Python 2.7+.")
     def test_roundtrip(self):
         text = """# HELP go_gc_duration_seconds A summary of the GC invocation durations.
@@ -453,6 +540,12 @@ def test_invalid_input(self):
             ('a{a=1} 1\n# EOF\n'),
             ('a{a="1} 1\n# EOF\n'),
             ('a{a=\'1\'} 1\n# EOF\n'),
+            # Missing equal or label value.
+            ('a{a} 1\n# EOF\n'),
+            ('a{a"value"} 1\n# EOF\n'),
+            ('a{a""} 1\n# EOF\n'),
+            ('a{a=} 1\n# EOF\n'),
+            ('a{a="} 1\n# EOF\n'),
             # Missing or extra commas.
             ('a{a="1"b="2"} 1\n# EOF\n'),
             ('a{a="1",,b="2"} 1\n# EOF\n'),
@@ -523,6 +616,9 @@ def test_invalid_input(self):
             ('# TYPE a histogram\na_sum 1 # {a="b"} 0.5\n# EOF\n'),
             ('# TYPE a gaugehistogram\na_sum 1 # {a="b"} 0.5\n# EOF\n'),
             ('# TYPE a_bucket gauge\na_bucket 1 # {a="b"} 0.5\n# EOF\n'),
+            # Exemplars on unallowed metric types.
+            ('# TYPE a counter\na_total 1 # {a="b"} 1\n# EOF\n'),
+            ('# TYPE a gauge\na 1 # {a="b"} 1\n# EOF\n'),
             # Bad stateset/info values.
             ('# TYPE a stateset\na 2\n# EOF\n'),
             ('# TYPE a info\na 2\n# EOF\n'),