Commit

Introduce textblob framework to analyze words (#20)
mrproliu authored Sep 4, 2024
1 parent e9be7a7 commit 3c533d9
Showing 11 changed files with 146 additions and 29 deletions.
10 changes: 8 additions & 2 deletions models/uri_drain/template_miner.py
@@ -114,6 +114,12 @@ def load_state(self):

        loaded_drain: Drain = jsonpickle.loads(state, keys=True)

        # re-derive the word-correctness flags for every loaded cluster's template tokens
        if len(loaded_drain.id_to_cluster) > 0:
            for _, cluster in loaded_drain.id_to_cluster.items():
                if isinstance(cluster, LogCluster):
                    cluster.token_words_check()

        # json-pickle encoded keys as string by default, so we have to convert those back to int
        # this is only relevant for backwards compatibility when loading a snapshot of drain <= v0.9.1
        # which did not use json-pickle's keys=true
@@ -137,8 +143,8 @@ def save_state(self, snapshot_reason):
        state = base64.b64encode(zlib.compress(state))

        logger.info(f"Saving state of {len(self.drain.clusters)} clusters "
-                    f"with {self.drain.get_total_cluster_size()} messages, {len(state)} bytes, "
-                    f"reason: {snapshot_reason}")
+                    f"with {self.drain.get_total_cluster_size()} messages to service <{self.persistence_handler.get_service()}>, "
+                    f"{len(state)} bytes, reason: {snapshot_reason}")
        self.persistence_handler.save_state(state)

    def get_snapshot_reason(self, change_type, cluster_id):
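A note on why load_state re-parses tokens: after a snapshot round-trips through jsonpickle, the template tokens are presumably plain strings again, so each cluster must rebuild its word-correctness flags. A minimal sketch of that round-trip, using a hypothetical Cluster stand-in and sample tokens (not the project's own class):

```python
# Minimal sketch (hypothetical class and data): a jsonpickle round-trip
# yields plain-string tokens, so word metadata must be re-derived after
# loading -- which is what cluster.token_words_check() does in this commit.
import jsonpickle

class Cluster:
    def __init__(self, tokens):
        self.log_template_tokens = tuple(tokens)

state = jsonpickle.dumps(Cluster(["api", "users", "{var}"]), keys=True)
loaded = jsonpickle.loads(state, keys=True)
print(loaded.log_template_tokens)  # ('api', 'users', '{var}') -- plain strs
```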
43 changes: 31 additions & 12 deletions models/uri_drain/uri_drain.py
@@ -4,11 +4,11 @@
# Again, it's further modified to suit URI clustering needs,
# changes are kept minimal to avoid divergence from Drain3 upstream.
# TODO Note:: Every change to upstream Drain3 algorithm MUST be commented starting with "Modified::"

from typing import List, Dict, Sequence

from cachetools import LRUCache, Cache

from models.uri_drain.word_splitter import check_all_word_correct
from models.utils.simple_profiler import Profiler, NullProfiler

import logger
@@ -18,7 +18,7 @@ class LogCluster:  # TODO Modified:: Changed to URICluster
__slots__ = ["log_template_tokens", "cluster_id", "size", "latest_urls"]

def __init__(self, log_template_tokens: list, cluster_id: int, combine_min_url_count: int):
self.log_template_tokens = tuple(log_template_tokens)
self.log_template_tokens = tuple(parse_token_list(log_template_tokens))
self.cluster_id = cluster_id
self.size = 1
self.latest_urls = LRUCache(combine_min_url_count+1)
@@ -57,6 +57,27 @@ def __str__(self):
# return f"ID={str(self.cluster_id).ljust(5)} : size={str(self.size).ljust(10)}: {self.get_template()}"
return f"size={str(self.size).ljust(10)}: {self.get_template()}"

    def token_words_check(self):
        self.log_template_tokens = parse_token_list(self.log_template_tokens)


class Token(str):
__slots__ = ["token", "word_correct"]

def __new__(cls, token: str, word_correct: bool = False):
return super().__new__(cls, token)

def __init__(self, token: str, word_correct: bool):
self.token = token
self.word_correct = word_correct


def parse_token_list(tokens: List[str]) -> List[Token]:
result = []
for token in tokens:
result.append(Token(token, check_all_word_correct(token)))
return result


class SingleURILogCluster:
__slots__ = ["uri", "cluster_id", "size"]
@@ -198,13 +219,16 @@ def fast_match(self, cluster_ids: Sequence, tokens: list, sim_th: float, include
        max_param_count = -1
        max_cluster = None

        # pre-parse tokens once to avoid repeated parsing inside the loop
        parsed_token = parse_token_list(tokens)

        for cluster_id in cluster_ids:
            # Try to retrieve cluster from cache with bypassing eviction
            # algorithm as we are only testing candidates for a match.
            cluster = self.id_to_cluster.get(cluster_id)
            if cluster is None:
                continue
-            cur_sim, param_count = self.get_seq_distance(cluster.log_template_tokens, tokens, include_params)
+            cur_sim, param_count = self.get_seq_distance(cluster.log_template_tokens, parsed_token, include_params)
            # self.logger.debug(f'SIMILARITY = {cur_sim} for c{cluster_id}, {cluster.log_template_tokens} param={param_count}')
            if cur_sim > max_sim or (cur_sim == max_sim and param_count > max_param_count):
                # todo: this is a known caveat
@@ -495,6 +519,9 @@ def get_seq_distance(self, seq1, seq2, include_params: bool):
            if (index == 0 or index == 1) and '.' in token1 and token1 != token2:
                # self.logger.debug('this is domain mismatch!')
                return 0.0, 0
            # if the tokens differ and either one is made of correctly spelled words, the sequences cannot be combined
            if token1 != token2 and (token1.word_correct or token2.word_correct):
                return -1, -1
            # if token1 in self.possible_params or token1 == self.param_str:
            if token1 == self.param_str:
                param_count += 1
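The intent of the new early exit: if two sequences differ at a token and either side's token consists of correctly spelled words, the sequences are treated as non-combinable, on the assumption that real words mark fixed path segments rather than parameters. A hypothetical illustration with the word check stubbed out:

```python
# Hypothetical sketch of the word-based guard: differing real-word tokens
# block merging, while opaque identifiers remain candidates for {var}.
def mergeable(seq1, seq2, is_word):
    for t1, t2 in zip(seq1, seq2):
        if t1 != t2 and (is_word(t1) or is_word(t2)):
            return False  # mirrors the new `return -1, -1` early exit
    return True

is_word = lambda t: t in {"api", "users", "orders"}  # stand-in for TextBlob

print(mergeable(["api", "users"], ["api", "orders"], is_word))  # False
print(mergeable(["api", "u1x9"], ["api", "z42k"], is_word))     # True
```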
@@ -518,14 +545,6 @@ def create_template(self, seq1, seq2):
        ret_val = list(seq2)
        seq_length = len(seq1)

-        # SPECIAL ASSUMPTION THAT MIGHT BE FALSE::
-        # /api/getconnection
-        # /api/dropconnection
-        if seq_length == 2:
-            if (seq1[0] == seq2[0] and seq1[1] != seq2[1]  # can be simplified
-                    and not self.has_numbers(seq1[1]) and not self.has_numbers(seq2[1])):
-                print(f'first token match but second token mismatch, seq1 = {seq1}, seq2 = {seq2}')
-                return 'rejected'
        # TODO, radical assumption if there's absolutely 0 digit in seq1 and seq2, then don't consider them similar?
        # To implement this, we increase the false negative rate, but decrease false positive rate

@@ -626,7 +645,7 @@ def create_template(self, seq1, seq2):
                ret_val[i] = self.param_str

        # self.logger.debug(f'After change: {ret_val}')
-        return ret_val
+        return parse_token_list(ret_val)

    def match(self, content: str, full_search_strategy="never"):
        """
52 changes: 52 additions & 0 deletions models/uri_drain/word_splitter.py
@@ -0,0 +1,52 @@
# Copyright 2023 SkyAPM org
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re

from cachetools import LRUCache
from textblob import TextBlob

last_word_correct_lru = LRUCache(1000)


def split_for_url(text):
    # split text by camel case
    pattern = r"(?<=[a-z])(?=[A-Z])"
    return re.split(pattern, text)


def check_all_word_correct(text):
    # if the text contains digits, it's not a word; skip the word check
    if any(char.isdigit() for char in text):
        return False
    for word in split_for_url(text):
        # if a word is too long, it's not a real word; skip verification to reduce analysis time
        if len(word) > 20:
            return False
        word = word.lower()
        cached_result = last_word_correct_lru.get(word)
        if cached_result is not None:
            if cached_result:
                continue
            else:
                return False
        # if TextBlob's correction changes the word, it is misspelled, so the token is not a plain word
        # TextBlob also splits the text by the regex `\w+`, so special characters (such as "_", ".") are not a concern
        corrected_word = TextBlob(word).correct()
        correct = word == corrected_word
        last_word_correct_lru[word] = correct
        if not correct:
            return False

    return True
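Example usage of the new helpers — a sketch assuming textblob 0.18.0 is installed and its corpora have been downloaded via `python -m textblob.download_corpora`; spell-correction results depend on TextBlob's corpus, so the exact outputs may vary:

```python
from models.uri_drain.word_splitter import split_for_url, check_all_word_correct

print(split_for_url("getUserProfile"))   # ['get', 'User', 'Profile']
print(check_all_word_correct("orders"))  # True  -- every split word verifies
print(check_all_word_correct("ord3rs"))  # False -- contains a digit
print(check_all_word_correct("zxqvw"))   # False -- correction likely changes it
```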
45 changes: 32 additions & 13 deletions poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -63,6 +63,7 @@ inflect = "^6.0.4"
pytest = "^7.3.2"
apache-skywalking = "^1.0.1"
flask = "^2.3.2"
textblob = "0.18.0"



2 changes: 1 addition & 1 deletion servers/simple/uri_drain.ini
@@ -35,7 +35,7 @@ max_children = ${DRAIN_MAX_CHILDREN:100}
max_clusters = ${DRAIN_MAX_CLUSTERS:1024}
extra_delimiters = ${DRAIN_EXTRA_DELIMITERS:["/"]}
analysis_min_url_count = ${DRAIN_ANALYSIS_MIN_URL_COUNT:20}
-combine_min_url_count = ${DRAIN_COMBINE_MIN_URL_COUNT:8}
+combine_min_url_count = ${DRAIN_COMBINE_MIN_URL_COUNT:3}

[PROFILING]
enabled = ${PROFILING_ENABLED:False}
3 changes: 2 additions & 1 deletion servers/simple/worker.py
@@ -50,7 +50,8 @@ def run_worker(uri_main_queue, shared_results_object, config, existing_miners):
        uris, service = uri_package[0], uri_package[1]
        # print(uri_main_queue.get(timeout=1))
        start_time = time.time()
-        for uri in uris:
+        sorted_uris = sorted(uris)
+        for uri in sorted_uris:
            drain_instances[service].add_log_message(uri)
        logger.info(f'Processed {len(uris)} uris of service {service} in {time.time() - start_time} seconds')
        patterns = drain_instances[service].drain.cluster_patterns
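Sorting the batch makes ingestion order deterministic and keeps lexicographically similar URIs adjacent, which presumably stabilizes which clusters form first; a toy illustration with hypothetical URIs:

```python
uris = ["/customer/7", "/api/v1/users/42", "/api/v1/users/43"]
for uri in sorted(uris):
    print(uri)
# /api/v1/users/42
# /api/v1/users/43
# /customer/7
```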
5 changes: 5 additions & 0 deletions test/e2e/expected/endpoint_hard.yaml
@@ -19,8 +19,13 @@ patterns:
- /api/v1/services/{var}
- /api/v1/users/{var}/posts/{var}/comments
- /api/v1/wallets/{var}
- /api/v2/admin/users/{var}
- /api/v2/courses/{var}/modules/{var}/lessons
- /api/v2/customers/{var}
- /api/v3/products/{var}/reviews/{var}/comments
- /api/v4/orders/{var}/items/{var}/tracking
- /customer/{var}
- /customer/{var}/order/{var}
- ABC/{var}
- www.google.com/api/v1/users/{var}
version: '1'
4 changes: 4 additions & 0 deletions test/e2e/expected/endpoint_hard_3k.yaml
@@ -19,8 +19,12 @@ patterns:
- /api/v1/services/{var}
- /api/v1/users/{var}/posts/{var}/comments
- /api/v1/wallets/{var}
- /api/v2/admin/users/{var}
- /api/v2/courses/{var}/modules/{var}/lessons
- /api/v2/customers/{var}
- /api/v3/products/{var}/reviews/{var}/comments
- /api/v4/orders/{var}/items/{var}/tracking
- /customer/{var}
- /customer/{var}/order/{var}
- www.google.com/api/v1/users/{var}
version: '1'
5 changes: 5 additions & 0 deletions test/e2e/expected/endpoint_trivial.yaml
@@ -14,8 +14,13 @@

patterns:
- /api/v1/accounts/{var}
- /api/v1/invoices/{var}
- /api/v1/orders/{var}
- /api/v1/posts/{var}
- /api/v1/products/{var}
- /api/v1/users/{var}
- /api/v2/data/users/{var}
- /api/v999/orders/{var}
- /user/{var}/post/{var}
- /user/{var}/profile/{var}/compare/{var}/profile/{var}
version: '1'
5 changes: 5 additions & 0 deletions test/e2e/expected/endpoint_trivial_3k.yaml
@@ -19,4 +19,9 @@ patterns:
- /api/v1/posts/{var}
- /api/v1/products/{var}
- /api/v1/users/{var}
- /api/v2/data/users/{var}
- /api/v999/orders/{var}
- /user/{var}
- /user/{var}/post/{var}
- /user/{var}/profile/{var}/compare/{var}/profile/{var}
version: '1'
