Commit

Introduce textblob framework to analyze words (#20)
mrproliu authored Sep 4, 2024
1 parent e9be7a7 commit 3c533d9
Showing 11 changed files with 146 additions and 29 deletions.
10 changes: 8 additions & 2 deletions models/uri_drain/template_miner.py
@@ -114,6 +114,12 @@ def load_state(self):

        loaded_drain: Drain = jsonpickle.loads(state, keys=True)

        # re-derive the word-correctness flags for every loaded cluster's template tokens
        if len(loaded_drain.id_to_cluster) > 0:
            for _, cluster in loaded_drain.id_to_cluster.items():
                if isinstance(cluster, LogCluster):
                    cluster.token_words_check()

        # json-pickle encoded keys as string by default, so we have to convert those back to int
        # this is only relevant for backwards compatibility when loading a snapshot of drain <= v0.9.1
        # which did not use json-pickle's keys=true
@@ -137,8 +143,8 @@ def save_state(self, snapshot_reason):
        state = base64.b64encode(zlib.compress(state))

        logger.info(f"Saving state of {len(self.drain.clusters)} clusters "
-                    f"with {self.drain.get_total_cluster_size()} messages, {len(state)} bytes, "
-                    f"reason: {snapshot_reason}")
+                    f"with {self.drain.get_total_cluster_size()} messages to service <{self.persistence_handler.get_service()}>, "
+                    f"{len(state)} bytes, reason: {snapshot_reason}")
        self.persistence_handler.save_state(state)

    def get_snapshot_reason(self, change_type, cluster_id):
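A note on why load_state re-parses tokens: after a snapshot round-trips through jsonpickle, the template tokens are presumably plain strings again, so each cluster must rebuild its word-correctness flags. A minimal sketch of that round-trip, using a hypothetical Cluster stand-in and sample tokens (not the project's own class):

```python
# Minimal sketch (hypothetical class and data): a jsonpickle round-trip
# yields plain-string tokens, so word metadata must be re-derived after
# loading -- which is what cluster.token_words_check() does in this commit.
import jsonpickle

class Cluster:
    def __init__(self, tokens):
        self.log_template_tokens = tuple(tokens)

state = jsonpickle.dumps(Cluster(["api", "users", "{var}"]), keys=True)
loaded = jsonpickle.loads(state, keys=True)
print(loaded.log_template_tokens)  # ('api', 'users', '{var}') -- plain strs
```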
43 changes: 31 additions & 12 deletions models/uri_drain/uri_drain.py
@@ -4,11 +4,11 @@
# Again, it's further modified to suit URI clustering needs,
# changes are kept minimal to avoid divergence from Drain3 upstream.
# TODO Note:: Every change to upstream Drain3 algorithm MUST be commented starting with "Modified::"

from typing import List, Dict, Sequence

from cachetools import LRUCache, Cache

from models.uri_drain.word_splitter import check_all_word_correct
from models.utils.simple_profiler import Profiler, NullProfiler

import logger
@@ -18,7 +18,7 @@ class LogCluster:  # TODO Modified:: Changed to URICluster
__slots__ = ["log_template_tokens", "cluster_id", "size", "latest_urls"]

def __init__(self, log_template_tokens: list, cluster_id: int, combine_min_url_count: int):
self.log_template_tokens = tuple(log_template_tokens)
self.log_template_tokens = tuple(parse_token_list(log_template_tokens))
self.cluster_id = cluster_id
self.size = 1
self.latest_urls = LRUCache(combine_min_url_count+1)
@@ -57,6 +57,27 @@ def __str__(self):
# return f"ID={str(self.cluster_id).ljust(5)} : size={str(self.size).ljust(10)}: {self.get_template()}"
return f"size={str(self.size).ljust(10)}: {self.get_template()}"

    def token_words_check(self):
        self.log_template_tokens = parse_token_list(self.log_template_tokens)


class Token(str):
__slots__ = ["token", "word_correct"]

def __new__(cls, token: str, word_correct: bool = False):
return super().__new__(cls, token)

def __init__(self, token: str, word_correct: bool):
self.token = token
self.word_correct = word_correct


def parse_token_list(tokens: List[str]) -> List[Token]:
result = []
for token in tokens:
result.append(Token(token, check_all_word_correct(token)))
return result


class SingleURILogCluster:
__slots__ = ["uri", "cluster_id", "size"]
@@ -198,13 +219,16 @@ def fast_match(self, cluster_ids: Sequence, tokens: list, sim_th: float, include
        max_param_count = -1
        max_cluster = None

        # pre-parse tokens once to avoid repeated parsing inside the loop
        parsed_token = parse_token_list(tokens)

        for cluster_id in cluster_ids:
            # Try to retrieve cluster from cache with bypassing eviction
            # algorithm as we are only testing candidates for a match.
            cluster = self.id_to_cluster.get(cluster_id)
            if cluster is None:
                continue
-            cur_sim, param_count = self.get_seq_distance(cluster.log_template_tokens, tokens, include_params)
+            cur_sim, param_count = self.get_seq_distance(cluster.log_template_tokens, parsed_token, include_params)
            # self.logger.debug(f'SIMILARITY = {cur_sim} for c{cluster_id}, {cluster.log_template_tokens} param={param_count}')
            if cur_sim > max_sim or (cur_sim == max_sim and param_count > max_param_count):
                # todo: this is a known caveat
@@ -495,6 +519,9 @@ def get_seq_distance(self, seq1, seq2, include_params: bool):
            if (index == 0 or index == 1) and '.' in token1 and token1 != token2:
                # self.logger.debug('this is domain mismatch!')
                return 0.0, 0
            # if the tokens differ and either one is made of correctly spelled words, the sequences cannot be combined
            if token1 != token2 and (token1.word_correct or token2.word_correct):
                return -1, -1
            # if token1 in self.possible_params or token1 == self.param_str:
            if token1 == self.param_str:
                param_count += 1
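The intent of the new early exit: if two sequences differ at a token and either side's token consists of correctly spelled words, the sequences are treated as non-combinable, on the assumption that real words mark fixed path segments rather than parameters. A hypothetical illustration with the word check stubbed out:

```python
# Hypothetical sketch of the word-based guard: differing real-word tokens
# block merging, while opaque identifiers remain candidates for {var}.
def mergeable(seq1, seq2, is_word):
    for t1, t2 in zip(seq1, seq2):
        if t1 != t2 and (is_word(t1) or is_word(t2)):
            return False  # mirrors the new `return -1, -1` early exit
    return True

is_word = lambda t: t in {"api", "users", "orders"}  # stand-in for TextBlob

print(mergeable(["api", "users"], ["api", "orders"], is_word))  # False
print(mergeable(["api", "u1x9"], ["api", "z42k"], is_word))     # True
```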
@@ -518,14 +545,6 @@ def create_template(self, seq1, seq2):
        ret_val = list(seq2)
        seq_length = len(seq1)

-        # SPECIAL ASSUMPTION THAT MIGHT BE FALSE::
-        # /api/getconnection
-        # /api/dropconnection
-        if seq_length == 2:
-            if (seq1[0] == seq2[0] and seq1[1] != seq2[1]  # can be simplified
-                    and not self.has_numbers(seq1[1]) and not self.has_numbers(seq2[1])):
-                print(f'first token match but second token mismatch, seq1 = {seq1}, seq2 = {seq2}')
-                return 'rejected'
        # TODO, radical assumption if there's absolutely 0 digit in seq1 and seq2, then don't consider them similar?
        # To implement this, we increase the false negative rate, but decrease false positive rate

@@ -626,7 +645,7 @@ def create_template(self, seq1, seq2):
                ret_val[i] = self.param_str

        # self.logger.debug(f'After change: {ret_val}')
-        return ret_val
+        return parse_token_list(ret_val)

    def match(self, content: str, full_search_strategy="never"):
        """
52 changes: 52 additions & 0 deletions models/uri_drain/word_splitter.py
@@ -0,0 +1,52 @@
# Copyright 2023 SkyAPM org
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re

from cachetools import LRUCache
from textblob import TextBlob

last_word_correct_lru = LRUCache(1000)


def split_for_url(text):
    # split text by camel case
    pattern = r"(?<=[a-z])(?=[A-Z])"
    return re.split(pattern, text)


def check_all_word_correct(text):
    # if the text contains digits, it's not a word; skip the word check
    if any(char.isdigit() for char in text):
        return False
    for word in split_for_url(text):
        # if a word is too long, it's not a real word; skip verification to reduce analysis time
        if len(word) > 20:
            return False
        word = word.lower()
        cached_result = last_word_correct_lru.get(word)
        if cached_result is not None:
            if cached_result:
                continue
            else:
                return False
        # if TextBlob's correction changes the word, it is misspelled, so the token is not a plain word
        # TextBlob also splits the text by the regex `\w+`, so special characters (such as "_", ".") are not a concern
        corrected_word = TextBlob(word).correct()
        correct = word == corrected_word
        last_word_correct_lru[word] = correct
        if not correct:
            return False

    return True
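Example usage of the new helpers — a sketch assuming textblob 0.18.0 is installed and its corpora have been downloaded via `python -m textblob.download_corpora`; spell-correction results depend on TextBlob's corpus, so the exact outputs may vary:

```python
from models.uri_drain.word_splitter import split_for_url, check_all_word_correct

print(split_for_url("getUserProfile"))   # ['get', 'User', 'Profile']
print(check_all_word_correct("orders"))  # True  -- every split word verifies
print(check_all_word_correct("ord3rs"))  # False -- contains a digit
print(check_all_word_correct("zxqvw"))   # False -- correction likely changes it
```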
45 changes: 32 additions & 13 deletions poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -63,6 +63,7 @@ inflect = "^6.0.4"
pytest = "^7.3.2"
apache-skywalking = "^1.0.1"
flask = "^2.3.2"
textblob = "0.18.0"



2 changes: 1 addition & 1 deletion servers/simple/uri_drain.ini
@@ -35,7 +35,7 @@ max_children = ${DRAIN_MAX_CHILDREN:100}
max_clusters = ${DRAIN_MAX_CLUSTERS:1024}
extra_delimiters = ${DRAIN_EXTRA_DELIMITERS:["/"]}
analysis_min_url_count = ${DRAIN_ANALYSIS_MIN_URL_COUNT:20}
-combine_min_url_count = ${DRAIN_COMBINE_MIN_URL_COUNT:8}
+combine_min_url_count = ${DRAIN_COMBINE_MIN_URL_COUNT:3}

[PROFILING]
enabled = ${PROFILING_ENABLED:False}
3 changes: 2 additions & 1 deletion servers/simple/worker.py
@@ -50,7 +50,8 @@ def run_worker(uri_main_queue, shared_results_object, config, existing_miners):
        uris, service = uri_package[0], uri_package[1]
        # print(uri_main_queue.get(timeout=1))
        start_time = time.time()
-        for uri in uris:
+        sorted_uris = sorted(uris)
+        for uri in sorted_uris:
            drain_instances[service].add_log_message(uri)
        logger.info(f'Processed {len(uris)} uris of service {service} in {time.time() - start_time} seconds')
        patterns = drain_instances[service].drain.cluster_patterns
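Sorting the batch makes ingestion order deterministic and keeps lexicographically similar URIs adjacent, which presumably stabilizes which clusters form first; a toy illustration with hypothetical URIs:

```python
uris = ["/customer/7", "/api/v1/users/42", "/api/v1/users/43"]
for uri in sorted(uris):
    print(uri)
# /api/v1/users/42
# /api/v1/users/43
# /customer/7
```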
5 changes: 5 additions & 0 deletions test/e2e/expected/endpoint_hard.yaml
@@ -19,8 +19,13 @@ patterns:
- /api/v1/services/{var}
- /api/v1/users/{var}/posts/{var}/comments
- /api/v1/wallets/{var}
- /api/v2/admin/users/{var}
- /api/v2/courses/{var}/modules/{var}/lessons
- /api/v2/customers/{var}
- /api/v3/products/{var}/reviews/{var}/comments
- /api/v4/orders/{var}/items/{var}/tracking
- /customer/{var}
- /customer/{var}/order/{var}
- ABC/{var}
- www.google.com/api/v1/users/{var}
version: '1'
4 changes: 4 additions & 0 deletions test/e2e/expected/endpoint_hard_3k.yaml
@@ -19,8 +19,12 @@ patterns:
- /api/v1/services/{var}
- /api/v1/users/{var}/posts/{var}/comments
- /api/v1/wallets/{var}
- /api/v2/admin/users/{var}
- /api/v2/courses/{var}/modules/{var}/lessons
- /api/v2/customers/{var}
- /api/v3/products/{var}/reviews/{var}/comments
- /api/v4/orders/{var}/items/{var}/tracking
- /customer/{var}
- /customer/{var}/order/{var}
- www.google.com/api/v1/users/{var}
version: '1'
5 changes: 5 additions & 0 deletions test/e2e/expected/endpoint_trivial.yaml
@@ -14,8 +14,13 @@

patterns:
- /api/v1/accounts/{var}
- /api/v1/invoices/{var}
- /api/v1/orders/{var}
- /api/v1/posts/{var}
- /api/v1/products/{var}
- /api/v1/users/{var}
- /api/v2/data/users/{var}
- /api/v999/orders/{var}
- /user/{var}/post/{var}
- /user/{var}/profile/{var}/compare/{var}/profile/{var}
version: '1'
5 changes: 5 additions & 0 deletions test/e2e/expected/endpoint_trivial_3k.yaml
@@ -19,4 +19,9 @@ patterns:
- /api/v1/posts/{var}
- /api/v1/products/{var}
- /api/v1/users/{var}
- /api/v2/data/users/{var}
- /api/v999/orders/{var}
- /user/{var}
- /user/{var}/post/{var}
- /user/{var}/profile/{var}/compare/{var}/profile/{var}
version: '1'
