diff --git a/README.md b/README.md
index a2008cb..8fd76d8 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,8 @@ To analyze a log file, run the script with the following command line arguments:
 - `url` (required): The URL of the log file to be analyzed.
 - `--model` (optional, default: "Mistral-7B-Instruct-v0.2-GGUF"): The path or URL of the language model for analysis.
 - `--summarizer` (optional, default: "drain"): Choose between LLM and Drain template miner as the log summarizer. You can also provide the path to an existing language model file instead of using a URL.
-- `--n_lines` (optional, default: 5): The number of lines per chunk for LLM analysis. This only makes sense when you are summarizing with LLM.
+- `--n_lines` (optional, default: 8): The number of lines per chunk for LLM analysis. This only makes sense when you are summarizing with LLM.
+- `--n_clusters` (optional, default: 8): The number of clusters for Drain to organize log chunks into. This only makes sense when you are summarizing with Drain.
 
 Example usage:
 
diff --git a/logdetective/__init__.py b/logdetective/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/logdetective/constants.py b/logdetective/constants.py
new file mode 100644
index 0000000..e26f252
--- /dev/null
+++ b/logdetective/constants.py
@@ -0,0 +1,29 @@
+
+# pylint: disable=line-too-long
+DEFAULT_ADVISOR = "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_S.gguf?download=true"
+
+PROMPT_TEMPLATE = """
+Given the following log snippets, and nothing else, explain what failure, if any, occurred during the build of this package.
+
+{}
+
+Analysis of the failure must be in the format of [X] : [Y], where [X] is a log snippet and [Y] is the explanation.
+
+Finally, drawing on information from all snippets, provide a complete explanation of the issue.
+
+Analysis:
+
+"""
+
+SUMMARIZE_PROMPT_TEMPLATE = """
+Does the following log contain an error or issue?
+
+Log:
+
+{}
+
+Answer:
+
+"""
+
+CACHE_LOC = "~/.cache/logdetective/"
diff --git a/logdetective/extractors.py b/logdetective/extractors.py
new file mode 100644
index 0000000..4e9b4f5
--- /dev/null
+++ b/logdetective/extractors.py
@@ -0,0 +1,92 @@
+import os
+import logging
+
+import drain3
+from drain3.template_miner_config import TemplateMinerConfig
+from llama_cpp import Llama, LlamaGrammar
+
+from logdetective.constants import SUMMARIZE_PROMPT_TEMPLATE
+from logdetective.utils import get_chunks
+
+LOG = logging.getLogger("logdetective")
+
+
+class LLMExtractor:
+    """
+    A class that extracts relevant information from logs using a language model.
+    """
+    def __init__(self, model_path: str, verbose: bool, n_lines: int = 2):
+        self.model = Llama(
+            model_path=model_path,
+            n_ctx=0,
+            verbose=verbose)
+        self.n_lines = n_lines
+        self.grammar = LlamaGrammar.from_string(
+            "root ::= (\"Yes\" | \"No\")", verbose=False)
+
+    def __call__(self, log: str, n_lines: int = 2, neighbors: bool = False) -> str:
+        chunks = self.rate_chunks(log)
+        out = self.create_extract(chunks, neighbors)
+        return out
+
+    def rate_chunks(self, log: str) -> list[tuple]:
+        """Scan log by the model and store results.
+
+        :param log: log file content
+        """
+        results = []
+        log_lines = log.split("\n")
+
+        for i in range(0, len(log_lines), self.n_lines):
+            block = '\n'.join(log_lines[i:i + self.n_lines])
+            prompt = SUMMARIZE_PROMPT_TEMPLATE.format(block)
+            out = self.model(prompt, max_tokens=7, grammar=self.grammar)
+            out = f"{out['choices'][0]['text']}\n"
+            results.append((block, out))
+
+        return results
+
+    def create_extract(self, chunks: list[tuple], neighbors: bool = False) -> str:
+        """Extract interesting chunks from the model processing.
+        """
+        interesting = []
+        summary = ""
+        # pylint: disable=consider-using-enumerate
+        for i in range(len(chunks)):
+            if chunks[i][1].startswith("Yes"):
+                interesting.append(i)
+                if neighbors:
+                    interesting.extend([max(i - 1, 0), min(i + 1, len(chunks) - 1)])
+
+        interesting = sorted(set(interesting))
+
+        for i in interesting:
+            summary += chunks[i][0] + "\n"
+
+        return summary
+
+
+class DrainExtractor:
+    """A class that extracts information from logs using a template miner algorithm.
+    """
+    def __init__(self, verbose: bool = False, context: bool = False, max_clusters=8):
+        config = TemplateMinerConfig()
+        config.load(f"{os.path.dirname(__file__)}/drain3.ini")
+        config.profiling_enabled = verbose
+        config.drain_max_clusters = max_clusters
+        self.miner = drain3.TemplateMiner(config=config)
+        self.verbose = verbose
+        self.context = context
+
+    def __call__(self, log: str) -> str:
+        out = ""
+        for chunk in get_chunks(log):
+            processed_line = self.miner.add_log_message(chunk)
+            LOG.debug(processed_line)
+        sorted_clusters = sorted(self.miner.drain.clusters, key=lambda it: it.size, reverse=True)
+        for chunk in get_chunks(log):
+            cluster = self.miner.match(chunk, "always")
+            if cluster in sorted_clusters:
+                out += f"{chunk}\n"
+                sorted_clusters.remove(cluster)
+        return out
diff --git a/logdetective/logdetective.py b/logdetective/logdetective.py
index a38c524..6fa699b 100644
--- a/logdetective/logdetective.py
+++ b/logdetective/logdetective.py
@@ -2,248 +2,31 @@
 import logging
 import os
 import sys
-from urllib.request import urlretrieve
-from urllib.parse import urlparse
-import drain3
-import numpy as np
-import progressbar
-import requests
-from drain3.template_miner_config import TemplateMinerConfig
-from llama_cpp import Llama, LlamaGrammar
-
-# pylint: disable=line-too-long
-DEFAULT_ADVISOR = "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_S.gguf?download=true"
-
-# pylint: disable=line-too-long
-DEFAULT_LLM_RATER = "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_S.gguf?download=true"
-
-PROMPT_TEMPLATE = """
-Given following log snippets, and nothing else, explain what failure, if any occured during build of this package.
-Ignore strings wrapped in <: :>, such as <:*:>.
-
-{}
-
-Analysis of the failure must be in a format of [X] : [Y], where [X] is a log snippet, and [Y] is the explanation.
-
-Finally, drawing on information from all snippets, provide complete explanation of the issue.
-
-Analysis:
-
-"""
-
-SUMMARIZE_PROMPT_TEMPLATE = """
-Does following log contain error or issue?
- -Log: - -{} - -Answer: - -""" - -CACHE_LOC = "~/.cache/logdetective/" +from logdetective.constants import DEFAULT_ADVISOR, CACHE_LOC +from logdetective.utils import download_model, process_log, initialize_model, retrieve_log_content +from logdetective.extractors import LLMExtractor, DrainExtractor LOG = logging.getLogger("logdetective") -class MyProgressBar(): - """Show progress when downloading model.""" - def __init__(self): - self.pbar = None - - def __call__(self, block_num, block_size, total_size): - if not self.pbar: - self.pbar = progressbar.ProgressBar(maxval=total_size) - self.pbar.start() - - downloaded = block_num * block_size - if downloaded < total_size: - self.pbar.update(downloaded) - else: - self.pbar.finish() - - -def chunk_continues(text: str, index: int) -> bool: - """Set of heuristics for determining whether or not - does the current chunk of log text continue on next line. - """ - conditionals = [ - lambda i, string: string[i + 1].isspace(), - lambda i, string: string[i - 1] == "\\" - ] - - for c in conditionals: - y = c(index, text) - if y: - return True - - return False - - -def get_chunks(text: str): - """Split log into chunks according to heuristic - based on whitespace and backslash presence. - """ - text_len = len(text) - i = 0 - chunk = "" - while i < text_len: - chunk += text[i] - if text[i] == '\n': - if i + 1 < text_len and chunk_continues(text, i): - i += 1 - continue - yield chunk - chunk = "" - i += 1 - - -class LLMExtractor: - """ - A class that extracts relevant information from logs using a language model. - """ - def __init__(self, model_path: str, verbose: bool): - self.model = Llama( - model_path=model_path, - n_ctx=0, - verbose=verbose) - self.grammar = LlamaGrammar.from_string( - "root ::= (\"Yes\" | \"No\")", verbose=False) - - def __call__(self, log: str, n_lines: int = 2, neighbors: bool = False) -> str: - chunks = self.rate_chunks(log, n_lines) - out = self.create_extract(chunks, neighbors) - return out - - def rate_chunks(self, log: str, n_lines: int = 2) -> list[tuple]: - """Scan log by the model and store results. - - :param log: log file content - :param n_lines: How many lines should the model take into consideration - """ - results = [] - log_lines = log.split("\n") - - for i in range(0, len(log_lines), n_lines): - block = '\n'.join(log_lines[i:i + n_lines]) - prompt = SUMMARIZE_PROMPT_TEMPLATE.format(log) - out = self.model(prompt, max_tokens=7, grammar=self.grammar) - out = f"{out['choices'][0]['text']}\n" - results.append((block, out)) - - return results - - def create_extract(self, chunks: list[tuple], neighbors: bool = False) -> str: - """Extract interesting chunks from the model processing. - """ - interesting = [] - summary = "" - # pylint: disable=consider-using-enumerate - for i in range(len(chunks)): - if chunks[i][1].startswith("Yes"): - interesting.append(i) - if neighbors: - interesting.extend([max(i - 1, 0), min(i + 1, len(chunks) - 1)]) - - interesting = np.unique(interesting) - - for i in interesting: - summary += chunks[i][0] + "\n" - - return summary - - -class DrainExtractor: - """A class that extracts information from logs using a template miner algorithm. 
- """ - def __init__(self, verbose: bool = False, context: bool = False): - config = TemplateMinerConfig() - config.load(f"{os.path.dirname(__file__)}/drain3.ini") - config.profiling_enabled = verbose - self.miner = drain3.TemplateMiner(config=config) - self.verbose = verbose - self.context = context - - def __call__(self, log: str) -> str: - out = "" - for chunk in get_chunks(log): - processed_line = self.miner.add_log_message(chunk) - LOG.debug(processed_line) - sorted_clusters = sorted(self.miner.drain.clusters, key=lambda it: it.size, reverse=True) - for chunk in get_chunks(log): - cluster = self.miner.match(chunk, "always") - if cluster in sorted_clusters: - out += f"{chunk}\n" - sorted_clusters.remove(cluster) - return out - - -def download_model(url: str, verbose: bool = False) -> str: - """ Downloads a language model from a given URL and saves it to the cache directory. - - Args: - url (str): The URL of the language model to be downloaded. - - Returns: - str: The local file path of the downloaded language model. - """ - path = os.path.join( - os.path.expanduser(CACHE_LOC), url.split('/')[-1]) - - LOG.info("Downloading model from %s to %s", url, path) - if not os.path.exists(path): - if verbose: - path, _status = urlretrieve(url, path, MyProgressBar()) - else: - path, _status = urlretrieve(url, path) - - return path - - -def process_log(log: str, model: Llama) -> str: - """ - Processes a given log using the provided language model and returns its summary. - - Args: - log (str): The input log to be processed. - model (Llama): The language model used for processing the log. - - Returns: - str: The summary of the given log generated by the language model. - """ - return model(PROMPT_TEMPLATE.format(log), max_tokens=0)["choices"][0]["text"] - - -def retrieve_log_content(log_path): - """Get content of the file on the log_path path.""" - parsed_url = urlparse(log_path) - log = "" - - if not parsed_url.scheme: - if not os.path.exists(log_path): - raise ValueError(f"Local log {log_path} doesn't exist!") - - with open(log_path, "rt") as f: - log = f.read() - - else: - log = requests.get(log_path, timeout=60).text - - return log - - def main(): """Main execution function.""" parser = argparse.ArgumentParser("logdetective") - parser.add_argument("url", type=str, default="") - parser.add_argument("-M", "--model", type=str, default=DEFAULT_ADVISOR) - parser.add_argument("-S", "--summarizer", type=str, default="drain") - parser.add_argument("-N", "--n_lines", type=int, default=5) + parser.add_argument("file", type=str, default="", help="The URL or path to the log file to be analyzed.") + parser.add_argument("-M", "--model", help="The path or URL of the language model for analysis.", + type=str, default=DEFAULT_ADVISOR) + parser.add_argument("-S", "--summarizer", type=str, default="drain", + help="Choose between LLM and Drain template miner as the log summarizer.\ + LLM must be specified as path to a model, URL or local file.") + parser.add_argument("-N", "--n_lines", type=int, + default=8, help="The number of lines per chunk for LLM analysis.\ + This only makes sense when you are summarizing with LLM.") + parser.add_argument("-C", "--n_clusters", type=int, default=8, + help="Number of clusters for Drain to organize log chunks into.\ + This only makes sense when you are summarizing with Drain") parser.add_argument("-v", "--verbose", action='count', default=0) parser.add_argument("-q", "--quiet", action='store_true') - args = parser.parse_args() if args.verbose and args.quiet: @@ -266,20 +49,17 @@ 
def main():
     model_pth = args.model
 
     if args.summarizer == "drain":
-        extractor = DrainExtractor(args.verbose > 1, context=True)
+        extractor = DrainExtractor(args.verbose > 1, context=True, max_clusters=args.n_clusters)
     elif os.path.isfile(args.summarizer):
-        extractor = LLMExtractor(args.summarizer, args.verbose > 1)
+        extractor = LLMExtractor(args.summarizer, args.verbose > 1, args.n_lines)
     else:
         summarizer_pth = download_model(args.summarizer, not args.quiet)
         extractor = LLMExtractor(summarizer_pth, args.verbose > 1)
 
     LOG.info("Getting summary")
-    model = Llama(
-        model_path=model_pth,
-        n_ctx=0,
-        verbose=args.verbose > 2)
+    model = initialize_model(model_pth, args.verbose > 2)
 
-    log = retrieve_log_content(args.url)
+    log = retrieve_log_content(args.file)
     log_summary = extractor(log)
 
     ratio = len(log_summary.split('\n')) / len(log.split('\n'))
diff --git a/logdetective/server.py b/logdetective/server.py
new file mode 100644
index 0000000..f4c998a
--- /dev/null
+++ b/logdetective/server.py
@@ -0,0 +1,54 @@
+import logging
+import os
+import json
+
+from fastapi import FastAPI
+from pydantic import BaseModel
+
+import requests
+
+from logdetective.constants import PROMPT_TEMPLATE
+from logdetective.extractors import DrainExtractor
+
+
+class BuildLog(BaseModel):
+    """Model of data submitted to API.
+    """
+    url: str
+
+LOG = logging.getLogger("logdetective")
+
+app = FastAPI()
+
+LLM_CPP_SERVER_ADDRESS = os.environ.get("LLAMA_CPP_SERVER", "http://localhost")
+LLM_CPP_SERVER_PORT = os.environ.get("LLAMA_CPP_SERVER_PORT", 8000)
+LLM_CPP_SERVER_TIMEOUT = os.environ.get("LLAMA_CPP_SERVER_TIMEOUT", 200)
+
+@app.post("/analyze")
+async def analyze_log(build_log: BuildLog):
+    """Provide an endpoint for log file submission and analysis.
+    """
+    extractor = DrainExtractor(verbose=True, context=True, max_clusters=8)
+
+    LOG.info("Getting summary")
+
+    log = requests.get(build_log.url, timeout=60).text
+    log_summary = extractor(log)
+
+    ratio = len(log_summary.split('\n')) / len(log.split('\n'))
+    LOG.debug("Log summary: \n %s", log_summary)
+    LOG.info("Compression ratio: %s", ratio)
+
+    LOG.info("Analyzing the text")
+    data = {
+        "prompt": PROMPT_TEMPLATE.format(log_summary),
+        "max_tokens": "0"}
+
+    # Expects llama-cpp server to run on LLM_CPP_SERVER_ADDRESS:LLM_CPP_SERVER_PORT
+    response = requests.post(
+        f"{LLM_CPP_SERVER_ADDRESS}:{LLM_CPP_SERVER_PORT}/v1/completions",
+        headers={"Content-Type":"application/json"},
+        data=json.dumps(data),
+        timeout=int(LLM_CPP_SERVER_TIMEOUT))
+
+    return response.text
diff --git a/logdetective/utils.py b/logdetective/utils.py
new file mode 100644
index 0000000..08ea188
--- /dev/null
+++ b/logdetective/utils.py
@@ -0,0 +1,132 @@
+import logging
+import os
+from urllib.parse import urlparse
+from urllib.request import urlretrieve
+
+import requests
+import progressbar
+
+from llama_cpp import Llama
+from logdetective.constants import CACHE_LOC, PROMPT_TEMPLATE
+
+
+LOG = logging.getLogger("logdetective")
+
+
+class MyProgressBar():
+    """Show progress when downloading model."""
+    def __init__(self):
+        self.pbar = None
+
+    def __call__(self, block_num, block_size, total_size):
+        if not self.pbar:
+            self.pbar = progressbar.ProgressBar(maxval=total_size)
+            self.pbar.start()
+
+        downloaded = block_num * block_size
+        if downloaded < total_size:
+            self.pbar.update(downloaded)
+        else:
+            self.pbar.finish()
+
+
+def chunk_continues(text: str, index: int) -> bool:
+    """Set of heuristics for determining whether the current chunk
+    of log text continues on the next line.
+ """ + conditionals = [ + lambda i, string: string[i + 1].isspace(), + lambda i, string: string[i - 1] == "\\" + ] + + for c in conditionals: + y = c(index, text) + if y: + return True + + return False + + +def get_chunks(text: str): + """Split log into chunks according to heuristic + based on whitespace and backslash presence. + """ + text_len = len(text) + i = 0 + chunk = "" + while i < text_len: + chunk += text[i] + if text[i] == '\n': + if i + 1 < text_len and chunk_continues(text, i): + i += 1 + continue + yield chunk + chunk = "" + i += 1 + + +def download_model(url: str, verbose: bool = False) -> str: + """ Downloads a language model from a given URL and saves it to the cache directory. + + Args: + url (str): The URL of the language model to be downloaded. + + Returns: + str: The local file path of the downloaded language model. + """ + path = os.path.join( + os.path.expanduser(CACHE_LOC), url.split('/')[-1]) + + LOG.info("Downloading model from %s to %s", url, path) + if not os.path.exists(path): + if verbose: + path, _status = urlretrieve(url, path, MyProgressBar()) + else: + path, _status = urlretrieve(url, path) + + return path + + +def initialize_model(model_pth: str, verbose: bool) -> Llama: + """Initialize Llama class for inference. + Args: + model_pth (str): path to gguf model file + verbose (bool): level of verbosity for llamacpp + """ + model = Llama( + model_path=model_pth, + n_ctx=0, # Maximum context for the model + verbose=verbose) + + return model + + +def process_log(log: str, model: Llama) -> str: + """ + Processes a given log using the provided language model and returns its summary. + + Args: + log (str): The input log to be processed. + model (Llama): The language model used for processing the log. + + Returns: + str: The summary of the given log generated by the language model. + """ + return model(PROMPT_TEMPLATE.format(log), max_tokens=0)["choices"][0]["text"] + +def retrieve_log_content(log_path: str) -> str: + """Get content of the file on the log_path path.""" + parsed_url = urlparse(log_path) + log = "" + + if not parsed_url.scheme: + if not os.path.exists(log_path): + raise ValueError(f"Local log {log_path} doesn't exist!") + + with open(log_path, "rt") as f: + log = f.read() + + else: + log = requests.get(log_path, timeout=60).text + + return log diff --git a/poetry.lock b/poetry.lock index 42d494b..bdea15c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. 
[[package]] name = "cachetools" @@ -570,7 +570,10 @@ h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] +[extras] +server = [] + [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "8a12348a4f765827e190d5e9293c9025ded491bd2d31146045040f83359561c6" +content-hash = "c7ef77fd33ebc10a6e6727c1e33f8dedd50b35e1f12c02b9e2abd48d263d9d1f" diff --git a/pyproject.toml b/pyproject.toml index 2964e67..6e87cb1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,9 @@ authors = ["Jiri Podivin "] license = "Apache-2.0" readme = "README.md" include = ["logdetective/drain3.ini"] +packages = [ + { include = "logdetective" } +] classifiers = [ "Development Status :: 4 - Beta", "Environment :: Console", @@ -37,6 +40,9 @@ build-backend = "poetry.core.masonry.api" [tool.poetry.scripts] logdetective = 'logdetective.logdetective:main' +[tool.poetry.extras] +server = ["fastapi", "pydantic"] + [tool.pylint] disable = [ "inconsistent-return-statements",
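
A minimal usage sketch for the new POST /analyze endpoint added in logdetective/server.py. It assumes the FastAPI app is being served (for example with `uvicorn logdetective.server:app --port 8080`) and that a llama-cpp server is reachable via LLAMA_CPP_SERVER / LLAMA_CPP_SERVER_PORT; the host, port, and build-log URL below are illustrative assumptions, not values fixed by this change.

    # Hypothetical client sketch; host, port, and the log URL are placeholders.
    import requests

    LOGDETECTIVE_URL = "http://localhost:8080/analyze"  # assumed uvicorn host/port
    payload = {"url": "https://example.org/logs/build.log"}  # BuildLog model: a single `url` field

    # The endpoint fetches the log, summarizes it with DrainExtractor, forwards the
    # prompt to the llama-cpp server, and returns the raw completion response text.
    response = requests.post(LOGDETECTIVE_URL, json=payload, timeout=600)
    response.raise_for_status()
    print(response.text)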