Refactoring module layout
Enabling runtime setting of cluster number

Removing poetry import

Reworked help

New arg for clusters

Adding fastapi dependency

Moved inference and model initialization into utils

Adding pydantic for server

Prompt simplified

We no longer need to notify the model about special substrings, now
that we retrieve representative log samples instead of templates.

Exposing n_clusters arg

Restoring local retrieval of logs

Docstrings and import rearrangement

Signed-off-by: Jiri Podivin <jpodivin@redhat.com>
jpodivin committed May 21, 2024
1 parent 323be68 commit 9f2a1f4
Showing 9 changed files with 339 additions and 242 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -31,7 +31,8 @@ To analyze a log file, run the script with the following command line arguments:
- `url` (required): The URL of the log file to be analyzed.
- `--model` (optional, default: "Mistral-7B-Instruct-v0.2-GGUF"): The path or URL of the language model for analysis.
- `--summarizer` (optional, default: "drain"): Choose between LLM and Drain template miner as the log summarizer. You can also provide the path to an existing language model file instead of using a URL.
- - `--n_lines` (optional, default: 5): The number of lines per chunk for LLM analysis. This only makes sense when you are summarizing with LLM.
+ - `--n_lines` (optional, default: 8): The number of lines per chunk for LLM analysis. This only makes sense when you are summarizing with LLM.
+ - `--n_clusters` (optional, default: 8): The number of clusters for Drain to organize log chunks into. This only makes sense when you are summarizing with Drain.

Example usage:

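The full example is collapsed in this diff; a plausible invocation, assuming the package installs a `logdetective` console script (hypothetical name), might be:

```
logdetective https://example.org/build.log --summarizer drain --n_clusters 8
```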
Empty file added logdetective/__init__.py
29 changes: 29 additions & 0 deletions logdetective/constants.py
@@ -0,0 +1,29 @@

# pylint: disable=line-too-long
DEFAULT_ADVISOR = "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_S.gguf?download=true"

PROMPT_TEMPLATE = """
Given the following log snippets, and nothing else, explain what failure, if any, occurred during the build of this package.
{}
Analysis of the failure must be in the format [X] : [Y], where [X] is a log snippet and [Y] is its explanation.
Finally, drawing on information from all snippets, provide a complete explanation of the issue.
Analysis:
"""

SUMMARIZE_PROMPT_TEMPLATE = """
Does the following log contain an error or issue?
Log:
{}
Answer:
"""

CACHE_LOC = "~/.cache/logdetective/"
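
For illustration, a minimal sketch of how these templates are filled; the snippet text below is made up, and both templates expose a single `{}` placeholder:

```python
from logdetective.constants import PROMPT_TEMPLATE, SUMMARIZE_PROMPT_TEMPLATE

# Hypothetical log snippets; in practice they come from an extractor.
snippets = "collect2: error: ld returned 1 exit status\nmake: *** [all] Error 1"

analysis_prompt = PROMPT_TEMPLATE.format(snippets)         # full analysis prompt
chunk_prompt = SUMMARIZE_PROMPT_TEMPLATE.format(snippets)  # per-chunk Yes/No prompt
print(analysis_prompt)
```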
92 changes: 92 additions & 0 deletions logdetective/extractors.py
@@ -0,0 +1,92 @@
import os
import logging

import drain3
from drain3.template_miner_config import TemplateMinerConfig
from llama_cpp import Llama, LlamaGrammar

from logdetective.constants import SUMMARIZE_PROMPT_TEMPLATE
from logdetective.utils import get_chunks

LOG = logging.getLogger("logdetective")


class LLMExtractor:
    """
    A class that extracts relevant information from logs using a language model.
    """
    def __init__(self, model_path: str, verbose: bool, n_lines: int = 2):
        # n_ctx=0 tells llama.cpp to use the context length stored in the model
        self.model = Llama(
            model_path=model_path,
            n_ctx=0,
            verbose=verbose)
        self.n_lines = n_lines
        # Constrain generation to a literal "Yes" or "No" verdict
        self.grammar = LlamaGrammar.from_string(
            "root ::= (\"Yes\" | \"No\")", verbose=False)

    def __call__(self, log: str, neighbors: bool = False) -> str:
        chunks = self.rate_chunks(log)
        out = self.create_extract(chunks, neighbors)
        return out

    def rate_chunks(self, log: str) -> list[tuple]:
        """Scan the log with the model and store a verdict for each block.
        :param log: log file content
        """
        results = []
        log_lines = log.split("\n")

        for i in range(0, len(log_lines), self.n_lines):
            block = '\n'.join(log_lines[i:i + self.n_lines])
            # Rate the current block, not the entire log
            prompt = SUMMARIZE_PROMPT_TEMPLATE.format(block)
            out = self.model(prompt, max_tokens=7, grammar=self.grammar)
            out = f"{out['choices'][0]['text']}\n"
            results.append((block, out))

        return results

    def create_extract(self, chunks: list[tuple], neighbors: bool = False) -> str:
        """Concatenate the chunks the model rated as interesting,
        optionally including their immediate neighbors.
        """
        interesting = []
        summary = ""
        # pylint: disable=consider-using-enumerate
        for i in range(len(chunks)):
            if chunks[i][1].startswith("Yes"):
                interesting.append(i)
                if neighbors:
                    interesting.extend([max(i - 1, 0), min(i + 1, len(chunks) - 1)])

        # Deduplicate indices and keep the original chunk order
        for i in sorted(set(interesting)):
            summary += chunks[i][0] + "\n"

        return summary


class DrainExtractor:
    """A class that extracts information from logs using a template miner algorithm.
    """
    def __init__(self, verbose: bool = False, context: bool = False, max_clusters: int = 8):
        config = TemplateMinerConfig()
        config.load(f"{os.path.dirname(__file__)}/drain3.ini")
        config.profiling_enabled = verbose
        config.drain_max_clusters = max_clusters
        self.miner = drain3.TemplateMiner(config=config)
        self.verbose = verbose
        self.context = context

    def __call__(self, log: str) -> str:
        out = ""
        # First pass: train the template miner on every chunk of the log
        for chunk in get_chunks(log):
            processed_line = self.miner.add_log_message(chunk)
            LOG.debug(processed_line)
        # Sort clusters by size, so the most frequent templates come first
        sorted_clusters = sorted(self.miner.drain.clusters, key=lambda it: it.size, reverse=True)
        # Second pass: emit one representative chunk per cluster
        for chunk in get_chunks(log):
            cluster = self.miner.match(chunk, "always")
            if cluster in sorted_clusters:
                out += f"{chunk}\n"
                sorted_clusters.remove(cluster)
        return out
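
For context, a minimal usage sketch of both extractors. Assumptions: a `drain3.ini` sits next to the module (as `DrainExtractor.__init__` expects), `build.log` is a hypothetical local file, and the GGUF model path is made up.

```python
from logdetective.extractors import DrainExtractor, LLMExtractor

with open("build.log", encoding="utf-8") as f:  # hypothetical log file
    log = f.read()

# Drain-based summarization: one representative chunk per template cluster,
# most frequent template first.
drain = DrainExtractor(verbose=False, max_clusters=8)
print(drain(log))

# LLM-based rating of 8-line blocks; requires a local GGUF model file.
llm = LLMExtractor(model_path="mistral-7b-instruct-v0.2.Q4_K_S.gguf",
                   verbose=False, n_lines=8)
print(llm(log))
```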