Refactoring module layout
Enabling runtime setting of cluster number

Removing poetry import

Reworked help

New arg for clusters

Adding fastapi dependency

Moved inference and model initialization into utils

Adding pydantic for server

Prompt simplified

We no longer need to notify the model about special substrings, now
that we retrieve representative log samples instead of templates.

Exposing n_clusters arg

Restoring local retrieval of logs

Docstrings and import rearrangement

Signed-off-by: Jiri Podivin <jpodivin@redhat.com>
jpodivin committed May 21, 2024
1 parent 323be68 commit 9f2a1f4
Showing 9 changed files with 339 additions and 242 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -31,7 +31,8 @@ To analyze a log file, run the script with the following command line arguments:
- `url` (required): The URL of the log file to be analyzed.
- `--model` (optional, default: "Mistral-7B-Instruct-v0.2-GGUF"): The path or URL of the language model for analysis.
- `--summarizer` (optional, default: "drain"): Choose between LLM and Drain template miner as the log summarizer. You can also provide the path to an existing language model file instead of using a URL.
- - `--n_lines` (optional, default: 5): The number of lines per chunk for LLM analysis. This only makes sense when you are summarizing with LLM.
+ - `--n_lines` (optional, default: 8): The number of lines per chunk for LLM analysis. This only makes sense when you are summarizing with LLM.
+ - `--n_clusters` (optional, default: 8): The number of clusters for Drain to organize log chunks into. This only makes sense when you are summarizing with Drain.

Example usage:

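The full example is collapsed in this diff; a plausible invocation, assuming the package installs a `logdetective` console script (hypothetical name), might be:

```
logdetective https://example.org/build.log --summarizer drain --n_clusters 8
```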
Empty file added logdetective/__init__.py
29 changes: 29 additions & 0 deletions logdetective/constants.py
@@ -0,0 +1,29 @@

# pylint: disable=line-too-long
DEFAULT_ADVISOR = "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_S.gguf?download=true"

PROMPT_TEMPLATE = """
Given the following log snippets, and nothing else, explain what failure, if any, occurred during the build of this package.
{}
Analysis of the failure must be in the format [X] : [Y], where [X] is a log snippet and [Y] is its explanation.
Finally, drawing on information from all snippets, provide a complete explanation of the issue.
Analysis:
"""

SUMMARIZE_PROMPT_TEMPLATE = """
Does the following log contain an error or issue?
Log:
{}
Answer:
"""

CACHE_LOC = "~/.cache/logdetective/"
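
For illustration, a minimal sketch of how these templates are filled; the snippet text below is made up, and both templates expose a single `{}` placeholder:

```python
from logdetective.constants import PROMPT_TEMPLATE, SUMMARIZE_PROMPT_TEMPLATE

# Hypothetical log snippets; in practice they come from an extractor.
snippets = "collect2: error: ld returned 1 exit status\nmake: *** [all] Error 1"

analysis_prompt = PROMPT_TEMPLATE.format(snippets)         # full analysis prompt
chunk_prompt = SUMMARIZE_PROMPT_TEMPLATE.format(snippets)  # per-chunk Yes/No prompt
print(analysis_prompt)
```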
92 changes: 92 additions & 0 deletions logdetective/extractors.py
@@ -0,0 +1,92 @@
import os
import logging

import drain3
from drain3.template_miner_config import TemplateMinerConfig
from llama_cpp import Llama, LlamaGrammar

from logdetective.constants import SUMMARIZE_PROMPT_TEMPLATE
from logdetective.utils import get_chunks

LOG = logging.getLogger("logdetective")


class LLMExtractor:
    """
    A class that extracts relevant information from logs using a language model.
    """
    def __init__(self, model_path: str, verbose: bool, n_lines: int = 2):
        # n_ctx=0 tells llama.cpp to use the context length stored in the model
        self.model = Llama(
            model_path=model_path,
            n_ctx=0,
            verbose=verbose)
        self.n_lines = n_lines
        # Constrain generation to a literal "Yes" or "No" verdict
        self.grammar = LlamaGrammar.from_string(
            "root ::= (\"Yes\" | \"No\")", verbose=False)

    def __call__(self, log: str, neighbors: bool = False) -> str:
        chunks = self.rate_chunks(log)
        out = self.create_extract(chunks, neighbors)
        return out

    def rate_chunks(self, log: str) -> list[tuple]:
        """Scan the log with the model and store a verdict for each block.
        :param log: log file content
        """
        results = []
        log_lines = log.split("\n")

        for i in range(0, len(log_lines), self.n_lines):
            block = '\n'.join(log_lines[i:i + self.n_lines])
            # Rate the current block, not the entire log
            prompt = SUMMARIZE_PROMPT_TEMPLATE.format(block)
            out = self.model(prompt, max_tokens=7, grammar=self.grammar)
            out = f"{out['choices'][0]['text']}\n"
            results.append((block, out))

        return results

    def create_extract(self, chunks: list[tuple], neighbors: bool = False) -> str:
        """Concatenate the chunks the model rated as interesting,
        optionally including their immediate neighbors.
        """
        interesting = []
        summary = ""
        # pylint: disable=consider-using-enumerate
        for i in range(len(chunks)):
            if chunks[i][1].startswith("Yes"):
                interesting.append(i)
                if neighbors:
                    interesting.extend([max(i - 1, 0), min(i + 1, len(chunks) - 1)])

        # Deduplicate indices and keep the original chunk order
        for i in sorted(set(interesting)):
            summary += chunks[i][0] + "\n"

        return summary


class DrainExtractor:
    """A class that extracts information from logs using a template miner algorithm.
    """
    def __init__(self, verbose: bool = False, context: bool = False, max_clusters: int = 8):
        config = TemplateMinerConfig()
        config.load(f"{os.path.dirname(__file__)}/drain3.ini")
        config.profiling_enabled = verbose
        config.drain_max_clusters = max_clusters
        self.miner = drain3.TemplateMiner(config=config)
        self.verbose = verbose
        self.context = context

    def __call__(self, log: str) -> str:
        out = ""
        # First pass: train the template miner on every chunk of the log
        for chunk in get_chunks(log):
            processed_line = self.miner.add_log_message(chunk)
            LOG.debug(processed_line)
        # Sort clusters by size, so the most frequent templates come first
        sorted_clusters = sorted(self.miner.drain.clusters, key=lambda it: it.size, reverse=True)
        # Second pass: emit one representative chunk per cluster
        for chunk in get_chunks(log):
            cluster = self.miner.match(chunk, "always")
            if cluster in sorted_clusters:
                out += f"{chunk}\n"
                sorted_clusters.remove(cluster)
        return out
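
For context, a minimal usage sketch of both extractors. Assumptions: a `drain3.ini` sits next to the module (as `DrainExtractor.__init__` expects), `build.log` is a hypothetical local file, and the GGUF model path is made up.

```python
from logdetective.extractors import DrainExtractor, LLMExtractor

with open("build.log", encoding="utf-8") as f:  # hypothetical log file
    log = f.read()

# Drain-based summarization: one representative chunk per template cluster,
# most frequent template first.
drain = DrainExtractor(verbose=False, max_clusters=8)
print(drain(log))

# LLM-based rating of 8-line blocks; requires a local GGUF model file.
llm = LLMExtractor(model_path="mistral-7b-instruct-v0.2.Q4_K_S.gguf",
                   verbose=False, n_lines=8)
print(llm(log))
```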