Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
<div align="center">

# Cordon

[![PyPI version](https://img.shields.io/pypi/v/cordon.svg)](https://pypi.org/project/cordon/)
[![License](https://img.shields.io/github/license/calebevans/cordon.svg)](https://github.com/calebevans/cordon/blob/main/LICENSE)
[![PyPI Downloads](https://static.pepy.tech/personalized-badge/cordon?period=total&units=INTERNATIONAL_SYSTEM&left_color=GREY&right_color=BLUE&left_text=downloads)](https://pepy.tech/projects/cordon)
[![Security Rating](https://sonarcloud.io/api/project_badges/measure?project=calebevans_cordon&metric=security_rating)](https://sonarcloud.io/summary/new_code?id=calebevans_cordon)
[![Quality Gate Status](https://sonarcloud.io/api/project_badges/measure?project=calebevans_cordon&metric=alert_status)](https://sonarcloud.io/summary/new_code?id=calebevans_cordon)

</div>

Cordon uses transformer embeddings and density scoring to identify semantically unusual patterns in large log files, reducing massive logs to the most anomalous sections for analysis. Repetitive patterns (even errors) are considered "normal background." Cordon surfaces unusual, rare, or clustered events that stand out semantically from the bulk of the logs.

Expand Down
5 changes: 3 additions & 2 deletions src/cordon/analysis/thresholder.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from collections.abc import Sequence
from math import isclose

import numpy as np

Expand Down Expand Up @@ -59,11 +60,11 @@ def select_significant(
# Single percentile mode (original behavior)

# all windows, sorted by score descending
if config.anomaly_percentile == 1.0:
if isclose(config.anomaly_percentile, 1.0):
return sorted(scored_windows, key=lambda window: window.score, reverse=True)

# no windows requested
if config.anomaly_percentile == 0.0:
if isclose(config.anomaly_percentile, 0.0):
return []

# calculate percentile threshold
Expand Down
59 changes: 34 additions & 25 deletions src/cordon/cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python3
import argparse
import sys
from math import isclose
from pathlib import Path

from cordon import AnalysisConfig, SemanticLogAnalyzer
Expand Down Expand Up @@ -207,6 +208,36 @@ def analyze_file(
print()


def _print_backend_info(config: AnalysisConfig) -> None:
    """Print the active embedding backend and its relevant settings.

    Which detail lines appear depends on ``config.backend``; an
    unrecognized backend prints only the backend name.
    """
    lines = [f"Backend: {config.backend}"]
    backend = config.backend
    if backend == "sentence-transformers":
        lines.append(f"Model: {config.model_name}")
        lines.append(f"Device: {config.device or 'auto'}")
    elif backend == "llama-cpp":
        lines.append(f"Model path: {config.model_path}")
        lines.append(f"GPU layers: {config.n_gpu_layers}")
        # Thread count is optional (None/0 means auto-detect), so only show it when set.
        if config.n_threads:
            lines.append(f"Threads: {config.n_threads}")
    elif backend == "remote":
        lines.append(f"Model: {config.model_name}")
        # Endpoint may be unset when the backend uses its default URL.
        if config.endpoint:
            lines.append(f"Endpoint: {config.endpoint}")
        lines.append(f"Timeout: {config.request_timeout}s")
    for line in lines:
        print(line)


def _print_filtering_mode(config: AnalysisConfig) -> None:
    """Print which window-filtering mode is active: range or percentile."""
    range_min = config.anomaly_range_min
    if range_min is None:
        message = f"Filtering mode: Percentile (top {config.anomaly_percentile*100:.1f}%)"
    else:
        range_max = config.anomaly_range_max
        # Config validation guarantees min and max are set together,
        # so this assert only narrows the Optional type for the f-string.
        assert range_max is not None
        message = (
            f"Filtering mode: Range (exclude top {range_min*100:.1f}%, "
            f"keep up to {range_max*100:.1f}%)"
        )
    print(message)


def main() -> None:
"""Main entry point for the CLI."""
args = parse_args()
Expand All @@ -221,7 +252,7 @@ def main() -> None:
anomaly_range_min = args.anomaly_range[0]
anomaly_range_max = args.anomaly_range[1]
# Keep default percentile value (not used in range mode)
if args.anomaly_percentile != 0.1:
if not isclose(args.anomaly_percentile, 0.1):
print(
"Warning: --anomaly-percentile is ignored when using --anomaly-range",
file=sys.stderr,
Expand Down Expand Up @@ -253,30 +284,8 @@ def main() -> None:

# create analyzer
print("Initializing analyzer...")
print(f"Backend: {config.backend}")
if config.backend == "sentence-transformers":
print(f"Model: {config.model_name}")
print(f"Device: {config.device or 'auto'}")
elif config.backend == "llama-cpp":
print(f"Model path: {config.model_path}")
print(f"GPU layers: {config.n_gpu_layers}")
if config.n_threads:
print(f"Threads: {config.n_threads}")
elif config.backend == "remote":
print(f"Model: {config.model_name}")
if config.endpoint:
print(f"Endpoint: {config.endpoint}")
print(f"Timeout: {config.request_timeout}s")

# Display filtering mode
if config.anomaly_range_min is not None:
# Type narrowing: if min is not None, max is also not None (enforced in config)
assert config.anomaly_range_max is not None
print(
f"Filtering mode: Range (exclude top {config.anomaly_range_min*100:.1f}%, keep up to {config.anomaly_range_max*100:.1f}%)"
)
else:
print(f"Filtering mode: Percentile (top {config.anomaly_percentile*100:.1f}%)")
_print_backend_info(config)
_print_filtering_mode(config)
print()

try:
Expand Down
49 changes: 26 additions & 23 deletions src/cordon/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,27 @@ class AnalysisConfig:

def __post_init__(self) -> None:
    """Validate configuration parameters.

    Runs immediately after dataclass construction so invalid configs
    fail fast.  Each helper raises ValueError on the first invalid
    setting it encounters; helpers are called in this fixed order, so
    core-parameter errors are reported before range or backend errors.
    """
    self._validate_core_params()
    self._validate_anomaly_range()
    self._validate_backend()

def _validate_core_params(self) -> None:
"""Validate core analysis parameters."""
if self.window_size < 1:
raise ValueError("window_size must be >= 1")
if self.k_neighbors < 1:
raise ValueError("k_neighbors must be >= 1")
if not 0.0 <= self.anomaly_percentile <= 1.0:
raise ValueError("anomaly_percentile must be between 0.0 and 1.0")
if self.batch_size < 1:
raise ValueError("batch_size must be >= 1")
if self.scoring_batch_size is not None and self.scoring_batch_size < 1:
raise ValueError("scoring_batch_size must be >= 1 or None for auto-detect")
if self.device is not None and self.device not in ("cuda", "mps", "cpu"):
raise ValueError("device must be 'cuda', 'mps', 'cpu', or None")

# Validate anomaly range parameters
def _validate_anomaly_range(self) -> None:
"""Validate anomaly range parameters."""
if (self.anomaly_range_min is None) != (self.anomaly_range_max is None):
raise ValueError(
"anomaly_range_min and anomaly_range_max must both be set or both be None"
Expand All @@ -54,40 +67,30 @@ def __post_init__(self) -> None:
if self.anomaly_range_min >= self.anomaly_range_max:
raise ValueError("anomaly_range_min must be less than anomaly_range_max")

if self.batch_size < 1:
raise ValueError("batch_size must be >= 1")
if self.scoring_batch_size is not None and self.scoring_batch_size < 1:
raise ValueError("scoring_batch_size must be >= 1 or None for auto-detect")
if self.device is not None and self.device not in ("cuda", "mps", "cpu"):
raise ValueError("device must be 'cuda', 'mps', 'cpu', or None")

# Backend validation
def _validate_backend(self) -> None:
"""Validate backend and backend-specific parameters."""
if self.backend not in ("sentence-transformers", "llama-cpp", "remote"):
raise ValueError(
f"backend must be 'sentence-transformers', 'llama-cpp', or 'remote', got '{self.backend}'"
)

# llama-cpp specific validation
if self.backend == "llama-cpp" and self.model_path is not None:
# If model_path is provided, validate it exists and has correct extension
# If None, LlamaCppVectorizer will auto-download default model
model_file = Path(self.model_path)
if not model_file.exists():
raise ValueError(f"GGUF model file not found: {self.model_path}")

if model_file.suffix != ".gguf":
raise ValueError(f"model_path must be a .gguf file, got: {model_file.suffix}")
self._validate_llama_cpp_model_path()

# llama.cpp parameter validation
if self.n_ctx < 1:
raise ValueError("n_ctx must be >= 1")

if self.n_gpu_layers < -1:
raise ValueError("n_gpu_layers must be >= -1 (-1 for all layers, 0 for CPU-only)")

if self.n_threads is not None and self.n_threads < 1:
raise ValueError("n_threads must be >= 1 or None for auto-detect")

# remote backend validation
if self.request_timeout <= 0:
raise ValueError("request_timeout must be > 0")

def _validate_llama_cpp_model_path(self) -> None:
"""Validate llama.cpp model path exists and has correct extension."""
assert self.model_path is not None
model_file = Path(self.model_path)
if not model_file.exists():
raise ValueError(f"GGUF model file not found: {self.model_path}")
if model_file.suffix != ".gguf":
raise ValueError(f"model_path must be a .gguf file, got: {model_file.suffix}")