-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocessor.py
More file actions
133 lines (103 loc) · 4.71 KB
/
Copy pathprocessor.py
File metadata and controls
133 lines (103 loc) · 4.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
"""
processor.py — main CSV log processing pipeline
Module D: Files + Error Handling
Usage:
python processor.py data/access.log.csv
python processor.py data/access.log.csv --out-dir results/
"""
import argparse
import csv
import logging
from pathlib import Path
from parser import ParseError, parse_row, validate_row
from reporter import compute_stats, write_error_log, write_summary
# ── Logging setup ─────────────────────────────────────────────────────────────
# Root logger prints "<LEVEL> <message>" for INFO and above; the module-level
# logger `log` is what the rest of this file writes to.
_LOG_FORMAT = "%(levelname)s %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
log = logging.getLogger(__name__)
# ── Pipeline ──────────────────────────────────────────────────────────────────
def read_log(path: Path) -> list[dict]:
    """
    Read a CSV log file and return its data rows as dicts.

    The file is decoded with the ``utf-8-sig`` codec (so a leading BOM is
    dropped) and opened with ``newline=""`` as the ``csv`` module requires.
    Without it, universal-newline translation would rewrite line breaks
    embedded in quoted fields before the CSV parser sees them.

    Args:
        path: Path to the CSV file.

    Returns:
        List of raw row dicts as produced by ``csv.DictReader``.

    Raises:
        FileNotFoundError: If the file doesn't exist.
        ValueError: If the file is empty / has no header line, or has a
            header line but no data rows.
    """
    if not path.exists():
        raise FileNotFoundError(f"Log file not found: {path}")

    with path.open("r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        # Reading `fieldnames` pulls the header line from the file; it is
        # None when there is nothing to read.
        if reader.fieldnames is None:
            raise ValueError(f"CSV file is empty or missing headers: {path}")
        rows = list(reader)

    if not rows:
        raise ValueError(f"CSV file has headers but no data rows: {path}")
    return rows
def process(log_path: Path, out_dir: Path) -> None:
    """
    Run the whole pipeline for one log file: read, parse, validate, report.

    Rows for which parsing or validation raises ``ParseError`` are skipped
    and recorded; the remaining rows are passed to ``compute_stats``.
    ``summary.txt`` is always written into *out_dir*, ``parse_errors.txt``
    only when at least one row was rejected. The summary file is then
    printed to stdout.

    Args:
        log_path: Path to the input CSV.
        out_dir: Directory for output files; created if it does not exist.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    summary_path = out_dir / "summary.txt"
    errors_path = out_dir / "parse_errors.txt"

    # Read
    log.info(f"Reading {log_path} ...")
    raw_rows = read_log(log_path)
    log.info(f" {len(raw_rows)} rows found.")

    # Parse + validate: sort every row into accepted / rejected
    accepted = []
    rejected = []
    for record in raw_rows:
        try:
            entry = parse_row(record)
            validate_row(entry)
        except ParseError as exc:
            rejected.append({"row": exc.row, "reason": exc.reason})
            log.warning(f" Skipping malformed row: {exc.reason}")
        else:
            accepted.append(entry)
    log.info(f" {len(accepted)} rows parsed OK, {len(rejected)} errors.")

    # Stats + reports
    stats = compute_stats(accepted)
    write_summary(stats, len(accepted), len(rejected), summary_path)
    log.info(f" Summary written → {summary_path}")
    if rejected:
        write_error_log(rejected, errors_path)
        log.info(f" Error log written → {errors_path}")

    # Echo the summary file to the terminal
    print()
    print(summary_path.read_text(encoding="utf-8"))
# ── CLI ───────────────────────────────────────────────────────────────────────
def main(argv: "list[str] | None" = None) -> None:
    """
    Command-line entry point: parse arguments and run the pipeline.

    Args:
        argv: Argument list to parse. When None (the default) argparse
            falls back to ``sys.argv[1:]``; pass a list to drive the CLI
            programmatically.

    Raises:
        SystemExit: With status 1 when ``process`` fails with an
            ``OSError`` or ``ValueError``. argparse itself also exits on
            ``--help`` or invalid usage.
    """
    parser = argparse.ArgumentParser(
        description="Parse a server access log CSV and generate a summary report.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python processor.py data/access.log.csv
python processor.py data/access.log.csv --out-dir results/
""",
    )
    parser.add_argument("log_file", type=Path, help="Path to the input CSV log file.")
    parser.add_argument(
        "--out-dir", type=Path, default=Path("output"),
        help="Directory for output files (default: output/)",
    )
    args = parser.parse_args(argv)

    try:
        process(args.log_file, args.out_dir)
    except (OSError, ValueError) as e:
        # OSError includes FileNotFoundError as well as PermissionError,
        # IsADirectoryError, etc., so filesystem problems end in a one-line
        # error message instead of a traceback.
        log.error(str(e))
        raise SystemExit(1)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()