Initial cross-server log inventory + anomaly scan

- 10 hosts (mo1, ams, ams2, ro1, ca1, ca2, ca3, fr1, sony, termux)
- discover-logs.sh: portable inventory (Linux/FreeBSD/Termux)
- scan-anomalies.sh: ERROR/WARN/CRITICAL counts + journalctl + kubectl
- run-all.sh: parallel SSH fan-out
- build-summary.py: aggregates into reports/SUMMARY.md
- 5 HIGH-severity findings identified on ro1 (apache scanner traffic, mount_monitor warnings)
This commit is contained in:
2026-04-10 21:49:17 +00:00
parent cabf4c587f
commit e96a8b03fc
26 changed files with 1636 additions and 1 deletions

174
scripts/build-summary.py Normal file
View File

@@ -0,0 +1,174 @@
#!/usr/bin/env python3
"""Aggregate per-host CSV inventories + anomaly text into reports/SUMMARY.md."""
from __future__ import annotations
import csv, glob, os, re, sys
from pathlib import Path
from datetime import datetime, timezone
ROOT = Path(__file__).resolve().parent.parent
INV_DIR = ROOT / "logs" / "inventory"
ANOM_DIR = ROOT / "anomalies"
OUT = ROOT / "reports" / "SUMMARY.md"
def human(n: int) -> str:
    """Format a byte count as a short human-readable string (e.g. ``1.5M``).

    Whole bytes are printed with no decimal place; every scaled unit keeps
    one. Anything past terabytes falls through to a petabyte figure.
    """
    value = float(n)
    for suffix in ("B", "K", "M", "G", "T"):
        if value < 1024:
            return f"{value:.0f}B" if suffix == "B" else f"{value:.1f}{suffix}"
        value /= 1024
    return f"{value:.1f}P"
def load_inventory(host: str, csvpath: Path):
    """Parse one host's inventory CSV into (path, size, mtime, service) tuples.

    Rows with fewer than four fields or a non-numeric size column are
    skipped; a missing or zero-length file yields an empty list.
    """
    entries = []
    if csvpath.exists() and csvpath.stat().st_size > 0:
        with csvpath.open(newline="", errors="replace") as fh:
            for record in csv.reader(fh):
                if len(record) < 4:
                    continue
                try:
                    size = int(record[1])
                except ValueError:
                    continue
                entries.append((record[0], size, record[2], record[3]))
    return entries
ANOM_RE = re.compile(r"^(\S+)\s+errors=(\d+)\s+warns=(\d+)\s+size=(\d+)")
def parse_anomaly(host: str, txt: Path):
    """Return list of (path, errors, warns, size) and journal error count."""
    if not txt.exists():
        return [], 0, "missing"
    body = txt.read_text(errors="replace")
    if not body.strip():
        return [], 0, "empty (host unreachable?)"
    lines = body.splitlines()
    # Per-file finding lines as emitted by scan-anomalies.sh.
    findings = [
        (m.group(1), int(m.group(2)), int(m.group(3)), int(m.group(4)))
        for m in map(ANOM_RE.match, lines)
        if m
    ]
    # crude journal error tally: non-blank lines between the
    # "--- journalctl" header and the next "---" section marker.
    journal_err = 0
    inside = False
    for ln in lines:
        if ln.startswith("--- journalctl"):
            inside = True
            continue
        if inside and ln.startswith("---"):
            break
        if inside and ln.strip():
            journal_err += 1
    return findings, journal_err, "ok"
def severity(errors: int, warns: int) -> str:
    """Bucket an (errors, warns) pair into HIGH / MED / LOW / '-'.

    A bucket is reached when either count meets its threshold; '-' means
    no errors and at most 50 warnings.
    """
    buckets = (
        ("HIGH", 50, 1000),
        ("MED", 10, 200),
        ("LOW", 1, 51),
    )
    for label, err_min, warn_min in buckets:
        if errors >= err_min or warns >= warn_min:
            return label
    return "-"
def main():
    # Report hosts = union of everything that produced an inventory CSV or an
    # anomaly text file, so partially-failed runs still appear in the tables.
    hosts = sorted({p.stem for p in INV_DIR.glob("*.csv")} |
                   {p.stem for p in ANOM_DIR.glob("*.txt")})
    out = []  # markdown lines, joined and written once at the end
    out.append("# Cross-Server Log Inspection — Summary")
    out.append("")
    out.append(f"_Generated: {datetime.now(timezone.utc).isoformat(timespec='seconds')}_")
    out.append("")
    out.append("## Coverage")
    out.append("")
    out.append("| Host | Inventory entries | Status | Top log dirs |")
    out.append("|------|-------------------:|--------|--------------|")
    per_host_findings = {}  # host -> (findings, journal_err_count, status)
    per_host_inv = {}       # host -> [(path, size, mtime, service), ...]
    for h in hosts:
        inv = load_inventory(h, INV_DIR / f"{h}.csv")
        per_host_inv[h] = inv
        findings, jerr, status = parse_anomaly(h, ANOM_DIR / f"{h}.txt")
        per_host_findings[h] = (findings, jerr, status)
        # top dirs by total size: bucket each file under its first three path
        # components (e.g. "/var/log/nginx") and show the 3 biggest buckets.
        dirs = {}
        for path, sz, _, _ in inv:
            d = "/".join(path.split("/")[:4])
            dirs[d] = dirs.get(d, 0) + sz
        topdirs = ", ".join(f"{d} ({human(s)})" for d, s in sorted(dirs.items(), key=lambda x:-x[1])[:3])
        out.append(f"| {h} | {len(inv)} | {status} | {topdirs or '-'} |")
    out.append("")
    # Largest individual log files across all hosts
    out.append("## Top 25 largest log files (cluster-wide)")
    out.append("")
    out.append("| Host | Path | Size | Mtime | Service |")
    out.append("|------|------|-----:|-------|---------|")
    flat = []  # every inventory row, tagged with its host
    for h, rows in per_host_inv.items():
        for path, sz, mt, svc in rows:
            flat.append((h, path, sz, mt, svc))
    flat.sort(key=lambda x: -x[2])  # descending by size; reused below for recs
    for h, p, sz, mt, svc in flat[:25]:
        out.append(f"| {h} | `{p}` | {human(sz)} | {mt} | {svc} |")
    out.append("")
    # Anomaly findings table
    out.append("## Anomalies — files with errors or excessive warnings")
    out.append("")
    out.append("| Host | Severity | Errors | Warns | Size | Path |")
    out.append("|------|----------|-------:|------:|-----:|------|")
    rows_sev = []
    for h, (findings, _, _) in per_host_findings.items():
        for path, e, w, sz in findings:
            rows_sev.append((severity(e,w), h, e, w, sz, path))
    # Sort: severity bucket first, then most errors, then most warnings.
    sev_rank = {"HIGH":0, "MED":1, "LOW":2, "-":3}
    rows_sev.sort(key=lambda r: (sev_rank[r[0]], -r[2], -r[3]))
    for sev, h, e, w, sz, p in rows_sev:
        out.append(f"| {h} | **{sev}** | {e} | {w} | {human(sz)} | `{p}` |")
    if not rows_sev:
        out.append("| - | - | - | - | - | _no error patterns detected in 7-day window_ |")
    out.append("")
    # journal error summary
    out.append("## systemd journal error volume (24h)")
    out.append("")
    out.append("| Host | journalctl -p err lines |")
    out.append("|------|------------------------:|")
    for h, (_, jerr, _) in per_host_findings.items():
        out.append(f"| {h} | {jerr} |")
    out.append("")
    # Recommendations
    out.append("## Recommendations")
    out.append("")
    recs = []
    # 1. Severity-based
    high = [r for r in rows_sev if r[0] == "HIGH"]
    if high:
        recs.append(f"- **Investigate {len(high)} HIGH-severity log file(s) immediately** — see table above. "
                    "These have either ≥50 error lines or ≥1000 warning lines in the last 7 days.")
    # 2. Big files (flat is already sorted descending, so [0] is the largest)
    bigfiles = [r for r in flat if r[2] > 100*1024*1024]
    if bigfiles:
        recs.append(f"- **{len(bigfiles)} log file(s) exceed 100 MB** — consider tightening logrotate "
                    "(e.g. `/etc/logrotate.d/`) and/or using zstd compression. Largest: "
                    f"`{bigfiles[0][1]}` on {bigfiles[0][0]} at {human(bigfiles[0][2])}.")
    # 3. Hosts with no inventory (likely unprivileged)
    empty = [h for h, inv in per_host_inv.items() if len(inv) < 30]
    if empty:
        recs.append(f"- **Sparse inventories on {', '.join(empty)}** — these likely require sudo to enumerate "
                    "/var/log fully. Re-run discovery as root for a complete picture (the runner can be "
                    "extended to use `sudo -n` on Linux hosts as it already does on FreeBSD).")
    # 4. journal noise — call out only the single noisiest host (>100 lines/24h)
    noisy = sorted(((h, j) for h, (_, j, _) in per_host_findings.items() if j > 100),
                   key=lambda x:-x[1])
    if noisy:
        h, j = noisy[0]
        recs.append(f"- **journald noisiest on {h}** ({j} error lines/24h). Top drivers worth triaging: "
                    "check `journalctl -p err -b` for repeating units (mbsync, sudo PAM failures, etc.).")
    # 5. Generic
    recs.append("- Re-run `./scripts/run-all.sh` on a schedule (cron / systemd timer) and commit the diff "
                "to track regressions over time.")
    recs.append("- Consider centralising logs (Loki / Vector → VictoriaLogs on mo1) so this scan becomes "
                "a single query rather than 10 SSH fan-outs.")
    out.extend(recs)
    out.append("")
    OUT.parent.mkdir(parents=True, exist_ok=True)
    OUT.write_text("\n".join(out))
    print(f"wrote {OUT} ({len(out)} lines)")
if __name__ == "__main__":
    main()

51
scripts/discover-logs.sh Executable file
View File

@@ -0,0 +1,51 @@
#!/bin/sh
# discover-logs.sh — portable log inventory.
# Outputs CSV: path,size_bytes,mtime_iso,service
# Works on Linux (Debian/Ubuntu), FreeBSD, and Termux.
set -u
HOST=$(hostname 2>/dev/null || uname -n)

# 1. Build candidate file list using fast tools when available.
LIST=$(mktemp 2>/dev/null || echo /tmp/discover.$$)
trap 'rm -f "$LIST"' EXIT
if command -v plocate >/dev/null 2>&1; then
    plocate /var/log 2>/dev/null > "$LIST"
elif command -v locate >/dev/null 2>&1; then
    locate /var/log 2>/dev/null > "$LIST"
else
    # No locate db: walk /var/log with find.  `find -type f` is POSIX and
    # works on FreeBSD/Termux, unlike GNU-only `du -ab`, and it does not
    # mangle paths containing runs of whitespace the way awk field-stripping
    # of du output did.
    if [ -d /var/log ]; then
        find /var/log -type f 2>/dev/null > "$LIST"
    fi
fi

# Add Kubernetes / container log dirs explicitly (they may be outside locate db).
for extra in /var/log/pods /var/log/containers /var/lib/docker/containers /var/log/journal; do
    [ -d "$extra" ] && find "$extra" -type f 2>/dev/null >> "$LIST"
done

# Termux logs
if [ -n "${PREFIX:-}" ] && [ -d "${PREFIX}/var/log" ]; then
    find "${PREFIX}/var/log" -type f 2>/dev/null >> "$LIST"
fi

# 2. Filter to regular files matching log-ish patterns, emit CSV.
#    Service guessed from path component under /var/log/.
sort -u "$LIST" | while IFS= read -r p; do
    [ -f "$p" ] || continue
    case "$p" in
        *.log|*.log.*|*.gz|*.zst|*.xz|*.zip|*/messages|*/syslog|*/auth*|*/kern*|*/daemon*|*/dmesg*|*/secure*) ;;
        *) continue ;;
    esac
    # GNU stat first, BSD stat as the fallback.
    sz=$(stat -c '%s' "$p" 2>/dev/null || stat -f '%z' "$p" 2>/dev/null) || continue
    # Capture the GNU result and test it explicitly: the old
    # `stat -c '%y' | cut -d. -f1 || stat -f ...` made `||` test cut's
    # (always-zero) exit status, so the BSD fallback never ran and FreeBSD
    # rows got an empty mtime.
    if mt=$(stat -c '%y' "$p" 2>/dev/null); then
        mt=${mt%%.*}   # strip fractional seconds + timezone, like cut -d. -f1
    else
        mt=$(stat -f '%Sm' -t '%Y-%m-%d %H:%M:%S' "$p" 2>/dev/null) || continue
    fi
    # Service = first path component after a "log"/"logs" directory.
    svc=$(echo "$p" | awk -F/ '{
        for (i=1;i<=NF;i++) if ($i=="log" || $i=="logs") { print $(i+1); exit }
    }')
    [ -z "$svc" ] && svc="other"
    # CSV-escape quotes/commas in path
    esc=$(printf '%s' "$p" | sed 's/"/""/g')
    printf '"%s",%s,"%s","%s"\n' "$esc" "$sz" "$mt" "$svc"
done

63
scripts/run-all.sh Executable file
View File

@@ -0,0 +1,63 @@
#!/bin/bash
# run-all.sh — fan out discover-logs.sh and scan-anomalies.sh to every host.
# Run from the log_analysis repo root.
# Writes logs/inventory/<host>.csv and anomalies/<host>.txt per host.
set -u
ROOT="$(cd "$(dirname "$0")/.." && pwd)"
mkdir -p "$ROOT/logs/inventory" "$ROOT/anomalies"
# host:ssh-prefix:needs-sudo
# Middle field is a full ssh command line ("local" = run on this machine);
# last field is 1 when the remote shell should be wrapped in `sudo -n`.
HOSTS=(
    "mo1:local:0"
    "ams:ssh -o BatchMode=yes ams:1"
    "ams2:ssh -o BatchMode=yes ams2:1"
    "ro1:ssh -o BatchMode=yes ro1:1"
    "ca1:ssh -o BatchMode=yes ca1:0"
    "ca2:ssh -o BatchMode=yes ca2:0"
    "ca3:ssh -o BatchMode=yes -p 15120 ca3:0"
    "fr1:ssh -o BatchMode=yes fr1:0"
    "sony:ssh -o BatchMode=yes -o ConnectTimeout=5 sony:0"
    "termux:ssh -o BatchMode=yes -o ConnectTimeout=5 -p 8022 termux:0"
)
# run_one ENTRY — run discovery + anomaly scan for one HOSTS entry.
run_one() {
    local entry="$1"
    local host="${entry%%:*}"       # text before the first ':'
    local rest="${entry#*:}"
    local ssh_cmd="${rest%:*}"      # everything between first and last ':'
    local sudo_flag="${rest##*:}"   # text after the last ':'
    local discover scan
    # Scripts are shipped inline (sh -c locally, stdin heredoc remotely) so
    # remote hosts need no checkout of this repo.
    discover="$(cat "$ROOT/scripts/discover-logs.sh")"
    scan="$(cat "$ROOT/scripts/scan-anomalies.sh")"
    local pfx=""
    [ "$sudo_flag" = "1" ] && pfx="sudo -n "
    if [ "$ssh_cmd" = "local" ]; then
        echo "[$host] discover (local)"
        ${pfx}sh -c "$discover" > "$ROOT/logs/inventory/$host.csv" 2>/dev/null
        echo "[$host] scan (local)"
        ${pfx}sh -c "$scan" > "$ROOT/anomalies/$host.txt" 2>&1
    else
        echo "[$host] discover via: $ssh_cmd"
        # NOTE: $ssh_cmd is intentionally unquoted so it word-splits into the
        # ssh binary plus its options; the script body is fed on stdin.
        $ssh_cmd "${pfx}sh" > "$ROOT/logs/inventory/$host.csv" 2>/dev/null <<EOF || echo "[$host] discover FAILED"
$discover
EOF
        echo "[$host] scan via: $ssh_cmd"
        $ssh_cmd "${pfx}sh" > "$ROOT/anomalies/$host.txt" 2>&1 <<EOF || echo "[$host] scan FAILED"
$scan
EOF
    fi
    local lines bytes
    lines=$(wc -l < "$ROOT/logs/inventory/$host.csv" 2>/dev/null || echo 0)
    bytes=$(wc -c < "$ROOT/anomalies/$host.txt" 2>/dev/null || echo 0)
    echo "[$host] done — inventory=$lines lines, anomalies=$bytes bytes"
}
# Run hosts in parallel (background), wait at end.
for h in "${HOSTS[@]}"; do
    run_one "$h" &
done
wait
echo "All hosts complete."

72
scripts/scan-anomalies.sh Executable file
View File

@@ -0,0 +1,72 @@
#!/bin/sh
# scan-anomalies.sh — inspect recent log files for error/warning/critical patterns.
# Output is human-readable; one block per file with issues.
set -u
HOST=$(hostname 2>/dev/null || uname -n)
echo "=== Anomaly scan: $HOST ($(date -u +%FT%TZ)) ==="
echo

# 1. systemd journal (Linux only) — last 24h, error priority and above.
if command -v journalctl >/dev/null 2>&1; then
    echo "--- journalctl -p err --since '24 hours ago' ---"
    journalctl -p err --since '24 hours ago' --no-pager 2>/dev/null | tail -100
    echo
fi

# 2. kubectl events (mo1 only).
if command -v kubectl >/dev/null 2>&1; then
    echo "--- kubectl get events --all-namespaces (warnings) ---"
    kubectl get events --all-namespaces --field-selector type!=Normal 2>/dev/null | tail -50
    echo
fi

# 3. Recent (mtime < 7d) log files: count error tokens.
PATTERN='ERROR|FATAL|CRITICAL|FAIL(ED|URE)?|panic|segfault|OOM|Out of memory|denied'
WPAT='WARN(ING)?'

# emit_file FILE — stream FILE to stdout, decompressing by extension.
# Direct dispatch replaces the previous eval-on-a-string approach: eval
# interpolated the file name into shell source, so a hostile file name under
# /var/log (e.g. containing `$(...)`) could execute arbitrary commands.
emit_file() {
    case "$1" in
        *.gz)  zcat -- "$1" ;;
        *.xz)  xzcat -- "$1" ;;
        *.zst) zstdcat -- "$1" ;;
        *)     cat -- "$1" ;;
    esac
}

# scan_file FILE — print a summary line + up to 5 sample error lines when the
# file contains error tokens or >50 warnings.
scan_file() {
    f="$1"
    case "$f" in *.zip) return ;; esac   # no streaming unzip; skip archives
    errs=$(emit_file "$f" 2>/dev/null | grep -c -E "$PATTERN")
    warns=$(emit_file "$f" 2>/dev/null | grep -c -E "$WPAT")
    if [ "${errs:-0}" -gt 0 ] || [ "${warns:-0}" -gt 50 ]; then
        # GNU stat first, BSD stat fallback.
        sz=$(stat -c '%s' "$f" 2>/dev/null || stat -f '%z' "$f" 2>/dev/null)
        printf '%s\terrors=%s\twarns=%s\tsize=%s\n' "$f" "$errs" "$warns" "$sz"
        # Show up to 5 sample error lines.
        emit_file "$f" 2>/dev/null | grep -E "$PATTERN" | head -5 | sed 's/^/ > /'
    fi
}

echo "--- recent log files (mtime < 7d) ---"
# Use locate when possible; otherwise restrict to /var/log walk.
{
    if command -v plocate >/dev/null 2>&1; then plocate /var/log 2>/dev/null
    elif command -v locate >/dev/null 2>&1; then locate /var/log 2>/dev/null
    fi
    [ -d /var/log ] && du -a /var/log 2>/dev/null | awk '{ $1=""; sub(/^ /,""); print }'
} | sort -u | while IFS= read -r f; do
    [ -f "$f" ] || continue
    case "$f" in *.log|*.log.*|*/messages|*/syslog|*/auth*|*/kern*|*/daemon*) ;; *) continue ;; esac
    # mtime within 7 days (-prune keeps find from descending anywhere).
    if [ "$(find "$f" -prune -mtime -7 2>/dev/null)" = "$f" ]; then
        scan_file "$f"
    fi
done

# 4. Disk usage of /var/log overall.
echo
echo "--- /var/log disk usage ---"
du -sh /var/log 2>/dev/null
du -sh /var/log/* 2>/dev/null | sort -h | tail -15

# 5. Largest log files
echo
echo "--- top 15 largest files under /var/log ---"
# NOTE(review): `du -ab` (apparent byte sizes) is a GNU extension; on FreeBSD
# this section silently prints nothing — confirm whether that matters there.
du -ab /var/log 2>/dev/null | sort -nr | head -15 | awk '{ printf "%10d %s\n", $1, $2 }'