# recommend_limits.py #1
#
# Web-export navigation residue (Perforce workspace browser listing),
# commented out so the script remains valid Python:
#   • //
#   • guest/
#   • russell_jackson/
#   • sdp/
#   • Server/
#   • Unix/
#   • p4/
#   • common/
#   • bin/
#   • recommend_limits.py
#   • View
#   • Commits
#   • Open Download .zip Download (34 KB)
#!/usr/bin/env python3
"""
Perforce Group Limits Recommender

Analyzes a Perforce server log (P4LOG with server=3 tracking) and recommends
baseline group limits (MaxScanRows, MaxResults, MaxLockTime, MaxMemory) that
would allow normal user operations while blocking resource-intensive commands.

The approach:
  1. Parse all completed command blocks from the log.
  2. Compute per-command totals for scan rows, lock time, and memory.
  3. Build percentile distributions and identify the "knee" where commands
     transition from normal to problematic.
  4. Recommend limits at a configurable percentile (default: p99) so that
     99% of commands succeed and only the worst 1% are blocked.
  5. Print a report showing the distribution, the recommended limits,
     and the specific commands that would be blocked.

Usage:
    python recommend_limits.py <logfile> [options]

Options:
    --percentile N    Target percentile for limits (default: 99)
    --top N           Show top N blocked commands per metric (default: 15)
    --csv FILE        Export per-command data to CSV for further analysis
    --sample N        Only parse first N lines (for quick testing on huge logs)
    --quiet           Suppress progress messages
"""

import argparse
import csv
import math
import re
import sys
import time
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, List, Optional, Tuple


# ---------------------------------------------------------------------------
# Data model
# ---------------------------------------------------------------------------

@dataclass
class CommandRecord:
    """Metrics extracted from one completed command block.

    Fields are filled in incrementally by parse_log(): the identity
    fields come from the command header line, and the counters are
    accumulated from the '---' tracking lines that follow it.
    """
    # --- Identity (from the header line) ---
    timestamp: Optional[datetime] = None
    pid: int = 0
    user: str = ""
    client: str = ""
    ip_address: str = ""
    app: str = ""
    command: str = ""  # first word of the command text (the verb)
    args: str = ""     # full command text, including the verb

    # --- Overall command cost ---
    lapse_seconds: float = 0.0   # wall-clock duration (from lapse/completed)
    memory_cmd_mb: float = 0.0   # from '--- memory cmd/proc Xmb/Ymb' (cmd part)
    memory_proc_mb: float = 0.0  # from the proc part of the same line

    # --- RPC traffic (from '--- rpc msgs/size in+out ...') ---
    rpc_msgs_in: int = 0
    rpc_msgs_out: int = 0
    rpc_size_in_mb: float = 0.0
    rpc_size_out_mb: float = 0.0
    rpc_snd_seconds: float = 0.0
    rpc_rcv_seconds: float = 0.0

    # Accumulated across all db.* tables for this command
    total_rows_scan: int = 0
    total_rows_get: int = 0
    total_rows_pos: int = 0
    total_rows_put: int = 0
    total_rows_del: int = 0

    # Lock times in ms, accumulated across all tables
    total_read_lock_wait_ms: float = 0.0
    total_read_lock_held_ms: float = 0.0
    total_write_lock_wait_ms: float = 0.0
    total_write_lock_held_ms: float = 0.0

    # Max individual table lock times
    max_read_lock_wait_ms: float = 0.0
    max_read_lock_held_ms: float = 0.0
    max_write_lock_wait_ms: float = 0.0
    max_write_lock_held_ms: float = 0.0
    max_lock_table: str = ""  # table that produced the max write-lock-held time

    # Peek (held under shared/intent lock) times - for MaxLockTime
    total_peek_held_ms: float = 0.0
    max_peek_held_ms: float = 0.0

    # Pages read (correlates with I/O pressure)
    total_pages_in: int = 0

    @property
    def total_scan_rows(self) -> int:
        """Total rows scanned (get + pos + scan) - maps to MaxScanRows."""
        return self.total_rows_get + self.total_rows_pos + self.total_rows_scan

    @property
    def max_lock_time_ms(self) -> float:
        """Longest lock held on any single table - maps to MaxLockTime.

        MaxLockTime limits how long any single database table can be locked
        during a scan. We use the maximum of:
        - max read lock held
        - max write lock held
        - max peek held
        """
        return max(
            self.max_read_lock_held_ms,
            self.max_write_lock_held_ms,
            self.max_peek_held_ms,
        )


# ---------------------------------------------------------------------------
# Parser
# ---------------------------------------------------------------------------

# Header: timestamp pid N user@client ip/proxy [app] 'command args'
# The IP field can be ip or ip/proxy-ip
# Groups: 1=timestamp 2=pid 3=user 4=client 5=ip 6=app 7=command 8=trailer
RE_HEADER = re.compile(
    r'^\s*(\d{4}/\d{2}/\d{2}\s+\d{2}:\d{2}:\d{2})\s+'
    r'pid\s+(\d+)\s+'
    r'(\S+?)@(\S+)\s+'
    r'([\d.:/a-fA-F]+(?:/[\d.:/a-fA-F]+)?)\s+'
    r'\[(.+?)\]\s*'
    r"'(.+)'(.*)$"
)

# Completed line: timestamp pid N completed NNNs
# Groups: 1=timestamp 2=pid 3=duration 4=unit ('s' or 'ms')
RE_COMPLETED = re.compile(
    r'^\s*(\d{4}/\d{2}/\d{2}\s+\d{2}:\d{2}:\d{2})\s+'
    r'pid\s+(\d+)\s+completed\s+(\d+(?:\.\d+)?)(s|ms)'
)

# '--- lapse Ns' / '--- lapse Nms': command duration tracking line.
RE_LAPSE = re.compile(r'^---\s+lapse\s+(\d*\.?\d+)(s|ms)')
# '--- memory cmd/proc Xmb/Ymb': command- and process-level memory.
RE_MEMORY = re.compile(
    r'^---\s+memory\s+cmd/proc\s+(\d+(?:\.\d+)?)mb/(\d+(?:\.\d+)?)mb'
)
# '--- rpc msgs/size in+out ...': message counts/sizes; the himarks and
# snd/rcv segments are optional (groups 5/6 may be None).
RE_RPC = re.compile(
    r'^---\s+rpc\s+msgs/size\s+in\+out\s+'
    r'(\d+)\+(\d+)/(\d+(?:\.\d+)?)mb\+(\d+(?:\.\d+)?)mb'
    r'(?:\s+himarks\s+\d+/\d+)?'
    r'(?:\s+snd/rcv\s+(\d*\.?\d+)s/(\d*\.?\d+)s)?'
)
# '--- db.<table>': starts a per-table stats section.
RE_DB_TABLE = re.compile(r'^---\s+(db\.\w+)\s*$')
# '--- pages in+out+cached N+N+N' within a db.* section.
RE_PAGES = re.compile(
    r'^---\s+pages\s+in\+out\+cached\s+(\d+)\+(\d+)\+(\d+)'
)
# '--- locks read/write R/W rows get+pos+scan put+del G+P+S U+D'.
RE_LOCKS_ROWS = re.compile(
    r'^---\s+locks\s+read/write\s+(\d+)/(\d+)\s+'
    r'rows\s+get\+pos\+scan\s+put\+del\s+'
    r'(\d+)\+(\d+)\+(\d+)\s+(\d+)\+(\d+)'
)
# '--- total lock wait+held read/write Wms+Hms/Wms+Hms'.
RE_TOTAL_LOCK = re.compile(
    r'^---\s+total\s+lock\s+wait\+held\s+read/write\s+'
    r'(\d+)ms\+(\d+)ms/(\d+)ms\+(\d+)ms'
)
# '--- max lock wait+held read/write Wms+Hms/Wms+Hms'.
RE_MAX_LOCK = re.compile(
    r'^---\s+max\s+lock\s+wait\+held\s+read/write\s+'
    r'(\d+)ms\+(\d+)ms/(\d+)ms\+(\d+)ms'
)
# '--- peek count N wait+held total/max Wms+Hms/Wms+Hms'.
RE_PEEK = re.compile(
    r'^---\s+peek\s+count\s+\d+\s+'
    r'wait\+held\s+total/max\s+'
    r'(\d+)ms\+(\d+)ms/(\d+)ms\+(\d+)ms'
)
# '--- filetotals ...': recognized only to be skipped.
RE_FILETOTALS = re.compile(r'^---\s+filetotals')


def parse_log(filepath: str, max_lines: int = 0,
              quiet: bool = False) -> List[CommandRecord]:
    """Parse a Perforce server log and return completed command records.

    This parser handles the interleaved nature of P4LOG output where
    multiple PIDs write concurrently. It tracks state per PID. A record
    is finalized when its PID is reused by a new command's header, or
    at end of file; only records that have both a command name and a
    positive lapse time are kept.

    Args:
        filepath: path to the server log. Read as UTF-8 with bad bytes
            replaced, so binary noise cannot abort the parse.
        max_lines: if non-zero, stop after this many lines.
        quiet: if True, suppress stderr progress output.

    Returns:
        List of CommandRecord, in completion order.
    """
    records: List[CommandRecord] = []
    # Track in-flight commands by PID
    by_pid: Dict[int, CommandRecord] = {}
    # Current db table context per PID
    db_table_ctx: Dict[int, str] = {}
    # Current PID context (last header or tracking line seen)
    current_pid: Optional[int] = None
    line_count = 0
    t0 = time.time()
    last_progress = t0  # NOTE(review): currently unused

    def emit(pid: int):
        """Emit the record for a PID and remove it from tracking."""
        if pid in by_pid:
            rec = by_pid.pop(pid)
            # Drop incomplete blocks: no header seen (empty command) or
            # no duration ever reported.
            if rec.command and rec.lapse_seconds > 0:
                records.append(rec)
            db_table_ctx.pop(pid, None)

    try:
        with open(filepath, 'r', encoding='utf-8', errors='replace') as fh:
            for line in fh:
                line_count += 1
                if max_lines and line_count > max_lines:
                    break

                # Progress heartbeat every 5M lines, rewritten in place
                # on stderr ('\r', no newline).
                if not quiet and line_count % 5_000_000 == 0:
                    now = time.time()
                    elapsed = now - t0
                    rate = line_count / elapsed if elapsed > 0 else 0
                    print(
                        f"\r  Parsed {line_count:,} lines "
                        f"({rate:,.0f} lines/s, "
                        f"{len(records):,} commands) ...",
                        end='', flush=True, file=sys.stderr
                    )

                stripped = line.strip()
                # Skip blank lines and the bare severity-banner lines that
                # precede each logged event.
                if not stripped or stripped in (
                    'Perforce server info:', 'Perforce server error:'
                ):
                    continue

                # --- Header line (new command block for a PID) ---
                m = RE_HEADER.match(stripped)
                if m:
                    pid = int(m.group(2))
                    current_pid = pid
                    full_cmd = m.group(7)
                    if pid in by_pid:
                        old = by_pid[pid]
                        # If the existing record already has a lapse time
                        # AND the new header is for a different command,
                        # this is a new command reusing the PID - emit
                        # the old one. Otherwise it's a repeated header
                        # for the same command (triggers, virtual locks).
                        if old.lapse_seconds > 0 and old.args != full_cmd:
                            emit(pid)
                        else:
                            # Same command - just update context
                            continue
                    rec = CommandRecord()
                    try:
                        rec.timestamp = datetime.strptime(
                            m.group(1), '%Y/%m/%d %H:%M:%S'
                        )
                    except ValueError:
                        # Unparseable timestamp: keep the record, leave
                        # timestamp as None.
                        pass
                    rec.pid = pid
                    rec.user = m.group(3)
                    rec.client = m.group(4)
                    rec.ip_address = m.group(5)
                    rec.app = m.group(6)
                    rec.args = full_cmd
                    # command = first word of the command text.
                    parts = full_cmd.split(None, 1)
                    rec.command = parts[0] if parts else full_cmd
                    by_pid[pid] = rec
                    continue

                # --- Completed line ---
                m = RE_COMPLETED.match(stripped)
                if m:
                    pid = int(m.group(2))
                    current_pid = pid
                    # A completed line means the overall command is done.
                    # The detailed stats block follows, so don't emit yet.
                    if pid not in by_pid:
                        by_pid[pid] = CommandRecord(pid=pid)
                    val = float(m.group(3))
                    unit = m.group(4)
                    by_pid[pid].lapse_seconds = val if unit == 's' else val / 1000.0
                    continue

                # --- Tracking lines (--- prefix) ---
                if not stripped.startswith('---'):
                    # Check if it's a header without brackets (e.g., rmt-Journal)
                    # or some other non-tracking line. Try simpler header.
                    continue

                # Determine which PID this tracking line belongs to.
                # Use current_pid context.
                # NOTE(review): tracking lines carry no PID of their own, so
                # attribution relies on the last header/completed PID seen;
                # tightly interleaved output could mis-attribute stats.
                pid = current_pid
                if pid is None or pid not in by_pid:
                    continue
                rec = by_pid[pid]

                # Lapse
                m = RE_LAPSE.match(stripped)
                if m:
                    val = float(m.group(1))
                    unit = m.group(2)
                    lapse = val if unit == 's' else val / 1000.0
                    # The lapse line ends a command block. Record it.
                    rec.lapse_seconds = lapse
                    # Don't emit yet - more stats follow the lapse line
                    continue

                # Memory
                m = RE_MEMORY.match(stripped)
                if m:
                    rec.memory_cmd_mb = float(m.group(1))
                    rec.memory_proc_mb = float(m.group(2))
                    continue

                # RPC
                m = RE_RPC.match(stripped)
                if m:
                    rec.rpc_msgs_in = int(m.group(1))
                    rec.rpc_msgs_out = int(m.group(2))
                    rec.rpc_size_in_mb = float(m.group(3))
                    rec.rpc_size_out_mb = float(m.group(4))
                    # snd/rcv segment is optional in the log format.
                    if m.group(5):
                        rec.rpc_snd_seconds = float(m.group(5))
                        rec.rpc_rcv_seconds = float(m.group(6))
                    continue

                # File totals (skip, just consume the line)
                if RE_FILETOTALS.match(stripped):
                    continue

                # DB table header - sets the table context for subsequent
                # lock/row lines of this PID.
                m = RE_DB_TABLE.match(stripped)
                if m:
                    db_table_ctx[pid] = m.group(1)
                    continue

                # Pages
                m = RE_PAGES.match(stripped)
                if m:
                    rec.total_pages_in += int(m.group(1))
                    continue

                # Locks and rows
                m = RE_LOCKS_ROWS.match(stripped)
                if m:
                    rec.total_rows_get += int(m.group(3))
                    rec.total_rows_pos += int(m.group(4))
                    rec.total_rows_scan += int(m.group(5))
                    rec.total_rows_put += int(m.group(6))
                    rec.total_rows_del += int(m.group(7))
                    continue

                # Total lock times. Per-table totals also feed the max_*
                # fields: for a command touching one table, total == max.
                m = RE_TOTAL_LOCK.match(stripped)
                if m:
                    rw = float(m.group(1))
                    rh = float(m.group(2))
                    ww = float(m.group(3))
                    wh = float(m.group(4))
                    rec.total_read_lock_wait_ms += rw
                    rec.total_read_lock_held_ms += rh
                    rec.total_write_lock_wait_ms += ww
                    rec.total_write_lock_held_ms += wh
                    table = db_table_ctx.get(pid, "unknown")
                    if rh > rec.max_read_lock_held_ms:
                        rec.max_read_lock_held_ms = rh
                    if wh > rec.max_write_lock_held_ms:
                        rec.max_write_lock_held_ms = wh
                        # Remember which table produced the worst write hold.
                        rec.max_lock_table = table
                    if rw > rec.max_read_lock_wait_ms:
                        rec.max_read_lock_wait_ms = rw
                    if ww > rec.max_write_lock_wait_ms:
                        rec.max_write_lock_wait_ms = ww
                    continue

                # Max lock times
                m = RE_MAX_LOCK.match(stripped)
                if m:
                    rw = float(m.group(1))
                    rh = float(m.group(2))
                    ww = float(m.group(3))
                    wh = float(m.group(4))
                    table = db_table_ctx.get(pid, "unknown")
                    if rh > rec.max_read_lock_held_ms:
                        rec.max_read_lock_held_ms = rh
                    if wh > rec.max_write_lock_held_ms:
                        rec.max_write_lock_held_ms = wh
                        rec.max_lock_table = table
                    if rw > rec.max_read_lock_wait_ms:
                        rec.max_read_lock_wait_ms = rw
                    if ww > rec.max_write_lock_wait_ms:
                        rec.max_write_lock_wait_ms = ww
                    continue

                # Peek counts (shared/intent lock times)
                m = RE_PEEK.match(stripped)
                if m:
                    # total wait, total held, max wait, max held
                    total_held = float(m.group(2))
                    max_held = float(m.group(4))
                    rec.total_peek_held_ms += total_held
                    if max_held > rec.max_peek_held_ms:
                        rec.max_peek_held_ms = max_held
                    continue

                # "locks acquired by blocking" - informational, skip
                if stripped.startswith('locks acquired'):
                    continue

                # "lbr" lines, "pages split" lines - skip
                # NOTE(review): the bare 'lapse' prefix here can never match,
                # since we already required a '---' prefix above; unmatched
                # '---' lines fall through harmlessly regardless.
                if stripped.startswith(('--- lbr', '--- pages split',
                                       '--- opens', '--- reads',
                                       '--- flush', '--- rmt-',
                                       'lapse')):
                    continue

    except IOError as e:
        print(f"Error reading log file: {e}", file=sys.stderr)
        sys.exit(1)

    # Emit any remaining in-flight records
    for pid in list(by_pid.keys()):
        emit(pid)

    if not quiet:
        elapsed = time.time() - t0
        print(
            f"\r  Parsed {line_count:,} lines in {elapsed:.1f}s "
            f"-> {len(records):,} completed commands",
            file=sys.stderr
        )

    return records


# ---------------------------------------------------------------------------
# Analysis helpers
# ---------------------------------------------------------------------------

def percentile(values: List[float], pct: float) -> float:
    """Return the pct-th percentile of *values*.

    The list must already be sorted ascending. Uses linear interpolation
    between the two nearest ranks; an empty list yields 0.0.
    """
    if not values:
        return 0.0
    rank = (len(values) - 1) * pct / 100.0
    lo = math.floor(rank)
    hi = math.ceil(rank)
    if lo == hi:
        # Exact rank - no interpolation needed.
        return values[int(rank)]
    # Weighted blend of the two surrounding values.
    return values[lo] * (hi - rank) + values[hi] * (rank - lo)


def format_count(n: int) -> str:
    """Render *n* with thousands separators (e.g. 1234567 -> '1,234,567')."""
    return format(n, ',')


def format_ms(ms: float) -> str:
    """Render a millisecond duration as 'Nms', 'N.Ns', or 'N.Nm'."""
    # Largest unit first: minutes, then seconds, then raw milliseconds.
    if ms >= 60_000:
        return f"{ms / 60_000:.1f}m"
    if ms >= 1000:
        return f"{ms / 1000:.1f}s"
    return f"{ms:.0f}ms"


def round_up_nice(value: float) -> int:
    """Round up to a 'nice' number for limit setting (e.g., 500000, 10000).

    Non-positive input maps to 0; otherwise the value is rounded up to
    the next multiple of half its order of magnitude (minimum step 1).
    """
    if value <= 0:
        return 0
    half_magnitude = (10 ** math.floor(math.log10(value))) // 2
    # For values below 10 the half-magnitude collapses to 0 - fall back
    # to a step of 1 so we still round up to a whole number.
    step = half_magnitude if half_magnitude >= 1 else 1
    return int(math.ceil(value / step) * step)


def truncate_args(args: str, max_len: int = 80) -> str:
    """Shorten *args* to at most *max_len* characters, ending in '...' if cut."""
    return args if len(args) <= max_len else args[:max_len - 3] + "..."


# ---------------------------------------------------------------------------
# Report generation
# ---------------------------------------------------------------------------

def generate_report(
    records: List[CommandRecord],
    target_percentile: float = 99.0,
    top_n: int = 15,
) -> Tuple[Dict[str, int], str]:
    """Analyze records and generate a text report with recommendations.

    Args:
        records: completed-command records from parse_log().
        target_percentile: each limit is set at this percentile of its
            metric, then rounded up to a 'nice' value with a floor.
        top_n: how many blocked commands to list per metric section.

    Returns (recommended_limits_dict, report_text).
    """
    if not records:
        return {}, "No command records found in the log file.\n"

    lines: List[str] = []
    w = lines.append  # shorthand

    # --- Collect metric arrays ---
    scan_rows_list = []
    lock_time_list = []
    memory_list = []
    lapse_list = []

    for r in records:
        scan_rows_list.append(r.total_scan_rows)
        lock_time_list.append(r.max_lock_time_ms)
        memory_list.append(r.memory_cmd_mb)
        lapse_list.append(r.lapse_seconds)

    # percentile() expects pre-sorted input, so sort each metric once here.
    scan_rows_sorted = sorted(scan_rows_list)
    lock_time_sorted = sorted(lock_time_list)
    memory_sorted = sorted(memory_list)
    lapse_sorted = sorted(lapse_list)

    n = len(records)
    tp = target_percentile

    # --- Compute percentiles ---
    pcts = [50, 75, 90, 95, 99, 99.5, 99.9, 100]

    w("=" * 78)
    w("  PERFORCE GROUP LIMITS RECOMMENDER - ANALYSIS REPORT")
    w("=" * 78)
    w("")
    w(f"  Log commands analyzed:  {n:,}")
    # NOTE(review): the time range is printed only when BOTH the first and
    # last records carry timestamps, even though min/max scan all records.
    if records[0].timestamp and records[-1].timestamp:
        t_min = min(r.timestamp for r in records if r.timestamp)
        t_max = max(r.timestamp for r in records if r.timestamp)
        w(f"  Time range:             {t_min} to {t_max}")
    w(f"  Target percentile:      p{tp}")
    w("")

    # --- Command distribution ---
    cmd_counts: Dict[str, int] = defaultdict(int)
    for r in records:
        cmd_counts[r.command] += 1
    top_cmds = sorted(cmd_counts.items(), key=lambda x: -x[1])[:20]

    w("-" * 78)
    w("  COMMAND DISTRIBUTION (top 20)")
    w("-" * 78)
    w(f"  {'Command':<40s} {'Count':>10s} {'%':>7s}")
    w(f"  {'-'*40} {'-'*10} {'-'*7}")
    for cmd, cnt in top_cmds:
        w(f"  {cmd:<40s} {cnt:>10,d} {100*cnt/n:>6.1f}%")
    w("")

    # --- User distribution (top offenders by scan rows) ---
    user_scans: Dict[str, int] = defaultdict(int)
    user_cmds: Dict[str, int] = defaultdict(int)
    for r in records:
        user_scans[r.user] += r.total_scan_rows
        user_cmds[r.user] += 1
    top_scanners = sorted(user_scans.items(), key=lambda x: -x[1])[:15]

    w("-" * 78)
    w("  TOP USERS BY TOTAL SCAN ROWS")
    w("-" * 78)
    w(f"  {'User':<35s} {'Total Scan Rows':>18s} {'Commands':>10s} {'Avg Scan':>12s}")
    w(f"  {'-'*35} {'-'*18} {'-'*10} {'-'*12}")
    for user, total in top_scanners:
        cnt = user_cmds[user]
        avg = total // cnt if cnt else 0
        w(f"  {user:<35s} {total:>18,d} {cnt:>10,d} {avg:>12,d}")
    w("")

    # --- Percentile tables ---
    w("-" * 78)
    w("  METRIC DISTRIBUTIONS")
    w("-" * 78)

    def pct_table(name, sorted_vals, fmt_func, unit):
        # Render one percentile table for a metric.
        # NOTE(review): the 'unit' parameter is currently unused.
        w(f"\n  {name}:")
        w(f"  {'Percentile':>12s}  {'Value':>18s}  {'Commands at/below':>20s}")
        w(f"  {'-'*12}  {'-'*18}  {'-'*20}")
        for p in pcts:
            v = percentile(sorted_vals, p)
            count_at = int(len(sorted_vals) * p / 100)
            w(f"  {'p' + str(p):>12s}  {fmt_func(v):>18s}  {count_at:>18,d}  ")
        w("")

    pct_table(
        "MaxScanRows (total get+pos+scan per command)",
        scan_rows_sorted,
        lambda v: format_count(int(v)),
        "rows"
    )
    pct_table(
        "MaxLockTime (max single-table lock held per command)",
        lock_time_sorted,
        lambda v: format_ms(v),
        "ms"
    )
    pct_table(
        "MaxMemory (cmd memory per command)",
        memory_sorted,
        lambda v: f"{v:.0f} MB",
        "MB"
    )
    pct_table(
        "Lapse Time (for reference - not a group limit)",
        lapse_sorted,
        lambda v: f"{v:.3f}s" if v < 10 else f"{v:.1f}s",
        "s"
    )

    # --- Compute recommendations ---
    rec_scan = percentile(scan_rows_sorted, tp)
    rec_lock = percentile(lock_time_sorted, tp)
    rec_mem = percentile(memory_sorted, tp)

    # Round up to nice numbers, with sanity floors so a quiet log never
    # produces limits too low for normal operation.
    rec_scan_nice = max(round_up_nice(rec_scan), 100_000)
    rec_lock_nice = max(round_up_nice(rec_lock), 30_000)
    rec_mem_nice = max(round_up_nice(rec_mem), 2048)

    # Also compute MaxResults as total_rows_get+pos (results returned)
    results_list = sorted(r.total_rows_get + r.total_rows_pos for r in records)
    rec_results = percentile(results_list, tp)
    rec_results_nice = max(round_up_nice(rec_results), 50_000)

    recommendations = {
        'MaxResults': rec_results_nice,
        'MaxScanRows': rec_scan_nice,
        'MaxLockTime': rec_lock_nice,
        'MaxMemory': rec_mem_nice,
    }

    w("=" * 78)
    w("  RECOMMENDED GROUP LIMITS")
    w("=" * 78)
    w("")
    w(f"  Based on p{tp} of {n:,} commands:")
    w("")
    w(f"  {'Limit':<20s} {'p{0} Value'.format(tp):>18s} {'Recommended':>18s} {'Rationale'}")
    w(f"  {'-'*20} {'-'*18} {'-'*18} {'-'*30}")
    w(f"  {'MaxResults':<20s} {format_count(int(rec_results)):>18s} "
      f"{format_count(rec_results_nice):>18s} "
      f"rows returned (get+pos)")
    w(f"  {'MaxScanRows':<20s} {format_count(int(rec_scan)):>18s} "
      f"{format_count(rec_scan_nice):>18s} "
      f"rows scanned (get+pos+scan)")
    w(f"  {'MaxLockTime':<20s} {format_ms(rec_lock):>18s} "
      f"{format_count(rec_lock_nice):>18s} "
      f"ms max lock held")
    # Manual padding keeps the MaxMemory row's columns aligned with the rest.
    w(f"  {'MaxMemory':<20s} {rec_mem:.0f} MB{' '*(14 - len(f'{rec_mem:.0f}'))} "
      f"{format_count(rec_mem_nice):>18s} "
      f"MB cmd memory")
    w("")

    w("  p4 group spec format:")
    w("  ----------------------")
    w(f"  MaxResults:    {rec_results_nice}")
    w(f"  MaxScanRows:   {rec_scan_nice}")
    w(f"  MaxLockTime:   {rec_lock_nice}")
    w(f"  MaxMemory:     {rec_mem_nice}")
    w("")

    # --- Commands that WOULD be blocked ---
    blocked_scan = [r for r in records if r.total_scan_rows > rec_scan_nice]
    blocked_lock = [r for r in records
                    if r.max_lock_time_ms > rec_lock_nice]
    blocked_mem = [r for r in records if r.memory_cmd_mb > rec_mem_nice]
    blocked_results = [r for r in records
                       if (r.total_rows_get + r.total_rows_pos) > rec_results_nice]

    # Union of all blocked. id() is used because CommandRecord is not
    # hashable and one command can exceed several limits at once.
    blocked_ids = set()
    for r in blocked_scan + blocked_lock + blocked_mem + blocked_results:
        blocked_ids.add(id(r))
    total_blocked = len(blocked_ids)

    w("=" * 78)
    w("  COMMANDS THAT WOULD BE BLOCKED")
    w("=" * 78)
    w("")
    w(f"  Total commands blocked: {total_blocked:,} "
      f"of {n:,} ({100*total_blocked/n:.2f}%)")
    w(f"    - Blocked by MaxScanRows:  {len(blocked_scan):,}")
    w(f"    - Blocked by MaxLockTime:  {len(blocked_lock):,}")
    w(f"    - Blocked by MaxMemory:    {len(blocked_mem):,}")
    w(f"    - Blocked by MaxResults:   {len(blocked_results):,}")
    w("")

    def print_blocked(title, blocked, sort_key, fmt_value):
        # Render the worst offenders for one limit, sorted by metric value.
        if not blocked:
            return
        w(f"  --- {title} (top {min(top_n, len(blocked))}) ---")
        w(f"  {'User':<25s} {'Value':>14s} {'Lapse':>8s} {'Command'}")
        w(f"  {'-'*25} {'-'*14} {'-'*8} {'-'*50}")
        for r in sorted(blocked, key=sort_key, reverse=True)[:top_n]:
            lapse_str = f"{r.lapse_seconds:.1f}s" if r.lapse_seconds < 1000 \
                else f"{r.lapse_seconds:.0f}s"
            w(f"  {r.user:<25s} {fmt_value(r):>14s} "
              f"{lapse_str:>8s} {truncate_args(r.args)}")
        w("")

    print_blocked(
        f"Blocked by MaxScanRows > {format_count(rec_scan_nice)}",
        blocked_scan,
        lambda r: r.total_scan_rows,
        lambda r: format_count(r.total_scan_rows)
    )
    print_blocked(
        f"Blocked by MaxLockTime > {format_count(rec_lock_nice)} ms",
        blocked_lock,
        lambda r: r.max_lock_time_ms,
        lambda r: format_ms(r.max_lock_time_ms)
    )
    print_blocked(
        f"Blocked by MaxMemory > {rec_mem_nice} MB",
        blocked_mem,
        lambda r: r.memory_cmd_mb,
        lambda r: f"{r.memory_cmd_mb:.0f} MB"
    )
    print_blocked(
        f"Blocked by MaxResults > {format_count(rec_results_nice)}",
        blocked_results,
        lambda r: r.total_rows_get + r.total_rows_pos,
        lambda r: format_count(r.total_rows_get + r.total_rows_pos)
    )

    # --- Patterns to investigate ---
    w("=" * 78)
    w("  PATTERNS TO INVESTIGATE")
    w("=" * 78)
    w("")

    # Find commands with wildcard patterns that scan heavily
    # NOTE(review): currently groups only by command verb, not by the
    # wildcard pattern in the arguments.
    wildcard_cmds: Dict[str, List[CommandRecord]] = defaultdict(list)
    for r in blocked_scan:
        # Extract the base command pattern
        pattern = r.command
        wildcard_cmds[pattern].append(r)

    if wildcard_cmds:
        w("  Commands frequently exceeding MaxScanRows:")
        w(f"  {'Command':<35s} {'Count':>8s} {'Avg Scan Rows':>16s} "
          f"{'Max Scan Rows':>16s}")
        w(f"  {'-'*35} {'-'*8} {'-'*16} {'-'*16}")
        for cmd, recs in sorted(wildcard_cmds.items(),
                                 key=lambda x: -len(x[1])):
            avg = sum(r.total_scan_rows for r in recs) // len(recs)
            mx = max(r.total_scan_rows for r in recs)
            w(f"  {cmd:<35s} {len(recs):>8,d} "
              f"{format_count(avg):>16s} {format_count(mx):>16s}")
        w("")

    # Service accounts vs interactive users
    # NOTE(review): the 'svc' prefix alone already covers the other svc*
    # prefixes listed here; the extra entries are redundant but harmless.
    svc_blocked = [r for r in records
                   if id(r) in blocked_ids
                   and r.user.lower().startswith(('svc', 'svcrpl', 'svcbld',
                                                   'svcdev', 'svcmon',
                                                   'svcswr'))]
    user_blocked = [r for r in records
                    if id(r) in blocked_ids
                    and not r.user.lower().startswith(('svc', 'svcrpl',
                                                       'svcbld', 'svcdev',
                                                       'svcmon', 'svcswr'))]

    w(f"  Blocked commands by account type:")
    w(f"    Service accounts (svc*):    {len(svc_blocked):,}")
    w(f"    Interactive users:          {len(user_blocked):,}")
    w("")

    if user_blocked:
        w("  Interactive users with blocked commands:")
        user_blocked_counts: Dict[str, int] = defaultdict(int)
        for r in user_blocked:
            user_blocked_counts[r.user] += 1
        for user, cnt in sorted(user_blocked_counts.items(),
                                 key=lambda x: -x[1])[:15]:
            w(f"    {user:<35s} {cnt:>6,d} commands blocked")
        w("")

    # --- Service account recommendations ---
    w("=" * 78)
    w("  GROUP CONFIGURATION RECOMMENDATIONS")
    w("=" * 78)
    w("")
    w("  1. Create a 'limits' group with the recommended limits above")
    w("     and add ALL users to it (use update_limits.py).")
    w("")
    w("  2. Create a 'no_limits' group with 'unlimited' for each field")
    w("     and add trusted service accounts that need higher limits:")
    w("       - Replication service accounts (svcrpl_*)")
    w("       - Build/automation accounts that legitimately need broad access")
    w("       - Swarm/review service accounts")
    w("")
    w("  3. For build service accounts that are blocked but should work,")
    w("     consider an intermediate 'build_limits' group with higher")
    w("     MaxScanRows and MaxLockTime.")
    w("")
    w("  4. Review the 'Patterns to Investigate' section above for")
    w("     commands that should be optimized rather than given higher limits")
    w("     (e.g., broad wildcard depot paths, unfiltered changelists).")
    w("")
    w("  Example p4 group specs:")
    w("")
    w("  --- 'limits' group (all users) ---")
    w(f"  MaxResults:    {rec_results_nice}")
    w(f"  MaxScanRows:   {rec_scan_nice}")
    w(f"  MaxLockTime:   {rec_lock_nice}")
    w(f"  MaxMemory:     {rec_mem_nice}")
    w("")
    w("  --- 'no_limits' group (trusted service accounts) ---")
    w("  MaxResults:    unlimited")
    w("  MaxScanRows:   unlimited")
    w("  MaxLockTime:   unlimited")
    w("  MaxMemory:     unlimited")
    w("")

    return recommendations, '\n'.join(lines)


# ---------------------------------------------------------------------------
# CSV export
# ---------------------------------------------------------------------------

def export_csv(records: List[CommandRecord], filepath: str):
    """Export per-command data to CSV for external analysis.

    Writes one row per CommandRecord. Every name in ``fieldnames`` is an
    attribute (dataclass field or computed @property such as
    total_scan_rows / max_lock_time_ms) of CommandRecord, so rows are
    built generically with getattr instead of a hand-maintained literal
    dict that could drift out of sync with the header.

    Args:
        records: records to export, one CSV row each.
        filepath: destination path; overwritten if it exists.
    """
    fieldnames = [
        'timestamp', 'pid', 'user', 'client', 'ip_address', 'app',
        'command', 'args', 'lapse_seconds',
        'memory_cmd_mb', 'memory_proc_mb',
        'rpc_msgs_in', 'rpc_msgs_out',
        'rpc_size_in_mb', 'rpc_size_out_mb',
        'rpc_snd_seconds', 'rpc_rcv_seconds',
        'total_scan_rows', 'total_rows_get', 'total_rows_pos',
        'total_rows_scan', 'total_rows_put', 'total_rows_del',
        'total_read_lock_wait_ms', 'total_read_lock_held_ms',
        'total_write_lock_wait_ms', 'total_write_lock_held_ms',
        'max_read_lock_held_ms', 'max_write_lock_held_ms',
        'max_peek_held_ms', 'max_lock_time_ms',
        'max_lock_table', 'total_pages_in',
    ]
    # Explicit encoding: the previous locale-dependent default could
    # produce non-portable files (user/client names may be non-ASCII).
    with open(filepath, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for r in records:
            row = {name: getattr(r, name) for name in fieldnames}
            # timestamp is a datetime (or None when parsing failed) and
            # needs explicit ISO formatting.
            row['timestamp'] = r.timestamp.isoformat() if r.timestamp else ''
            writer.writerow(row)


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    """CLI entry point: parse options, analyze the log, print the report."""
    ap = argparse.ArgumentParser(
        description='Analyze a Perforce server log and recommend group limits.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python recommend_limits.py server.log
  python recommend_limits.py server.log --percentile 95 --csv data.csv
  python recommend_limits.py server.log --sample 10000000 --quiet
"""
    )
    ap.add_argument('logfile', help='Path to the Perforce server log file')
    ap.add_argument('--percentile', type=float, default=99.0,
                    help='Target percentile for limits (default: 99)')
    ap.add_argument('--top', type=int, default=15,
                    help='Show top N blocked commands per metric (default: 15)')
    ap.add_argument('--csv', metavar='FILE', default=None,
                    help='Export per-command data to CSV')
    ap.add_argument('--sample', type=int, default=0,
                    help='Only parse first N lines (0 = all)')
    ap.add_argument('--quiet', action='store_true',
                    help='Suppress progress messages')
    opts = ap.parse_args()

    if not opts.quiet:
        print(f"Parsing {opts.logfile} ...", file=sys.stderr)

    records = parse_log(opts.logfile, max_lines=opts.sample, quiet=opts.quiet)

    # An empty result set means the log had no completed command blocks;
    # there is nothing to analyze, so exit non-zero.
    if not records:
        print("No completed commands found in the log file.", file=sys.stderr)
        sys.exit(1)

    if opts.csv:
        if not opts.quiet:
            print(f"Exporting {len(records):,} records to {opts.csv} ...",
                  file=sys.stderr)
        export_csv(records, opts.csv)

    # Only the text report is printed here; the recommendations dict is
    # discarded by this entry point.
    _, report = generate_report(
        records,
        target_percentile=opts.percentile,
        top_n=opts.top,
    )
    print(report)


if __name__ == '__main__':
    main()
# Change User Description Committed
#1 32517 Russell C. Jackson (Rusty) New log analysis files.