#!/usr/bin/perl -w
#
# lockhist - report transactions with significant time spent holding
# (or optionally waiting for) locks
#
# Dan Bloch - dbloch@google.com
### Miles O'Neal <miles.oneal@cirrus.com>
#
# Copyright (C) 2007 Google Inc.
### Changes labeled CLI copyright (C) 2016 Cirrus Logic, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#---------- CONSTANTS ----------
#
# The location of today's server log. Used if no log is specified on the
# command line or with the -z (tail) flag.
#
$TODAYS_LOG = "/p4/1/logs/log";
#
# The location of yesterday's server log. Used with the -y flag.
# Comment out this line to remove the -y flag if you don't rotate your log.
#
#--$YESTERDAYS_LOG = "/p4/logs/old/errorlog.1.gz";
#
# The number of lines at the end of the error log to process in -z (tail)
# mode. This can be tuned depending on your server's activity and how much
# tail information you want to see.
#
$TAIL_SIZE = 100000;
#
# The lock time threshold in seconds above which commands will be
# reported. Can be overridden with -t flag.
#
$THRESHOLD = 60;
#----------------------------------------------------------------------
$options = "t:yz:kKw";
$options =~ s/y// if ! $YESTERDAYS_LOG;
($_options = $options) =~ s/://g;
($prog = $0) =~ s#.*[/\\]##;
### Start CLI mod
$usage = "usage: $prog [-$_options] [<logfile> ...]
-t <seconds> : lock times greater than <seconds> seconds only (default=$THRESHOLD)
-y : use yesterday's logfile
-z <num_lines> : use last <num_lines> lines of current log (default=$TAIL_SIZE)
-k : only show processes killed by the server (MaxScanRows, etc.)
-K : exclude processes killed by administrator (signal 15)
-w : include processes with wait time but no time holding locks
By default, the current logfile is used.
";
### End CLI mod
$usage =~ s/^.*yesterday.*\n$//m if ! $YESTERDAYS_LOG;
die $usage if @ARGV && $ARGV[0] =~ /^(-?help|-\?+)$/i;
use Getopt::Std;
getopts($options) || die $usage;
$SHOW_KILLED_ONLY = 1, if defined $opt_k;
$IGNORE_KILLED_BY_SIGNAL = 1 if defined $opt_K;
$PRINT_WAITERS = 1 if defined $opt_w;
$THRESHOLD = $opt_t if defined $opt_t;
$YESTERDAY = 1 if defined $opt_y;
if (defined $opt_z) {
$TAIL = 1;
$TAIL_SIZE = $opt_z;
}
our($opt_k, $opt_K, $opt_w, $opt_y, $opt_z);
push(@INC, $1) if ($0 =~ /(.*)\//);
$THRESHOLD *= 1000; # seconds -> milliseconds
$IGNORE_KILLED_BY_SIGNAL = 1 if $SHOW_KILLED_ONLY;
#++ If we found a db line and subsequent lock time record, print and set
#++ $db to none. If we find a "--- client" record, set $db to none. This
#++ is to work around a bug induced by a log format change somewhere in
#++ a 2014.2 or later release. -Miles @Cirrus Logic
$record = "";
while (log_open()) {
while (<LOGFILE>) {
# process start
# 2005/12/19 10:03:55 pid 32356 foo@bar 10.253.8.73 [p4v] 'user-client
if (/^\t\d\d\d\d\/\d\d\/\d\d (\d\d:\d\d:\d\d).* '(user|dm)-/) {
possiblyPrintRecord($record) if $print_record; # previous process
undef $print_record;
($record = $_) =~ s/^\t/\n/;
} elsif (/^--- killed /) {
$record .= $_;
### Start CLI mod
} elsif (/^--- client/) {
$db = '<none>';
} elsif (/^$/) {
$db = '<none>';
### End CLI mod
} elsif (/^--- (db.*)/) {
# --- db.have
$db = $1;
} elsif (/^--- +(?:locks|total lock) +wait.held +read.write +(\d+)ms\+(\d+)ms\/(\d+)ms\+(\d+)ms/) {
### Start CLI mod
if ($db ne '<none>') {
### End CLI mod
# 2006.2:
# --- locks wait+held read/write 0ms+0ms/0ms+351007ms
# 2007.2:
# --- total lock wait+held read/write 0ms+0ms/0ms+822ms
# --- max lock wait+held read/write 0ms+0ms/0ms+822ms
# XXX This is a hack. Should be using "max" if it's present.
# In practice, it doesn't appear to make much difference.
# 2010.2:
# --- total lock wait+held read/write 0ms+9524ms/0ms+0ms
# XXX They added a aspace. Sheesh. Check for multiple spaces anywhere there might be one.
($readwait, $read, $writewait, $write) = ($1, $2, $3, $4);
$wait = $readwait + $writewait;
$print_record = 1 if $read >= $THRESHOLD || $write >= $THRESHOLD
|| ($wait >= $THRESHOLD && $PRINT_WAITERS);
$record .= " $db:";
$record .= " read: " . $read/1000 if $read > 1000; # 1 sec
$record .= " WRITE: " . $write/1000 if $write > 1000; # ""
$record .= " (wait: " . $wait/1000 . ")" if $wait > 1000; # ""
$record .= "\n";
### Start CLI mod
$db = '<none>';
}
### End CLI mod
# TO DO: more info would be better :-)
} elsif (/Process (\d+) exited on a signal \d+!/
&& ! $IGNORE_KILLED_BY_SIGNAL) {
print "\n$_";
}
}
}
possiblyPrintRecord($record) if $print_record;
if ($total_locked) {
$total_locked = int $total_locked/1000; # milliseconds -> seconds
print "\n$nlockers commands holding locks for a total of "
. "$total_locked seconds\n"; # is this actually useful?
# read locks can overlap, but assume it's uncommon
} else {
$THRESHOLD /= 1000; # milliseconds -> seconds
print "[no commands holding locks > $THRESHOLD seconds]\n";
}
exit 0;
#
# possiblyPrintRecord($record) - print record if it qualifies
#
# We call this function if some lock is held for more than $THRESHOLD,
# but that may have just been because the process was waiting on another
# process, in which case we don't want to report this one. Try to only
# display processes with significant non-wait lock
#
sub BEGIN { # block for static variables
my $last_printed = "";
my $last_non_wait;
sub possiblyPrintRecord {
my($record) = @_;
return if ! $record; # called with empty arg first time
return if $record eq $last_printed; # log sometimes repeats info
return if $SHOW_KILLED_ONLY && $record !~ /^--- killed/m;
print($record), $last_printed = $record, return if $PRINT_WAITERS;
return if ! hasSignificantLockTime($record);
$nlockers++;
$total_locked += $last_non_wait;
print $record;
$last_printed = $record;
}
#
# hasSignificantLockTime($record) - return true/false
#
# Algorithm is heuristic. Heuristic is to look at non-wait time on last
# lock OR, as of 2007.3, to check db.integed time (really db.integed -
# db.resolve wait) for dm-CommitSubmits. See additional discussion below.
#
# Record contains the process start line, and lock records as
# db.rev: read: 17.691
# db.rev: WRITE: 57.365 (wait: 5.054)
# db.rev: (wait: 57.2)
#
sub hasSignificantLockTime {
my($record) = @_;
# In 2007.2, Perforce has started dropping locks when commands are
# done with them, instead of dropping them all at once. So far,
# AFAIK this effects only dm-CommitSubmit, where db.counters and
# db.integed are held much longer than the rest of the locks.
# We look at db.integed because (1) it's acquired after db.counters,
# so db.counters is (slightly) more likely to include wait time, and
# (2) in 2007.3 db.counters will also be dropped before db.integed.
#
# Per Perforce support call #1823614, this time can include time
# spent waiting for db.resolve to be released by another command,
# so we subtract that. It will be shown separately if it has
# significant lock time of it's own.
#
if ($record =~ /'dm-CommitSubmit'/ && $record =~ /db.integed: WRITE: ([\d.]+)/) {
$last_non_wait = $1;
# now subtract db.resolve wait time
$last_non_wait -= $1 if $record =~ /db.resolve: .*wait: ([\d.]+)/;
$last_non_wait *= 1000;
return 1 if $last_non_wait >= $THRESHOLD;
}
# For anything besides dm-CommitSubmit, lock time is considered
# significant if in addition to the longest time being > $THRESHOLD
# (we wouldn't be in this function otherwise), the last non-wait time
# is more than a third of the total time (5 seconds for the default
# 15-second threshold).
#
my $hasSignificantLockTime = 0;
if ($record =~ /db.\w+: \w+: ([\d.]+).*$/) { # has non-wait time
$last_non_wait = $1;
$last_non_wait *= 1000; # seconds -> milliseconds
$hasSignificantLockTime = $last_non_wait >= $THRESHOLD/3;
}
# If the lock we looked at was on db.monitor, try again without it.
# Locks on db.monitor are taken while no other locks are held, so
# the other locks should be considered without it. (Visible locks
# on db.monitor are rare, but it has happened.)
if (! $hasSignificantLockTime && $record =~ /^\s*db.monitor:/m) {
$record =~ s/^\s*db.monitor:.*\n//m;
$hasSignificantLockTime = hasSignificantLockTime($record);
}
$hasSignificantLockTime;
}
} # end block for static variables
#
# log_open() - figure out log files based on @ARGV. Open on filehandle
# LOGFILE and return filename. Each call to log_open opens
# next file on list. Last call returns undef.
#
sub BEGIN { # block for static variables
my $first_time = 1;
my @argv;
sub log_open {
if ($first_time) {
$first_time = 0;
@argv = @ARGV;
# Handle request to tail log. Note return from within block.
# This is a bit of a hack. The second time through, @argv is
# empty so function will just return.
if ($TAIL) {
die "-z cannot be used if filenames are specified\n" if @ARGV;
open(LOGFILE, "tail -$TAIL_SIZE $TODAYS_LOG|") || die "couldn't tail $TODAYS_LOG: $!\n";
print_log_time();
return "tail $TODAYS_LOG";
}
# handle request for yesterday's log
if ($YESTERDAY) {
die "-y cannot be used if filenames are specified\n" if @ARGV;
@argv = ($YESTERDAYS_LOG);
}
if (! @argv) {
if (-t) { # not a pipe, use today's log
@argv = ($TODAYS_LOG);
} else { # pipe
*LOGFILE= *STDIN;
return "stdin";
}
}
} else {
close LOGFILE;
}
my $file;
if ($file = shift @argv) {
open(LOGFILE, $file) || die "couldn't open $file: $!\n";
if ($file =~ /\.gz$/) {
# previous open is useful to report errors
open(LOGFILE, "zcat $file|") || die "couldn't zcat $file: $!\n";
}
}
$file;
}
} # end block for static variables
sub print_log_time {
my($yymmdd, $hhmmss);
while (<LOGFILE>) {
($yymmdd, $hhmmss) = ($1, $2), last if /^\t([\d\/]+) ([\d:]+) /;
}
die("no date found\n") if ! $yymmdd;
print "[starting at: $yymmdd $hhmmss]\n";
}
### vi:set ts=4: