#!/usr/bin/python # p4clog.py [ options ] < p4d-log-file > tables.csv # # $Id: //guest/roadkills_r_us/log-analyzer/psla/p4clog.py#1 $ # # -s - submit only # -c - show locking by command # -d - show delay analysis # -h - show lock held histogram # -i - interval for histogram instead of 600 # -w - show wait vs held analysis # -v - log page I/Os, wait/hold more than 100ms, delays # import sys, re, time, getopt, math # logParse - parse a line of p4d log tracking class logParse: def __init__( p ): # what we're parsing: # 2013/01/17 22:01:11 pid 28322 u@c 10.0.50.39/10.0.0.40 [p4tter/v72 (brokered)] 'command args' # --- db.rev # --- pages in+out+cached 222520+0+96 # --- locks read/write 1/0 rows get+pos+scan put+del 0+1+7372595 0+0 # --- total lock wait+held read/write 0ms+9578ms/0ms+0ms # --- db.change # --- total lock wait+held read/write 0ms+9583ms/0ms+0ms p.pa = re.compile( "^\t(..../../.. ..:..:..) pid [^']*'([^ ']*)([^']*)" ) p.pb = re.compile( "^--- (db.*)$" ) p.pc = re.compile( "^--- pages in\+out\+cached ([^+]*)\+([^+]*)\+(.*)$" ) p.pd = re.compile( "^--- locks .* put\+del ([^+]*)\+([^+]*)\+([^ ]*) ([^+]*)\+(.*)$" ) p.pe = re.compile( "^--- total .* read/write ([^m]*)ms\+([^m]*)ms/([^m]*)ms\+([^m]*)ms$" ) p.pf = re.compile( "^$" ) p.reset() def reset( p ): p.date = p.command = p.args = p.table = "" # parse a single line of log def parse( p, line ): p.lineType = "none" ll = len( line ) if( line[0] == "\n" ): p.lineType = "end" return if( ll > 5 and line[5] == "/" ): m = p.pa.search( line ) if( m ): p.date,p.command,p.args = m.group(1,2,3) p.lineType = "command" return if( ll > 4 and line[4] == "d" ): m = p.pb.search( line ) if( m ): p.table = m.group(1) return if( ll > 6 and line[6] == "t" ): m = p.pe.search( line ) if( m ): p.lrw,p.lrh,p.lww,p.lwh = map(int,m.group(1,2,3,4)) p.lineType = "locks" return if( ll > 6 and line[6] == "p" ): m = p.pc.search( line ) if( m ): p.pgi,p.pgo,p.pgc = map(int,m.group(1,2,3)) p.lineType = "pages" p.isPages = True return return # don't use these yet if( ll > 6 and line[6] == "l" ): m = p.pd.search( line ) if( m ): rgt,rpo,rsc,rpt,rdl = map(int,m.group(1,2,3,4,5)) return # tableCounts - read/write wait/held time counter class tableCounts: def __init__( s, lrw, lrh, lww, lwh ): s.readWaitCount = lrw > 0; s.readWaitTime = lrw s.readHeldCount = lrh > 0; s.readHeldTime = lrh s.writeWaitCount = lww > 0; s.writeWaitTime = lww s.writeHeldCount = lwh > 0; s.writeHeldTime = lwh def __iadd__( s, o ): s.readHeldCount += o.readHeldCount s.readHeldTime += o.readHeldTime s.readWaitCount += o.readWaitCount s.readWaitTime += o.readWaitTime s.writeHeldCount += o.writeHeldCount s.writeHeldTime += o.writeHeldTime s.writeWaitCount += o.writeWaitCount s.writeWaitTime += o.writeWaitTime return s def __ior__( s, o ): # add waits, but max of held s.readWaitTime += o.readWaitTime s.writeWaitTime += o.writeWaitTime if( o.readHeldTime > s.readHeldTime ): s.readHeldTime = o.readHeldTime if( o.writeHeldTime > s.writeHeldTime ): s.writeHeldTime = o.writeHeldTime return s # Spanner - track how long locks are held for one db file class Spanner: # per interval result class Result: def __init__( r, time, secsHeld, secsFree, longestHeld ): r.time = time r.secsHeld = secsHeld r.secsFree = secsFree r.longestHeld = longestHeld # when the log file begins epoch = 0 def __init__( s ): # number of locks held in each second # since log file begins (gets long!) s.byTheSeconds = [] s.byTheInterval = [] # translate log file date into seconds since epoch def secs( s, date ): try: t = int( time.mktime( time.strptime( date, "%Y/%m/%d %H:%M:%S" ) ) ) if( Spanner.epoch == 0 ): Spanner.epoch = t return t - Spanner.epoch except ValueError: print >> sys.stderr, "bad date %s\n" % date, return 0 # process a single lock hold def span( s, start, held ): # translate start/held to start/end in second since epoch heldStart = s.secs( start ) heldEnd = heldStart + ( held + 500 ) / 1000 # sometime the log file is out of order if( heldStart < 0 or heldStart == heldEnd ): return # dynamically extend array if( len( s.byTheSeconds ) <= heldEnd ): s.byTheSeconds.extend( [ 0 ] * ( heldEnd + 1 - len( s.byTheSeconds ) ) ) # count this as an instance this lock was held this second for x in range( heldStart, heldEnd ): s.byTheSeconds[ x ] += 1 # condense into intervals def condense( s, interval ): lastFree = -1 longestHeld = 0 # for each 'interval' seconds # find the time held/free/longest # and dump out a single line for x in range( 0, len( s.byTheSeconds ), interval ): secsHeld = 0 secsFree = interval for y in range( x, min( x + interval, len( s.byTheSeconds ) ) ): sex = s.byTheSeconds[ y ] secsHeld += sex secsFree -= sex == 0 if( sex == 0 ): lastFree = y elif( y - lastFree > longestHeld ): longestHeld = y - lastFree s.byTheInterval.append( s.Result( Spanner.epoch + x, secsHeld, secsFree, longestHeld ) ) longestHeld = 0 # dump one table's locks def dump( s, t, interval ): s.condense( interval ) if( len( s.byTheInterval ) == 0 ): return print ">>> %s saturation each %d seconds" % ( t, interval ) print "%s,%s,%s,%s" % ( "time", "not free", "held", "longest held" ) # for each 'interval' seconds # find the time held/free/longest # and dump out a single line for x in s.byTheInterval: print "%s,%d,%d,%d" % ( time.ctime( x.time ), x.secsFree, x.secsHeld, x.longestHeld ) # LocksByTime: one spanner for each table class LocksByTime: def __init__( s ): s.tableSpanner = {} def tally( s, commandTables, date ): for t in commandTables: if( not t in s.tableSpanner ): s.tableSpanner[ t ] = Spanner() s.tableSpanner[ t ].span( date, commandTables[ t ].writeHeldTime + commandTables[ t ].readHeldTime ) def dump( s, spannerInterval ): for t in s.tableSpanner: s.tableSpanner[ t ].dump( t, spannerInterval ) # # WaitVsHeld -- calculate how long locks are held waiting for others # # .tally() - given lock+held times for all tables used by a command's # processing, calculate which tables are holding up others # # .dump() - dump out all wait vs held times for tables held more than # 1 seconds total # class WaitVsHeld: def __init__( s ): s.table = {} def tally( s, commandTables, showWaitVsHeld, count, command ): for t in commandTables: # wait vs held held = commandTables[ t ].writeHeldTime if( held == 0 ): continue for t2 in commandTables: wait = commandTables[ t2 ].writeWaitTime if( t == t2 or wait > held or wait <= held / 2 ): continue if( not t in s.table ): s.table[ t ] = {} if( not t2 in s.table[ t ] ): s.table[ t ][ t2 ] = tableCounts( 0, 0, 0, 0 ) s.table[ t ][ t2 ] += commandTables[ t2 ] # for interest sake if( showWaitVsHeld ): print count, "held", t, held, "wait", t2, wait, command def dump( s, totalTables ): print ">>> Wait times causing hold times" print "%s,%s,%s,%s,%s,%s" % ( "table", "write-wait-Time", "read-wait-Time", "table", "write-held-Time", "read-held-Time" ) for t in s.table: tc = totalTables[ t ] for t2 in s.table[ t ]: tg = s.table[ t ][ t2 ] if( tg.writeWaitTime > 1000 and tc.writeHeldTime > 1000 or tg.readWaitTime > 1000 and tc.readHeldTime > 1000 ): print "%s,%d,%d,%s,%d,%d" % ( t2, tg.writeWaitTime / 1000, tg.readWaitTime / 1000, t, tc.writeHeldTime / 1000, tc.readHeldTime / 1000 ) # # TimesByCommand -- calculate read+write hold time for each command # # .tally() - add a single read+write hold to a command's total # .dump() - dump out each command's total # class TimesByCommand: def __init__( s ): s.byCommand = {} def tally( s, table, command, tc ): # by table then command if( not table in s.byCommand ): s.byCommand[ table ] = {} if( not command in s.byCommand[ table ] ): s.byCommand[ table ][ command ] = tableCounts( 0, 0, 0, 0 ) s.byCommand[ table ][ command ] += tc def dump( s ): print ">>> Table hold times by command" print "%s,%s,%s,%s" % ( "table", "command", "read-held-Time", "write-held-Time" ) for t in s.byCommand: for c in s.byCommand[ t ]: tt = s.byCommand[ t ][ c ] if( tt.readHeldTime >= 1000 or tt.writeHeldTime >= 1000 ): print "%s,%s,%d,%d" % ( t, c, tt.readHeldTime/1000, tt.writeHeldTime/1000 ) # # TimesByTable -- calculate read+write/wait+hold time for each table # # .tally() - add a single read+write/wait+hold to a table's total # .dump() - dump out each table's tally, using a grandTotal for %age # class TimesByTable: def __init__( s ): s.byTable = {} def tally( s, table, tc ): if( not table in s.byTable ): s.byTable[ table ] = tableCounts( 0, 0, 0, 0 ) s.byTable[ table ] += tc def dump( s, grandTotal ): grandTotal += tableCounts( 1, 1, 1, 1 ) # no divide by zero errors print ">>> Hold and wait times per table" print "%s,%s,%s,%s,%s,%s,%s" % ( "table", "read-wait-Time", "%-of-Total", "read-held-Time", "write-wait-Time", "%-of-Total", "write-held-Time" ) # sort descending by readWaitTime for t,tc in sorted( s.byTable.iteritems(), key= lambda (k,v): -v.readWaitTime ): print "%s,%d,%d%%,%d,%d,%d%%,%d" % ( t, tc.readWaitTime/1000, tc.readWaitTime * 100 / grandTotal.readWaitTime, tc.readHeldTime/1000, tc.writeWaitTime/1000, tc.writeWaitTime * 100 / grandTotal.writeWaitTime, tc.writeHeldTime/1000 ) # # WaitDelays - how much commands are delayed by wait times # class WaitDelays: def __init__( s ): s.maxSlots = 10 s.power = 2 s.delaySlots = [ 0 ] * s.maxSlots def tally( s, commandTotal, logDelays, count, command ): wait = commandTotal.readWaitTime + commandTotal.writeWaitTime held = commandTotal.readHeldTime + commandTotal.writeHeldTime if( wait < 1000 or wait < held / 10 ): return if( logDelays ): print count, "commandWait", wait, "commandHeld", held, command slot = int( math.log( wait / 1000, s.power ) ) if( slot >= s.maxSlots ): slot = s.maxSlots-1 s.delaySlots[ slot ] += 1 def dump( s ): print ">>> Commands delayed" print "%s,%s" % ( "Seconds of delay", "Commands affected" ) for x in range( 0, s.maxSlots ): print "%d,%d" % ( s.power**x, s.delaySlots[ x ] ) def main(): # process args showByCommand = False showDelays = False showWaitVsHeld = False showHisto = False showTally = False spannerInterval = 600 submitOnly = False verboseLogging = False try: opts, args = getopt.getopt( sys.argv[1:], "acdhi:stwv" ) except getopt.error, msg: print msg print "Usage: p4clog.py [ -v ] [ -acdhswt ] [ -i interval ] < log-file" sys.exit(2) for o, a in opts: if o == '-h': showHisto = True elif o == '-s': submitOnly = True elif o == '-i': spannerInterval = int(a) elif o == '-c': showByCommand = True elif o == '-d': showDelays = True elif o == '-t': showTally = True elif o == '-a': showHisto = showByCommand = showDelays = showTally = showWaitVsHeld = True elif o == '-w': showWaitVsHeld = True elif o == '-v': verboseLogging = True # per table counts of held and wait times lineCount = 0 commandCount = 0 pagesIn = 0 pagesOut = 0 firstDate = 0 lastDate = 0 p = logParse() commandTables = {} timesByTable = TimesByTable() timesByCommand = TimesByCommand() waitVsHeld = WaitVsHeld() locksByTime = LocksByTime() waitDelays = WaitDelays() commandTotal = tableCounts( 0, 0, 0, 0 ) grandTotal = tableCounts( 0, 0, 0, 0 ) for line in sys.stdin: lineCount += 1 if( lineCount % 1000000 == 0 ): print >> sys.stderr, "%s\n" % lineCount, # match one of the lines p.parse( line ) # per table tallying if( submitOnly and p.command != "dm-CommitSubmit" ): continue # command count if( p.lineType == "command" ): commandCount += 1 if( firstDate == 0 ): firstDate = p.date lastDate = p.date # pages if( p.lineType == "pages" ): pagesIn += p.pgi pagesOut += p.pgo if( verboseLogging ): if( p.pgi + p.pgo > 0 ): print lineCount, "pages", p.table, p.pgi + p.pgo, p.command, p.args continue # locks if( p.lineType == "locks" ): # for interest sake if( verboseLogging ): if( p.lrh > 100 ): print lineCount, "readheld", p.table, p.lrh, p.command, p.args if( p.lwh > 100 ): print lineCount, "writeheld", p.table, p.lwh, p.command, p.args if( p.lrw > 100 ): print lineCount, "readwait", p.table, p.lrw, p.command, p.args if( p.lww > 100 ): print lineCount, "writewait", p.table, p.lww, p.command, p.args # remember count for this table for this command # and add it to the grand total tc = tableCounts( p.lrw, p.lrh, p.lww, p.lwh ) commandTables[ p.table ] = tc commandTotal |= tc grandTotal += tc # tally by table and by command name timesByTable.tally( p.table, tc ) timesByCommand.tally( p.table, p.command, tc ) # per command tallying if( p.lineType == "end" ): if( showHisto ): locksByTime.tally( commandTables, p.date ) if( showWaitVsHeld ): waitVsHeld.tally( commandTables, verboseLogging, lineCount, p.command ) if( showDelays ): waitDelays.tally( commandTotal, verboseLogging, lineCount, p.command ) commandTotal = tableCounts( 0, 0, 0, 0 ) commandTables = {} p.reset() # Dump it all out if( showTally ): print ">>> Log file summary" print "Lines,Commands,Pages Read,Pages Written,First Date,Last Date" print "%d,%d,%d,%d,%s,%s" % ( lineCount, commandCount, pagesIn, pagesOut, firstDate, lastDate ) if( showDelays ): waitDelays.dump() timesByTable.dump( grandTotal ) if( showWaitVsHeld ): waitVsHeld.dump( timesByTable.byTable ) if( showByCommand ): timesByCommand.dump() if( showHisto ): locksByTime.dump( spannerInterval ) main()