#!/usr/bin/env python3
# -*- encoding: UTF8 -*-
"""
p4storageanalyzer.py - Performs analysis of Perforce server storage.

Data is output in CSV flat files.

Features:
- Parses all submitted changelists matching the given depot path(s).
- Must run either on the p4d server itself or on a system with access to the
  filesystem (or a copy of the filesystem) where depot archive files are stored.
- Outputs a detailed CSV file (one line per change and path) and a summary
  CSV file (one line per path over all changes).

Note that paths are reported down to a level of N directories deep (the
--depth parameter), to avoid listing every file by its complete path when
files are 10 or 15 levels deep in the directory tree and the number of lines
output might become overwhelming.

Detailed CSV output (per change), e.g. with --depth 5:

change,user,workspace,submitDateTime,fileCount,lazyCount,path,clientSize,archiveSizeRCS,archiveSizeBinary
1234,Fred,fred-ws,2018/07/01 12:01:02,1,0,//archive/a/b/c/x.bin,9900,0,2000
1234,Fred,fred-ws,2018/07/01 12:01:02,1,0,//archive/a/b/c/y.txt,9900,2000,0
1234,Fred,fred-ws,2018/07/01 12:01:02,5,0,//archive/a/b/c/d/,123000,50000,0

Summary CSV output (over all changes), sizes in human readable units:

Path,Revisions,LazyCopies,clientSize,archiveSizeRCS,archiveSizeBinary
//archive/a/b/c/,6,2,126.1KB,1.2KB,119.5KB

Paths ending with / mean that the data is for that directory and all its
sub-folders.

The clientSize field is taken from the fileSize field of "p4 fstat -Ol"
(the size of the file when synced to a workspace).
The archiveSize fields are the size of the file on the server in the depot
file system.

If you specify --verbose-rcs then detailed processing is done for every RCS
file, which can be time consuming. If the revision being looked up is not the
latest revision (HEAD) in the RCS file, then the RCS file is copied to a temp
location, all revisions later than the specified revision are deleted from
the copy, and the size contributed by the specified revision is calculated
from the size of the modified copy.
This option requires the following RCS tools to be installed and available
in the path: rcs, rlog
"""

from __future__ import print_function

import argparse
import logging
import os
import re
import shutil
import subprocess
import sys
import tempfile
import time
from collections import OrderedDict

try:
    import P4
except ImportError:
    # Deferred failure: lets --help and the pure helper classes be used
    # without P4Python; P4StorageAnalyzer raises a clear error instead.
    P4 = None

python3 = sys.version_info[0] >= 3

DEFAULT_VERBOSITY = 'INFO'
DEFAULT_LOG_FILE = 'log-p4storageanalyzer.log'
LOGGER_NAME = 'P4STORAGE'

# A depot Map starting with one of these characters is treated as relative
# (unless its second character is ":" - i.e. a Windows drive letter).
RELATIVE_MAP_RE = re.compile(r"^([a-zA-Z0-9_\-\.]+.*)")


def getTime(epoch):
    "Formats epoch seconds (int or numeric string) as local YYYY/MM/DD HH:MM:SS"
    return time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(int(epoch)))


def human_fmt(num, suffix='B'):
    "Formats a byte count using 1024-based units, e.g. 1536 -> '1.5KB'"
    for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Y', suffix)


def getOutput(cmdArgs):
    """Runs the command (list of arguments) and returns its stdout as text.

    Raises whatever subprocess.check_output raises (OSError if the
    executable is missing, CalledProcessError on non-zero exit status).
    """
    result = subprocess.check_output(cmdArgs)
    if python3:
        result = result.decode('utf-8')
    else:
        result = str(result)
    return result


class RCSLbrDetails:
    "Work out details of RCS version size - may require munging of a copy of RCS file!"

    def __init__(self, logger):
        self.logger = logger
        # Scratch directory for modified copies of RCS files
        self.dirpath = tempfile.mkdtemp()

    def __del__(self):
        # dirpath is missing if mkdtemp() failed; never raise from a finalizer
        dirpath = getattr(self, 'dirpath', None)
        if dirpath:
            shutil.rmtree(dirpath, ignore_errors=True)

    def setRCSSize(self, fileInfo):
        """Updates fileInfo with size of RCS revision fileInfo.lbrRev.

        Logs a warning and leaves fileInfo unchanged if the revision cannot
        be found in the output of rlog for fileInfo.lbrPath.
        """
        # Example rlog output:
        # RCS file: /test/some_file.txt, v
        # Working file: some_file.txt
        # head: 1.22
        # branch:
        # locks:
        # access list:
        # symbolic names:
        # keyword substitution: kv
        # total revisions: 12; selected revisions: 12
        # description:
        # ----------------------------
        # revision 1.22
        # date: 2018/07/13 11:16:09; author: p4; state: Exp; lines: +1 -1
        # *** empty log message ***
        # ----------------------------
        # revision 1.21
        # date: 2018/07/13 10:25:17; author: p4; state: Exp; lines: +5 -5
        # *** empty log message ***
        self.logger.debug("Checking RCS File: %s" % fileInfo.lbrPath)
        result = getOutput(['rlog', fileInfo.lbrPath])
        self.logger.debug("rlog info: %s" % result)
        revs = []
        for line in result.split("\n"):
            if line.startswith("revision "):
                # Keep only the revision number, dropping any trailing text
                # or carriage return on the same line
                fields = line[len("revision "):].split()
                if fields:
                    revs.append(fields[0])
        if not revs:
            self.logger.warning("Failed to find revisions for file: %s" % fileInfo.lbrPath)
            return
        if len(revs) == 1:
            # Single rev in the file so just take size of file on disk
            if revs[0] == fileInfo.lbrRev:
                size = os.path.getsize(fileInfo.lbrPath)
                fileInfo.setSize(size)
            else:
                self.logger.warning("Single revision is not correct: %s/%s/%s" % (fileInfo.lbrPath, fileInfo.lbrRev, revs[0]))
            return
        if fileInfo.lbrRev not in revs:
            self.logger.warning("Specified rev %s not found: %s" % (fileInfo.lbrRev, ",".join(revs)))
            return

        # Copy the file because we need to modify it.
        # revs is in the order rlog lists them, so revs[0] is the first
        # revision listed and revs[-1] the last.
        # If the specified rev is not revs[0] we delete the revs listed before
        # it so the specified rev becomes the first one, and record the file
        # size. Then (unless the specified rev is revs[-1]) we delete the
        # specified rev too and subtract one size from the other to get the
        # difference to the file resulting from adding the specified rev.
        tmppath = os.path.join(self.dirpath, os.path.basename(fileInfo.lbrPath))
        shutil.copy(fileInfo.lbrPath, tmppath)
        self.logger.debug("Copied temp file: %s" % tmppath)
        try:
            if revs[0] != fileInfo.lbrRev:
                startRev = revs[revs.index(fileInfo.lbrRev) - 1]
                endRev = revs[0]
                if startRev == endRev:
                    revRange = startRev
                else:
                    revRange = '%s:%s' % (startRev, endRev)
                result = getOutput(['rcs', '-o%s' % revRange, tmppath])
                self.logger.debug("Deleted later revs: %s %s" % (revRange, result))
            size1 = os.path.getsize(tmppath)
            # Delete revision itself unless it is first revision of file
            if revs[-1] == fileInfo.lbrRev:
                size2 = 0
            else:
                result = getOutput(['rcs', '-o%s' % fileInfo.lbrRev, tmppath])
                self.logger.debug("Deleted actual rev: %s %s" % (fileInfo.lbrRev, result))
                size2 = os.path.getsize(tmppath)
            fileInfo.setSize(size1 - size2)
        finally:
            # Remove the scratch copy even if an rcs command failed
            if os.path.exists(tmppath):
                os.remove(tmppath)


class FileInfo:
    "Info about the file from fstat record"

    def __init__(self, fstat):
        """fstat is a dict for one file revision; must contain 'depotFile'.

        'fileSize' and 'lbrIsLazy' are optional.
        """
        self.fstat = fstat
        self.depotFile = fstat['depotFile']
        try:
            self.clientSize = int(fstat['fileSize'])
        except (KeyError, TypeError, ValueError):
            # No (or non-numeric) fileSize in the record
            self.clientSize = 0
        self.lazyCount = 0
        if 'lbrIsLazy' in fstat and fstat['lbrIsLazy'] == '1':
            self.lazyCount = 1
        self.archiveSizeBinary = 0
        self.archiveSizeRCS = 0
        self.isRCS = False
        self.lbrPath = None
        self.lbrRev = None

    def setSize(self, size):
        "Records the archive size against RCS or binary depending on isRCS"
        if self.isRCS:
            self.archiveSizeRCS = size
        else:
            self.archiveSizeBinary = size

    def depthPath(self, depth):
        """Return the path at appropriate level.

        Returns the full depot path if depth is 0 or the path has no more
        than depth components, otherwise the first depth components with a
        trailing slash.
        """
        parts = self.depotFile[2:].split("/")
        if depth == 0 or len(parts) <= depth:
            return self.depotFile
        return "//%s/" % "/".join(parts[:depth])


class SummaryInfo:
    "Info about the files summarised at a path depth"

    def __init__(self, path, lazyCount, clientSize, archiveSizeBinary, archiveSizeRCS, count=1):
        "count is the number of file revisions the initial values represent"
        self.count = count
        self.lazyCount = lazyCount
        self.path = path
        self.clientSize = clientSize
        self.archiveSizeBinary = archiveSizeBinary
        self.archiveSizeRCS = archiveSizeRCS

    def update(self, lazyCount, clientSize, archiveSizeBinary, archiveSizeRCS, count=1):
        "Adds the values (representing count file revisions) to the totals"
        self.count += count
        self.lazyCount += lazyCount
        self.clientSize += clientSize
        self.archiveSizeBinary += archiveSizeBinary
        self.archiveSizeRCS += archiveSizeRCS


class ArchiveLocator:
    """Knows how to calculate archive locations.

    This is much simpler in 2017.1+ servers where you have fstat -Ob to
    directly give you the path!"""

    def __init__(self, options, p4, logger):
        """Reads depot definitions and server configuration via p4.

        options must provide depot_root and verbose_rcs; p4 is a connected
        P4 instance.
        """
        self.p4 = p4
        self.options = options
        self.logger = logger
        self.depots = {}    # depot name -> depot record
        self.roots = {}     # depot name -> calculated archive root (cache)
        self.root = None
        self.server_depot_root = None
        for d in p4.run_depots():
            self.depots[d['name']] = d
        configs = p4.run_configure("show")
        for c in configs:
            self.logger.debug(c)
            if c['Type'] == 'option' and c['Name'] == 'r':
                self.root = c['Value']
            if c['Type'] == 'configure' and c['Name'] == 'server.depot.root':
                self.server_depot_root = c['Value']
        self.rcsdetails = RCSLbrDetails(self.logger)

    def getDepotRoot(self, path):
        """Return depot root - either relative or absolute path.

        path is a depot path (//depot/...). Returns "" if the depot is not
        known. Raises Exception if the resulting map does not end with "...".
        """
        depot = path[2:].split("/")[0]  # Remove // and take first component
        if depot in self.roots:
            return self.roots[depot]
        # Otherwise calculate (and cache)
        try:
            depotMap = self.depots[depot]['map']
        except KeyError:
            return ""
        m = RELATIVE_MAP_RE.match(depotMap)
        hasDriveLetter = len(depotMap) > 1 and depotMap[1] == ":"
        if m and not hasDriveLetter:
            # Relative root
            root = m.group(1)
            if self.options.depot_root:
                root = os.path.join(self.options.depot_root, root)
            elif self.server_depot_root:
                root = os.path.join(self.server_depot_root, root)
            else:
                root = os.path.join(self.root, root)
        else:
            # Assume absolute root
            if self.options.depot_root:
                root = os.path.join(self.options.depot_root, depot) + "/..."
            else:
                root = depotMap
        if not root.endswith("..."):
            raise Exception("Invalid Map %s" % root)
        root = root[:-3]
        if root.endswith("/") or root.endswith("\\"):
            root = root[:-1]
        # NOTE: existence of root on disk is deliberately not checked here
        self.roots[depot] = root
        return root

    def getLbrExt(self, path, lbrType, lbrRev):
        """Returns tuple (isRCS, archive path relative to depot root).

        RCS files are stored as "<path>,v"; others as "<path>,d/<lbrRev>"
        with a ".gz" suffix when compressed. lbrType is a Perforce file type
        with optional modifiers, e.g. "text", "binary+F", "text+C".
        """
        isRCS = False
        isCompressed = False
        base = lbrType
        opts = ""
        if "+" in lbrType:
            (base, opts) = lbrType.split('+', 1)
        if base in ["text", "symlink", "unicode", "utf8", "utf16"]:
            isRCS = True
            if "C" in opts:
                isRCS = False
                isCompressed = True
        if base in ["binary"]:
            isRCS = False
            isCompressed = True
        if "F" in opts:
            isCompressed = False
        if "D" in opts:
            isRCS = True
        if isRCS:
            return (isRCS, "%s,v" % path)
        ext = ""
        if isCompressed:
            ext = ".gz"
        return (isRCS, os.path.join("%s,d" % path, "%s%s" % (lbrRev, ext)))

    def setLbrPath(self, fileInfo, fstat):
        """Sets isRCS, lbrRev and lbrPath on fileInfo from the fstat record.

        lbrPath is set to None if the record has no lbrFile field.
        """
        if 'lbrFile' not in fstat:
            fileInfo.lbrPath = None
            return
        # Remove // and then the depot component. Depot paths always use "/"
        # as separator, whatever the local OS separator is.
        parts = fstat['lbrFile'][2:].split("/")[1:]
        path = os.path.sep.join(parts)
        (isRCS, ext) = self.getLbrExt(path, fstat['lbrType'], fstat['lbrRev'])
        fileInfo.isRCS = isRCS
        fileInfo.lbrRev = fstat['lbrRev']
        fileInfo.lbrPath = os.path.join(self.getDepotRoot(fstat['depotFile']), ext)

    def getLbrSize(self, fileInfo, fstat):
        """Sets the archive size on fileInfo (best effort).

        Lazy copies and files without an archive path are skipped. Problems
        reading the archive are logged at debug level and otherwise ignored,
        leaving the archive sizes at 0.
        """
        self.setLbrPath(fileInfo, fstat)
        if fileInfo.lazyCount or not fileInfo.lbrPath:
            return
        try:
            if fileInfo.isRCS and self.options.verbose_rcs:
                self.rcsdetails.setRCSSize(fileInfo=fileInfo)
            else:
                size = os.path.getsize(fileInfo.lbrPath)
                fileInfo.setSize(size)
        except Exception as e:
            self.logger.debug("Problems processing file: %s\n%s" % (fileInfo.lbrPath, str(e)))

    def setVerboseRCSSize(self, fileInfo):
        """Sets size from the RCS file on disk if lbrRev is the head revision.

        Not called from within this file. Raises Exception if the rlog
        output has no "head: " line.
        """
        self.logger.debug("Checking RCS File: %s" % fileInfo.lbrPath)
        result = getOutput(['rlog', '-h', fileInfo.lbrPath])
        self.logger.debug("rlog info: %s" % result)
        for line in result.split("\n"):
            if line.startswith("head: "):
                rev = line[len("head: "):]
                if rev == fileInfo.lbrRev:
                    size = os.path.getsize(fileInfo.lbrPath)
                    fileInfo.setSize(size)
                return
        raise Exception("Could not find right string: head")

    def getFileInfo(self, fstat):
        "Returns FileInfo instance with all details completed"
        fi = FileInfo(fstat)
        self.getLbrSize(fi, fstat)
        return fi


class P4StorageAnalyzer():
    "Runs the analysis and writes the detailed and summary CSV reports"

    def __init__(self, options, outstream=None, sumstream=None):
        """Opens the report streams, sets up logging and connects to Perforce.

        options is the argparse namespace (see buildArgParser). outstream
        and sumstream, if given, are used instead of opening options.output
        and options.summary. Raises ImportError if P4Python is not installed.
        """
        if P4 is None:
            raise ImportError("P4Python (module 'P4') is required to run the analysis")
        self.options = options
        self.options.logfile = DEFAULT_LOG_FILE
        self.p4 = P4.P4()
        if options.port:
            self.p4.port = options.port
        if options.user:
            self.p4.user = options.user
        self.writeHeader = True
        if outstream:
            self.outstream = outstream
        else:
            appending = getattr(options, 'append', False)
            if appending and os.path.isfile(options.output) and os.path.getsize(options.output) > 0:
                # Existing report already has its header line
                self.writeHeader = False
            self.outstream = open(options.output, "a" if appending else "w")
        if sumstream:
            self.sumstream = sumstream
        else:
            if options.summary == "-":
                self.sumstream = sys.stdout
            else:
                self.sumstream = open(options.summary, "w")
        self.init_logger()
        self.logger.debug("p4storageanalyzer.py: %s" % self.options)
        self.logger.debug("Currdir: %s" % os.getcwd())
        if options.verbose_rcs:
            self.ensureRCSBinaries()
        self.p4.connect()
        self.locator = ArchiveLocator(self.options, self.p4, self.logger)

    def init_logger(self):
        """Sets up the logger.

        Records of level INFO and above are written unadorned to the report
        stream (this is how report lines are output). A timestamped log file
        is also written unless verbosity is INFO.
        """
        self.logger = logging.getLogger(LOGGER_NAME)
        self.logger.setLevel(self.options.verbosity)
        outformatter = logging.Formatter('%(message)s')
        ch = logging.StreamHandler(self.outstream)
        ch.setLevel(logging.INFO)
        ch.setFormatter(outformatter)
        self.logger.addHandler(ch)
        # verbosity is normally a level name from the command line, but
        # accept the numeric level too
        isInfo = self.options.verbosity in (logging.INFO, 'INFO')
        if not isInfo and self.options.logfile:
            formatter = logging.Formatter('%(asctime)s:%(levelname)s %(message)s')
            fh = logging.FileHandler(self.options.logfile, mode='w')
            fh.setFormatter(formatter)
            self.logger.addHandler(fh)

    def ensureRCSBinaries(self):
        "Raises Exception unless GNU rcs and rlog can be run from the path"
        for b in ["rcs", "rlog"]:
            self.logger.debug("Checking for binary: %s" % b)
            try:
                ver = getOutput([b, '--version'])
            except (OSError, subprocess.CalledProcessError):
                raise Exception("Could not find RCS executable in path: %s" % b)
            self.logger.debug("Version info: %s" % ver)
            if "GNU RCS" not in ver:
                raise Exception("Could not find RCS executable in path: %s" % b)

    def output(self, line):
        "Writes a line to the detailed report (via the logger)"
        self.logger.info(line)

    def getFileInfo(self, fstats):
        "Yields a completed FileInfo for each fstat record"
        for fstat in fstats:
            yield self.locator.getFileInfo(fstat)

    def run(self):
        """Analyses all changes for the requested paths and writes both reports.

        Writes one detailed line per change and path, then the summary over
        all changes. The summary stream is closed afterwards (flushed only,
        if it is sys.stdout).
        """
        if getattr(self, 'writeHeader', True):
            self.output("change,user,workspace,submitDateTime,fileCount,lazyCount,path,clientSize,archiveSizeRCS,archiveSizeBinary")
        # fstat below reports every file in a change, so a change returned
        # for more than one path argument must only be processed once.
        changes = []
        seen = set()
        for path in self.options.path:
            for change in self.p4.run_changes(path):
                if change['change'] not in seen:
                    seen.add(change['change'])
                    changes.append(change)
        summary = OrderedDict()
        for change in changes:
            self.logger.debug(change)
            fstats = self.p4.run_fstat("-Oc", "-Ol", "@=%s" % change['change'])
            self.logger.debug(fstats)
            pathSummary = OrderedDict()
            for fileInfo in self.getFileInfo(fstats):
                path = fileInfo.depthPath(self.options.depth)
                if path in pathSummary:
                    pathSummary[path].update(fileInfo.lazyCount, fileInfo.clientSize,
                                             fileInfo.archiveSizeBinary, fileInfo.archiveSizeRCS)
                else:
                    pathSummary[path] = SummaryInfo(path, fileInfo.lazyCount, fileInfo.clientSize,
                                                    fileInfo.archiveSizeBinary, fileInfo.archiveSizeRCS)
            for path in pathSummary:
                sumInfo = pathSummary[path]
                # Carry the number of files over, so that the overall count
                # is file revisions and not number of changes.
                if path in summary:
                    summary[path].update(sumInfo.lazyCount, sumInfo.clientSize,
                                         sumInfo.archiveSizeBinary, sumInfo.archiveSizeRCS,
                                         count=sumInfo.count)
                else:
                    summary[path] = SummaryInfo(path, sumInfo.lazyCount, sumInfo.clientSize,
                                                sumInfo.archiveSizeBinary, sumInfo.archiveSizeRCS,
                                                count=sumInfo.count)
                self.output("{change},{user},{workspace},{submitTime},{fileCount},{lazyCount},{path},{clientSize},"
                            "{archiveSizeRCS},{archiveSizeBinary}".format(
                                change=change['change'], user=change['user'], workspace=change['client'],
                                path=path, submitTime=getTime(change['time']),
                                fileCount=sumInfo.count, lazyCount=sumInfo.lazyCount,
                                clientSize=sumInfo.clientSize,
                                archiveSizeRCS=sumInfo.archiveSizeRCS,
                                archiveSizeBinary=sumInfo.archiveSizeBinary))
        self.sumstream.write("Path,Revisions,LazyCopies,clientSize,archiveSizeRCS,archiveSizeBinary\n")
        for path in summary:
            sumInfo = summary[path]
            self.sumstream.write("{path},{fileCount},{lazyCount},{clientSize},{archiveSizeRCS},{archiveSizeBinary}\n".format(
                path=path, fileCount=sumInfo.count, lazyCount=sumInfo.lazyCount,
                clientSize=human_fmt(sumInfo.clientSize),
                archiveSizeRCS=human_fmt(sumInfo.archiveSizeRCS),
                archiveSizeBinary=human_fmt(sumInfo.archiveSizeBinary)))
        if self.sumstream is sys.stdout:
            # Never close the interpreter's stdout
            self.sumstream.flush()
        else:
            self.sumstream.close()


def buildArgParser():
    "Returns the argparse parser for the command line options"
    parser = argparse.ArgumentParser(add_help=True)
    parser.add_argument('-p', '--port', help="Perforce P4PORT to use (default is current environment).", default=None)
    parser.add_argument('-u', '--user', help="Perforce P4USER to use (default is current environment).", default=None)
    parser.add_argument('-r', '--depot-root',
                        help="Equivalent of server.depot.root (optional). Can be used to run script not "
                             "directly on the server, but with access to copy of server filesystem."
                             "Do not supply this parameter if directly on server machine. "
                             "This overrides P4ROOT and absolute depot Map paths.", default=None)
    parser.add_argument('-d', '--depth',
                        help="Maximum number of levels (depot/dir1/dir2/etc...) to report at, 0 = all",
                        type=int, default=4)
    parser.add_argument('-o', '--output', help="Output (report) file - default output.csv", default="output.csv")
    parser.add_argument('-a', '--append', default=False, action='store_true', help="Append to report file")
    parser.add_argument('--verbose-rcs', default=False, action='store_true',
                        help="Analyse detailed size of RCS files (can result in rewriting copies of RCS format files). "
                             "If not specified then only current size of RCS files on disk is reported which can be misleading. "
                             "Default is False in which case it will report current size of RCS file.")
    parser.add_argument('-s', '--summary', help="Summary report file - default summary.csv", default="summary.csv")
    # nargs='*' (not '+') so that the documented default applies when no
    # path is given; the default is a list like the parsed values.
    parser.add_argument('path', nargs='*',
                        help="Perforce depot path to analyse."
                             "Note that changelist or other qualifiers are valid, e.g. //depot/...@100,200"
                             " default value: //...", default=["//..."])
    parser.add_argument('-v', '--verbosity',
                        nargs='?',
                        const="INFO",
                        default=DEFAULT_VERBOSITY,
                        choices=('DEBUG', 'INFO', 'WARNING', 'ERROR', 'FATAL'),
                        help="Output verbosity level. Default is: " + DEFAULT_VERBOSITY)
    return parser


def main(argv=None):
    """Parses the command line (argv, default sys.argv[1:]) and runs the analysis.

    argparse reports invalid arguments itself, printing usage and exiting.
    """
    options = buildArgParser().parse_args(argv)
    obj = P4StorageAnalyzer(options)
    obj.run()


if __name__ == '__main__':
    main()