#!/usr/bin/python
#
# This is a beta script!
# The author takes no responsibility for the results of using this tool.
#
# Motivation:
# If you are short of storage space on your Perforce server because of big binary files,
# then sooner or later you will consider using single head revisions (the +S filetype modifier).
# Two things I don't like about single head revisions:
#  * When you integrate a file with a single head revision, the Perforce server makes
#    a physical copy even if the file hasn't changed in the integration.
#    So it still wastes storage space if you work with branches.
#    This doesn't happen with normal files (files where single head revisions are not switched on).
#  * There is no way to keep particular revisions if you need them (milestone builds, etc.).
#    Old revisions are always deleted when new revisions are checked in.
#    You could create a branch for each revision that you need to keep (those revisions
#    would then be kept because of the behaviour described in the previous point),
#    but that would swell the integration metadata (especially if you've got
#    a massive number of files) and the depot file hierarchy.
# I needed a solution that
#  * makes it possible to keep
#    * labelled revisions (important revisions are labelled anyway)
#    * recently checked-in modifications and the preceding revision,
#      so that we avoid removing revisions that might be labelled shortly
#  * doesn't waste storage space on unneeded revisions and redundant copies.
#
#
# This script is a more sophisticated alternative to using single head revisions.
# Run it when you want to get rid of unneeded binary revisions, or when you just
# want a report on how much storage space is currently occupied by unneeded revisions.
# See the configuration section for details.
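#
# For example, once the configuration section has been filled in, a report run
# (MODE = 0) is simply:
#
#     python unneeded_binary_revisions.py
#
# Note that the script must be run on the Perforce server machine itself;
# it checks for this and raises an exception otherwise.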
#
# Robert Kovacs (rkovacs7878@gmail.com)
#


###############################
#### CONFIGURATION SECTION ####

P4PORT = "phoebe:1666"
P4USER = "Robert.Kovacs"
P4PASSWD = "Password1"

                                     # Head revisions are always kept
LABELLED_NEEDED = True               # If True then no labelled revisions will be removed
SIZE_LIMIT = 1000                    # Revisions smaller than this many bytes will not be removed
AGE_LIMIT = 7 * 24 * 60 * 60         # In seconds. If greater than 0, no revision newer than this age will be removed; the most recent of the older revisions is kept as well.

MODE = 0                             # 0 - Report
                                     #     Unneeded revisions will be listed but not removed
                                     # 1 - Bucket (thanks Sven for the great tool!)
                                     #     Unneeded revisions will be bucketed using p4bucket (//guest/sven_erik_knop/p4bucket/...)
                                     #     Before you use this option:
                                     #      * You need to understand p4bucket. Read the documentation!
                                     #      * Copy p4bucket.py into the same directory as unneeded_binary_revisions.py
                                     #      * Initialise p4bucket from there
                                     #      * Create a bucket for the unneeded binary revisions
                                     #      * Uncomment the p4bucket importing below
                                     #      * Set the BUCKET value below
                                     # 2 - Obliterate
                                     #     Unneeded revisions will be obliterated.
                                     #     BE VERY CAREFUL WITH THIS ONE, THERE IS NO WAY TO UNDO IT!
                                     #     You need to connect with a user that has permission to obliterate.

#import p4bucket                     # Uncomment this line if you want to use p4bucket
                                     # Read MODE == 1 for more information
BUCKET = "my_bucket"                 # Name of the bucket to archive revisions into.
                                     # Only used if MODE == 1
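                                     # For reference, in MODE == 1 each unneeded
                                     # revision is archived with the equivalent
                                     # of this p4bucket command (the depot path
                                     # and revision number are illustrative):
                                     #   p4bucket.py archive -b my_bucket -s //depot/some/file.bin#3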
                                     
####                       ####
###############################


import P4
import socket
import platform
import datetime
import os


p4 = None
server_root = None
revisions = { }             # key: <p4_path>#<revision_number>, value: lbr filename with full path
lbr_files = { }             # key: lbr filename with full path, value is True if this revision is needed
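# Illustrative example of the bookkeeping above (path and timestamp are hypothetical):
#   revisions["//depot/art/boss.tga#3"] ==
#       { 'lbr_filename': "depot/art/boss.tga,d/1.3.gz", 'mod_time': 1357924680 }
#   lbr_files["depot/art/boss.tga,d/1.3.gz"] == False   # flipped to True once marked as needed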


def initialise():
    connect_perforce()
    query_server_root()


def connect_perforce():
    print("Connecting Perforce...")
    
    global p4
    p4 = P4.P4()

    p4.port = P4PORT
    p4.user = P4USER
    p4.password = P4PASSWD

    p4.connect()


def query_server_root():
    print("Querying server root path...")

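    # Compare the server's IP address with this machine's: librarian files are
    # read straight off the server's file system, so the script only works when
    # run on the Perforce server itself.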
    server_info = p4.run_info()[0]
    server_host = server_info['serverAddress'].split(':')[0]
    server_ip = socket.gethostbyname(server_host)

    host_name = platform.uname()[1]
    host_ip = socket.gethostbyname(host_name)

    if server_ip != "127.0.0.1" and server_ip != host_ip:
        raise Exception("Run this script on the Perforce server!")

    global server_root
    server_root = server_info["serverRoot"]
    if not server_root.endswith('/') and not server_root.endswith('\\'):
        server_root += '/'


def enumerate_files():
    print("Processing depots...")

    global server_root
    global revisions
    
    for depot in p4.run_depots():
        if depot['type'] == "local":
            print(" " + depot['name'])

            lbr_path = depot['map'].rstrip('.')
#            if not lbr_path.startswith(server_root):
#                lbr_path = server_root + lbr_path
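            # Note: depot['map'] (e.g. "depot/...") is normally relative to the
            # server root, and Perforce stores a compressed binary revision in
            # the librarian tree as "<file>,d/<lbrRev>.gz" (e.g. "depot/a/b.tga,d/1.3.gz").
            # Run this script from the server root directory (P4ROOT) so that the
            # os.path calls below resolve, or uncomment the two lines above.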

            for fstat in p4.run_fstat("-Oasfc", "//" + depot['name'] + "/..."):
                # Candidates are plain binary revisions only: skip delta-stored (+D)
                # types and revisions already archived by p4bucket (these carry an
                # "attr-archiveDate" attribute).
                if ('lbrType' in fstat and fstat['lbrType'].startswith("binary") and "D" not in fstat['lbrType']
                        and 'lbrFile' in fstat and "attr-archiveDate" not in fstat):
                    lbr_filename = lbr_path + fstat['lbrFile'][(len(depot['name']) + 3):] + ",d/" + fstat['lbrRev'] + ".gz"
                    revisions[fstat['depotFile'] + "#" + fstat['headRev']] = { 'lbr_filename': lbr_filename, 'mod_time': int(fstat['headTime']) }

    print(" " + str(len(revisions.keys())) + " binary file revisions found.")


def collect_needed_lbr_files():
    global lbr_files

    for (filename, revision) in revisions.items():
        lbr_files[revision['lbr_filename']] = False

    mark_head_revisions()
    mark_labelled_revisions()
    mark_recently_modified_files()
    mark_small_files()


def mark_head_revisions():
    print("Processing head revisions...")

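    # First pass: record the highest revision number seen for each depot file;
    # head revisions are always kept.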
    p4_files = { }
    for revision in revisions.keys():
        tokens = revision.split('#')
        p4_filename = tokens[0]
        revision_number = int(tokens[1])

        if p4_filename not in p4_files or revision_number > p4_files[p4_filename]:
            p4_files[p4_filename] = revision_number

    rev_count = 0
    for (p4_filename, revision_number) in p4_files.items():
        lbr_files[revisions[p4_filename + "#" + str(revision_number)]['lbr_filename']] = True
        rev_count = rev_count + 1

    print(" " + str(rev_count) + " revisions marked as head revision.")


def mark_labelled_revisions():
    if LABELLED_NEEDED:
        print("Processing labels...")

        rev_count = 0
        labels = p4.run_labels()
        for label in labels:
            print(" " + label['label'])

            labelled_revisions = p4.run_files("//...@" + label['label'])
            for labelled_revision in labelled_revisions:
                key = labelled_revision['depotFile'] + "#" + labelled_revision['rev']
                if key in revisions and not lbr_files[revisions[key]['lbr_filename']]:
                    lbr_files[revisions[key]['lbr_filename']] = True
                    rev_count = rev_count + 1

        print(" Further " + str(rev_count) + " revisions marked as labelled.")


def mark_recently_modified_files():
    print("Processing recently modified files...")

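    # Current Unix time (seconds since the 1970 epoch) minus AGE_LIMIT gives the
    # cutoff: anything modified after "earliest" counts as recently changed.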
    now = datetime.datetime.utcnow() - datetime.datetime(1970, 1, 1)
    earliest = now.days * 24 * 60 * 60 + now.seconds - AGE_LIMIT
    
    rev_count = 0
    for (filename, revision) in revisions.items():
        if revision['mod_time'] > earliest:
            if not lbr_files[revision['lbr_filename']]:
                lbr_files[revision['lbr_filename']] = True
                rev_count = rev_count + 1

            # Also keep the revision immediately preceding a recent one, so that a
            # freshly submitted change never exposes its predecessor to removal.
            tokens = filename.split('#')
            previous_revision = tokens[0] + "#" + str(int(tokens[1]) - 1)
            if previous_revision in revisions and not lbr_files[revisions[previous_revision]['lbr_filename']]:
                lbr_files[revisions[previous_revision]['lbr_filename']] = True
                rev_count = rev_count + 1

    print(" Further " + str(rev_count) + " revisions marked as recently modified.")


def mark_small_files():
    if SIZE_LIMIT > 0:
        print("Skipping small files...")

        missing_count = 0
        small_count = 0
        for (lbr_filename, needed) in lbr_files.items():
            if not needed:
                if not os.path.exists(lbr_filename):
                    # Librarian file not found on disk: mark the revision as
                    # needed so it is never measured, archived or obliterated.
                    lbr_files[lbr_filename] = True
                    missing_count = missing_count + 1
                elif os.path.getsize(lbr_filename) < SIZE_LIMIT:
                    lbr_files[lbr_filename] = True
                    small_count = small_count + 1

    print(" Further " + str(small_count) + " revisions marked as smaller than " + str(SIZE_LIMIT) + " bytes.")
    print(" " + str(missing_count) + " librarian files not found.")


def sum_size():
    print("Calculating total file size...")

    file_count = 0
    total_size = 0
    
    for (lbr_filename, needed) in lbr_files.items():
        if not needed:
            file_count += 1
            total_size += os.path.getsize(lbr_filename)

    print(" Unneeded " + str(total_size) + " bytes in " + str(file_count) + " files.")
    

def print_unneeded_revisions():
    print("Unneeded revisions:")
    
    for (filename, revision) in revisions.items():
        if not lbr_files[revision['lbr_filename']]:
            print(" " + filename)


def bucket_unneeded_revisions():
    print("Bucketing revisions:")
    
    bucket = p4bucket.P4Bucket(p4bucket.CONFIG_FILE)
        
    for (filename, revision) in revisions.items():
        if not lbr_files[revision['lbr_filename']]:
            print(" "+ filename)
            bucket.run("archive", ["-b", BUCKET, "-s", filename]);


def obliterate_unneeded_revisions():
    print("Obliterating revisions:")
    
    for (filename, revision) in revisions.items():
        if not lbr_files[revision['lbr_filename']]:
            print(" "+ filename)
            p4.run_obliterate("-y", filename)
    

if __name__ == '__main__':
    initialise()
    enumerate_files()
    collect_needed_lbr_files()
    sum_size()

    if MODE == 0:
        print_unneeded_revisions()
    elif MODE == 1:
        bucket_unneeded_revisions()
    elif MODE == 2:
        obliterate_unneeded_revisions()