#!/usr/bin/ruby ################################################################# # # Copyright (c) 2008,2010 Perforce Software, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL PERFORCE SOFTWARE, INC. BE LIABLE FOR ANY # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # = Description # # Parse a checkpoint and identify duplicate files based on their md5 # checksum. Lazy copies and rcs files are ignored. 
#
# = Usage
#
#   anyduplicate.rb <checkpoint>
#
#################################################################

# One stored revision: its "depotFile#rev" label and its size in bytes.
class FileSize
  attr_reader :filename, :size

  def initialize(filename, size)
    @filename = filename
    @size = size
  end
end

# Matches a db.rev checkpoint record. Capture groups, in order:
# depot file, revision, digest, size, lazy-copy flag, librarian type.
RE_REV = Regexp.new('@pv@ \d+ @db.rev@ @(.*)@ (\d+) \d+ \d+ \d+ \d+ \d+ (.*) (\d+) \d+ (\d+) @.*@ @.*@ (\d+)')

# Values of the low byte of the librarian type that are taken into account.
# NOTE(review): presumably the non-RCS storage formats, since the header says
# rcs files are ignored -- confirm against the Perforce schema documentation.
COUNTED_STORAGE_TYPES = [1, 3].freeze

# Progress is reported every PROGRESS_STEP percent.
PROGRESS_STEP = 10

# Parses one checkpoint line.
#
# Returns [digest, FileSize] when the line is a db.rev record that is not a
# lazy copy and whose storage type is one of COUNTED_STORAGE_TYPES; returns
# nil for every other line.
def parse_revision(line)
  match = RE_REV.match(line)
  return nil unless match

  depot_file, rev, digest, size, lazy, lbr_type = match.captures

  # Both conditions must hold. The lazy-copy test used to be and-ed with the
  # first storage type only, which let lazy copies of type 3 through.
  return nil unless lazy == '0'
  return nil unless COUNTED_STORAGE_TYPES.include?(lbr_type.to_i & 255)

  [digest, FileSize.new("#{depot_file}##{rev}", size.to_i)]
end

# Reads every line from +io+ and groups the accepted revisions by digest.
#
# io          - any object responding to each_line (ARGF, File, StringIO...).
# total_bytes - expected input size in bytes, used only for progress output.
# out         - stream receiving the progress output.
#
# Returns a Hash mapping digest => Array of FileSize, in order of appearance.
def collect_revisions(io, total_bytes, out = $stdout)
  groups = Hash.new { |hash, digest| hash[digest] = [] }
  bytes_read = 0
  next_mark = PROGRESS_STEP

  out.printf('Processing checkpoint: 0%%')
  out.flush

  io.each_line do |line|
    # bytesize, not length: total_bytes is a byte count and a line may hold
    # multi-byte characters.
    bytes_read += line.bytesize

    # Integer arithmetic avoids dividing by a zero size, and the loop catches
    # up on every mark crossed by this line instead of stalling when a single
    # line jumps over more than one step.
    while next_mark <= 100 && bytes_read * 100 >= next_mark * total_bytes
      out.printf('...%d%%', next_mark)
      out.flush
      next_mark += PROGRESS_STEP
    end

    digest, file = parse_revision(line)
    groups[digest] << file if file
  end

  groups
end

# Prints one line per digest shared by more than one revision, followed by
# the grand total. The first revision of a group is considered the original;
# only the following ones are counted as duplicated.
#
# Returns [total_size, total_files].
def report_duplicates(groups, out = $stdout)
  total_size = 0
  total_files = 0

  groups.each_value do |files|
    next unless files.length > 1

    extras = files[1..-1]
    size = extras.inject(0) { |sum, file| sum + file.size }

    files.each { |file| out.printf('%s ', file.filename) }
    out.printf(" = %d bytes duplicated in %d file(s)\n", size, extras.length)

    total_size += size
    total_files += extras.length
  end

  out.printf("%d bytes duplicated in %d file(s)\n", total_size, total_files)
  [total_size, total_files]
end

# Script entry point. +argv+ is the command line; argv[0] is the checkpoint.
#
# Returns early (the script then ends with status 0, as it did with the
# former explicit exit(0) calls) when the argument is missing or the file
# does not exist.
def main(argv, out = $stdout)
  checkpoint = argv[0]

  if checkpoint.nil?
    out.puts('Usage: anyduplicate.rb <checkpoint>')
    return
  end

  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  unless File.exist?(checkpoint)
    out.printf("%s does not exist!\n", checkpoint)
    return
  end

  # Input is still read through ARGF ($<) so that command lines accepted by
  # earlier versions of the script keep working.
  groups = collect_revisions($<, File.size(checkpoint), out)

  out.printf("\nAnalysing results...\n")
  report_duplicates(groups, out)
end

main(ARGV)