#!C:/Programs/SDK/Perl64/bin/perl.exe

# includeCleaner.pl
#  - a script to assist in cleaning up c/c++ includes
#    identifies where an include is in both the .h and .cpp files
#    as well as cases when a header is double-included in the same file.
# © 2014 Gabriel Weiss
# gweiss@perforce.com

use strict;
use File::Find;

use warnings;
use File::Basename qw(fileparse);

# Pass in the root folder of the source tree as the first argument, or put in a
# default path below for ease of repeated use. Note that @ARGV does not contain
# the script name, so the first argument is $ARGV[0].
my $startDir = $ARGV[0];
if (!defined($startDir) || $startDir eq "")
{
	$startDir = 'C:\perforce\gweiss_main_stream\src';
}

if (!-d $startDir)
{
	print STDERR "You need to pass the path to the source root folder as the first argument.\n";
	exit 1;
}

# Gather all our filenames recursively
my @fileNames;
find( \&gatherFiles, $startDir);

# This could be used to catch any type of duplicate text. For our purposes I wanted
# to simply clean up our includes. The $includeText variable could be changed to
# whatever you want to be looking at for cleanup.
my $includeText = '#include';

# The directive may be indented and may be followed by any amount of whitespace.
# Trailing whitespace (including the CR of a CRLF line ending) is not part of the
# captured target, so the same header always compares equal.
my $includeRegex = qr/^\s*\Q$includeText\E\s*(\S.*?)\s*$/;

# Order the files so that files sharing a directory and a name (foo.cpp / foo.h)
# are guaranteed to be adjacent, instead of relying on the traversal order that
# File::Find happened to produce.
my @sourceFiles =
	map  { $_->[1] }
	sort { $a->[0] cmp $b->[0] || $a->[1] cmp $b->[1] }
	map  { [ includePairKey($_), $_ ] } @fileNames;

my $previousKey;
my %includes = ();
foreach my $fullPath (@sourceFiles)
{
	my $currentKey = includePairKey($fullPath);
	my @found = readIncludes($fullPath, $includeRegex);

	# If the previous file has the same directory and name, it is the counterpart
	# of this one (e.g. we just parsed the .cpp and are now looking at the matching
	# header). %includes still holds the counterpart's includes at this point.
	if (defined($previousKey) && $previousKey eq $currentKey)
	{
		foreach my $include (@found)
		{
			if (exists($includes{$include}))
			{
				print "header already included - ".$fullPath."      ".$include."\n";
			}
		}
	}

	# Now that we've tested against the previous file, re-populate the includes
	# hash from the current file, reporting anything that shows up twice in it.
	$previousKey = $currentKey;
	%includes = ();
	foreach my $include (@found)
	{
		if (exists($includes{$include}))
		{
			print "duplicate header - ".$fullPath."      ".$include."\n";
		}
		else
		{
			$includes{$include} = "1";
		}
	}
}

exit 0;


# Returns the directory plus the file name without its last extension. Two files
# belong together (header / implementation) exactly when their keys are equal.
sub includePairKey
{
	my ($path) = @_;
	my ($stem, $dir) = fileparse($path, qr/\.[^.]*/);
	return $dir.$stem;
}

# Returns, in file order, the target of every line in $path that matches
# $includeRegex (the text captured by its first group). Dies if the file
# can't be opened.
sub readIncludes
{
	my ($path, $includeRegex) = @_;

	open(my $fh, "<", $path) or die "Couldn't open the file: ".$path.": ".$!;
	my @found;
	while (my $line = <$fh>)
	{
		if ($line =~ $includeRegex)
		{
			push @found, $1;
		}
	}
	close $fh;

	return @found;
}


# File::Find callback. Appends the full path of every .h and .cpp file to
# @fileNames. Tweaked to only apply to .h and .cpp, but the extension list in
# the regexp could be adjusted to handle any files.
sub gatherFiles
{
	# A directory can carry a matching name too, but it can't be parsed as a file.
	# File::Find sets $_ to the entry currently being visited.
	return if -d $_;

	my $file = $File::Find::name;
	# The alternation has to be grouped: without the group the pattern means
	# "contains .h" OR "ends in cpp", which also accepts foo.html, foo.hpp, etc.
	if ($file =~ /\.(?:h|cpp)\z/)
	{
		push @fileNames, $file;
	}
}