#!/bin/env perl
#==============================================================================
# Copyright and license info is available in the LICENSE file included with
# the Server Deployment Package (SDP), and also available online:
# https://workshop.perforce.com/view/p4-sdp/main/LICENSE
#------------------------------------------------------------------------------

# For a summary of command line options, run:  p4pcm.pl -h

# This runs on Unix/Linux systems only.
# Log file is p4pcm.log, in $LOGS dir.
# Nomination file is p4pcm.nomlist, in $LOGS dir.

use strict;
use File::Basename;
use Getopt::Long;
use POSIX;

#------------------------------------------------------------------------------
# Initialization
#------------------------------------------------------------------------------
BEGIN
{
    $main::ThisScript = basename($0);
    $main::Version = "1.5.2";
}

#------------------------------------------------------------------------------
# Prototypes for local functions.
#------------------------------------------------------------------------------
sub usage();
sub getDriveSpace($;);

#------------------------------------------------------------------------------
# Declarations
#------------------------------------------------------------------------------
my $AcctTime;
my $TopDir;
#
#  If $Freespace is less than $ThresholdLow, then start deleting files until
#  a minimum of $ThresholdHigh diskspace is available.
#  Thresholds are compared against a system value reporting in kilobytes.
#  Older versions of this tool used absolute value defaults - which were often
#  inappropriate. For coding/ specification simplicity, the maximum available
#  cache space is assumed to be the entire space of the drive that contains it.
#  Since caches are often rooted in a subdirectory of a single common system drive,
#  high percentage thresholds may indirectly specify delete of the entire cache.
#  Which would defeat the proxy purpose. A similar condition exists when the cache
#  and operational log files use space from the same logical drive. It is beyond
#  the scope of this tool to manage log files.
#
my $ThresholdLow  = "12";  # To trigger prior to typical 10% low disk scanner alerts
my $ThresholdHigh = "25";
my $TotalSize;
my $Freespace;
my $Timestamp;
my $Datestamp;
my $Filecount;
my @RmList = ();
my $NomInfo;
my $RmFile;

#------------------------------------------------------------------------------
# Function: usage()
# Displays usage message.
#------------------------------------------------------------------------------
sub usage ()
{
   # tag::includeManual[]
   print "\nUsage:\n
   $main::ThisScript [-d \"proxy_cache_dir\"] [-tlow <low_threshold>] [-thigh <high_threshold>] [-n/-s]
or
   $main::ThisScript -h

This utility removes files in the proxy cache if the amount of free disk
space available to the cache falls below the low threshold (default $ThresholdLow).
It removes cache files based on time last accessed starting with the least
recently accessed continuing until either all files are deleted or the free disk
space available to the cache specified by the high threshold (default $ThresholdHigh)
is reached. Specify numeric threshold values in kilobyte units (kb), or as a number
less than 100 to specify percentage of the total disk space available to
the cache. A high_threshold near the available disk space typically results in a
full clear of the cache defeating the purpose of a proxy.

The '-d \"proxy_cache_dir\"' argument is required unless \$P4PCACHE is defined.
The -d argument takes precedence. proxy_cache_dir should be a fully rooted path
starting with '/'. Relative or local paths are fatal to tool operation.

The log is \$LOGS/p4pcm.log if \$LOGS is defined, else p4pcm.log in the current
directory.

The removal nomination file list is \$LOGS/p4pcm.nomlist if \$LOGS is
defined, else p4pcm.nomlist in the current directory. The nomination list
file contains an access time ordered list of all cache files. If the nomination
list file exists at the start of this tool, the tool exits assuming a separate
run is currently in progress. The nomination list file is deleted when the tool
completes operation unless the '-s' argument is specified.

Use '-n' or '-s' to show what files would be removed. '-s' also causes the
nomination list file to remain undeleted. The nomination list file must be
manually removed prior to a subsequent successful use of this tool.
";
   # end::includeManual[]
   exit  1;;
}

#------------------------------------------------------------------------------
# Function getDriveSpace($TopDir)
# Returns a 2-element array containing $totalspace and $Freespace, i.e.
#    returns ($totalspace, $Freespace).
#------------------------------------------------------------------------------
sub getDriveSpace($;)
{
   my ($TopDir) = @_;
   my $totalSpace;
   my $freeSpace;
   my $dirInfo;
   my $junk;

   # Run 'df -k $TopDir', and extract the total and available space values,
   # using $junk to ignore extraneous information.
   $dirInfo = `df -k $TopDir`;
   $dirInfo =~ s/^.*?\n//; # Zap the header line.
   $dirInfo =~ s/\s+/,/gs; # Replace whitespace with comma as field delimiter.
   ($junk, $totalSpace, $junk, $freeSpace, $junk, $junk) = split (',',$dirInfo);
   return ($totalSpace, $freeSpace);
}
#------------------------------------------------------------------------------
# Function getThreshold($SpecifiedValue,$ParameterName)
#
# Validates and decodes a threshold value specification. $SpecifiedValue is
# expected to be all digits. If the value of $SpecifiedValue is 100 or more,
# it specifies the threshold in kilobytes. Otherwise it specifies the threshold
# as a percentage of $TotalSize (which is specified as kilobytes).
#
# NOTE: Older versions of this tool used the % character to indicate percentage.
#       The % character has special meaning in crontab specifications creating
#       problematic specification/ use scenarios. Since a 100K cache is effectively
#       useless for typical modern files, choosing less than 100 as a percentage
#       specification choice seemed appropriate.
#
# Unknown formats (such as using a MGT suffix), a decoded value of 0, or a decoded
# value greater than $TotalSize cause a LOG error message and return of 0.
#
# Returns decoded threshold value.
#------------------------------------------------------------------------------
sub getThreshold($$)
{
   my ($Specified, $Parameter) = @_;
   my $Value = 0;
   if( $Specified =~ m!^(\d+)$! ) {
      $Value = $1;
      if( $Value > 0 && $Value < 100 ) {
         $Value = int (($1 / 100) * $TotalSize);
         print LOG "$Parameter parameter specified as $Specified \% of $TotalSize kb\n";
      } elsif( $Value == 0 ) {
         print LOG "$Parameter parameter specified as $Specified evaluates as 0 kb\n";
      } else {
         print LOG "$Parameter parameter specified as $Value kb\n";
      }
   } else {
      print LOG "'$Specified' is unrecognized format for $Parameter parameter\n";
      $Value = 0;
   }
   if( $Value > $TotalSize ) {
      print LOG "$Parameter at $Value is larger than total disk size of $TotalSize\n";
      $Value = 0;
   }
   return $Value;
}
#------------------------------------------------------------------------------
# Function setDateAndTime()
#
# Establish format consistent text string values for current date and time.
#------------------------------------------------------------------------------
sub setDateAndTime()
{
   $Datestamp = strftime("\%Y-\%m-\%d",localtime);
   $Timestamp = strftime("\%H:\%M:\%S",localtime);
}
#------------------------------------------------------------------------------
# Function setCacheFileCount()
#
# Sets the total number of files currently in the cache.
#------------------------------------------------------------------------------
sub setCacheFileCount()
{
    my $Count = `find $TopDir -type f | wc -l`;
    chomp $Count;
    $Filecount = int $Count;
}

#------------------------------------------------------------------------------
# Parse command line.
#------------------------------------------------------------------------------
Getopt::Long::config "no_ignore_case";
Getopt::Long::config "auto_abbrev";

GetOptions(\%main::CmdLine, "help", "noop", "savenom", "dir=s", "tlow=s", "thigh=s")
   or die "\nUsage Error:  Unrecognized argument.\n";

# Validate command line arguments.
usage() if $main::CmdLine{'help'};

#------------------------------------------------------------------------------
# Main Program.
#------------------------------------------------------------------------------
setDateAndTime();

my ($name, $dir, $ext) = fileparse($0, '\..*');
my $logfile;
my $nomfile;

if ($ENV{LOGS}) {
   $logfile = "$ENV{LOGS}/$name.log";
   $nomfile = "$ENV{LOGS}/$name.nomlist";
} else {
   $logfile = "$name.log";
   $nomfile = "$name.nomlist";
}

open (LOG, ">>$logfile");

print LOG "==============================================================================\n";
print LOG "Started $main::ThisScript v$main::Version at $Datestamp $Timestamp\n";

# The '-d <TopDir>' argument is required unless P4PCACHE is defined in the
# shell environment.

my $HowSpecified = '';
if ($main::CmdLine{'dir'}) {
   $HowSpecified = 'by command argument';
   $TopDir = $main::CmdLine{'dir'};
} elsif ($ENV{P4PCACHE}) {
   $HowSpecified = 'by P4PCACHE environment variable';
   $TopDir = $ENV{P4PCACHE};
} else {
   close (LOG);
   usage ();
}

# $TopDir must end with '/' or the find function (system or perl) doesn't operate on the cache.
# It must also exist as a directory for % threshold calculations to work.

$TopDir .= '/' unless $TopDir =~ m!\/$!;
print LOG "Cache root directory specified $HowSpecified established as $TopDir\n";

unless ($TopDir =~ m!^\/! ) {
   print LOG "\nError: The cache directory [$TopDir] is relative - does not start with '/'. Aborting.\n";
   close (LOG);
   exit 1;
}

unless (-e $TopDir && -d $TopDir ) {
   print LOG "\nError: The cache directory [$TopDir] does not exist or is not a directory. Aborting.\n";
   close (LOG);
   exit 1;
}

# prime total size and free space as they are needed for threshold specification evaluation.

($TotalSize, $Freespace) = getDriveSpace($TopDir);

$ThresholdLow = $main::CmdLine{'tlow'} if ($main::CmdLine{'tlow'});
$ThresholdLow = getThreshold( $ThresholdLow, 'tlow' );
$ThresholdHigh = $main::CmdLine{'thigh'} if ($main::CmdLine{'thigh'});
$ThresholdHigh = getThreshold( $ThresholdHigh, 'thigh' );
if( $ThresholdHigh != 0 && $ThresholdHigh < $ThresholdLow ) {
  print LOG "thigh $ThresholdHigh must be more than tlow $ThresholdLow\n";
  $ThresholdHigh = 0;
}
if( $ThresholdLow == 0 || $ThresholdHigh == 0 ) {
  print LOG "\nThreshold parameter errors. Aborting.\n";
  close(LOG);
  exit 1;
}

# If a nomination file exists, assume a version of this script is currently running and exit.

if( -e $nomfile ) {
  print LOG "\nNomination file $nomfile exists. Run conflict. Aborting.\n";
  close(LOG);
  exit 1;
}

# $TopDir exists and it's a directory.

        setCacheFileCount();
        setDateAndTime();  # to account for cache file count time
        print LOG "$Datestamp $Timestamp\nCache files = $Filecount\nFree Space = $Freespace kb\nLow space threshold = $ThresholdLow kb\n";
        #  compare $Freespace to $ThresholdLow
        if ( $Freespace < $ThresholdLow ) {
               #  while $Freespace is less than $ThresholdHigh
               #  Find oldest files based on "Last accessed"
               #  NOTES: * System tools are significantly faster and more efficient than the equivalent perl code.
               #         * In an attempt to reduce resource overhead, early versions of this tool used the -atime
               #           find command option to reduce the number of entries that are identified and sorted.
               #           Independent testing indicates relatively limited elapsed time and resource differences
               #           for a wide range of -atime values. It also turns out that -atime is good for steady state,
               #           but problematic for turnover scenarios that occur in short periods of time such as initial
               #           cache priming and dramatic changes in cache content such as new projects.
               print LOG "Delete required. Generating nomination list: $nomfile. Operational overlap is now blocked.\n";
               `find $TopDir -type f -printf "%A+ %s %k %p\n" | sort 2>&1 >$nomfile`;
               print LOG "Target Free Space = $ThresholdHigh kb. Selecting from nomination list.\n";
               open (NOM, "<$nomfile" );
               while (<NOM>) {
                      last if $Freespace > $ThresholdHigh;
                      $NomInfo = $_;
                      last unless $NomInfo;
                      my ($size, $path) = (0, '');
                      ($size, $path) = ($1, $2) if $NomInfo =~ m!^\S+ \d+ (\d+) (.+)$!;
                      next if $path eq '';
                      print LOG "$NomInfo";
                      $Freespace += $size;
                      push(@RmList, $path);
               }
               print LOG "\n*** WARNING - nomination list exhausted after " . scalar @RmList . " entries ***\n"  if $Freespace < $ThresholdHigh && scalar @RmList > 0;
               print LOG "\n*** WARNING - empty nomination list ***\n"  if scalar @RmList == 0;
               close(NOM);
               print LOG "$Freespace kb of desired $ThresholdHigh kb will be available.\n";

               #  if @RmList exists, delete it + log
               if ( @RmList ) {

                      # Delete files to free space.
                      if ($main::CmdLine{'noop'} || $main::CmdLine{'savenom'})
                      {
                         print LOG "NO-OP: The following files would have been deleted:\n";
                         foreach $RmFile (@RmList) {
                            print LOG "$RmFile\n";
                         }
                      } else
                      {
                         setDateAndTime();  # to account for nomination and selection time
                         $Freespace = int $Freespace;   # Ignore fractional kb
                         print LOG "$Datestamp $Timestamp\nEstimated Free Space = $Freespace kb after Files delete:\n";
                         foreach $RmFile (@RmList) {
                            unlink $RmFile or print LOG "ERROR: File not deleted:\n$RmFile\n";
                         }
                         setCacheFileCount();
                         print LOG "Cache files after delete = $Filecount\n";
                      }
               }
               # This delete used to occur prior to the space recovery file deletes. This created
               # an operational overlap window during the file deletes. An observed overlap demonstrated
               # that the overlaping run was including files nominated for delete during the original
               # run. This resulted in smaller than expected space recovery during the overlapping run.
               # Which is not the desired result.
               print LOG "Remove nomination file $nomfile. Operational overlap is now appropriate.\n";
               unlink $nomfile unless $main::CmdLine{'savenom'};
        } else {
               #  log: There is enough free space in $TopDir until next time
               my $Available = $Freespace - $ThresholdLow;
               print LOG "$Available kb currently available\n";
               print LOG "No files need to be deleted.\n";
        }

#  Stop logging.
setDateAndTime();
print LOG "\nCompleted at $Datestamp $Timestamp.\n";
close(LOG);
exit 0;
