#!/bin/bash
#==============================================================================
# Copyright and license info is available in the LICENSE file included with
# the Server Deployment Package (SDP), and also available online:
# https://swarm.workshop.perforce.com/projects/perforce-software-sdp/view/main/LICENSE
#------------------------------------------------------------------------------

# Treat expansion of an unset variable as an error for everything defined
# in (and every script that sources) this file.
set -u

# Common functions used in all backup scripts.

#------------------------------------------------------------------------------
# Verify key variables in the shell environment exist, or else abort.
#
# If checks in this function fail, this function does an 'echo' and 'exit 1'
# rather than calling 'log' or 'die', as this function is generally called
# early in processing, before the log is initialized.
#
# Input:  none (the named variables are read from the shell environment).
# Output: one "Error: ..." line on stdout per variable that is unset or empty.
# Exit:   calls 'exit 1' if any required variable is missing.
#------------------------------------------------------------------------------
check_vars () {
   local CheckVarsPreflightOK=1
   local -a CommonVars=(SDP_INSTANCE P4HOME P4PORT P4ROOT P4JOURNAL P4BIN P4DBIN P4TICKETS P4TRUST KEEPCKPS KEEPJNLS KEEPLOGS CHECKPOINTS LOGS OSUSER)
   local -a InstanceVars=(P4MASTER_ID P4MASTERPORT)
   local var=

   # First, check vars that should be set in /p4/common/bin/p4_vars.
   for var in "${CommonVars[@]}"; do
      # Indirect expansion with ':-' detects unset/empty variables without
      # tripping 'set -u', and (unlike 'eval echo') never misreads values
      # such as '-n' or '-e' as echo options.
      if [[ -z "${!var:-}" ]]; then
         echo "Error: Required variable \$$var is NOT set. It should be set in /p4/common/bin/p4_vars."
         CheckVarsPreflightOK=0
      fi
   done

   # Next, check vars that should be set in the per-instance config file.
   # For some variables, provide additional details that help users correct
   # the problem.
   for var in "${InstanceVars[@]}"; do
      if [[ -z "${!var:-}" ]]; then
         echo "Error: Required variable \$$var is NOT set. It should be set in /p4/common/config/p4_N.vars, where N is the SDP instance name."

         if [[ "$var" == "P4MASTER_ID" ]]; then
            echo "The value for P4MASTER_ID should be the name of the ServerID of the master server."
         fi

         CheckVarsPreflightOK=0
      fi
   done

   if [[ "$CheckVarsPreflightOK" -eq 0 ]]; then
      echo "Use p4master_run or source p4_vars before calling this script."
      echo "Aborting due to errors in shell environment preflight checks."
      exit 1
   fi
}

#------------------------------------------------------------------------------
# is_edge ($ServerID, $RootDir)
#
# Determine if a given ServerID is an edge server or not, checking a given
# database root dir (e.g. $P4ROOT or $OFFLINE_DB).
#
# Input:
# $1 - ServerID (required)
# $2 - RootDir (optional, defaults to $P4ROOT)
#
# Output YES if an edge server, NO otherwise.  NO is also reported when no
# db.server record matches the ServerID, or when the extracted Services field
# is not a plain decimal number.
#
# Uses $P4DBIN, $GREP and $CUT from the environment.
#------------------------------------------------------------------------------
is_edge () {
   local ServerID="${1:-Unset}"
   local RootDir="${2:-$P4ROOT}"
   local ServicesData=
   local NumberRegex='^[0-9]+$'

   # Extract a slice of db.server referencing the given ServerID, and then
   # grab the field containing Services data.  The ServerID is matched as a
   # fixed string (-F) so characters such as '.' are not treated as regex.
   ServicesData=$("$P4DBIN" -r "$RootDir" -J off -L /dev/null -k db.server -jd - 2>&1 |\
      "$GREP" -F "@db.server@ @${ServerID}@" | "$CUT" -d '@' -f 13 | tr -d ' ')

   # Only do arithmetic on a value known to be numeric; anything else (no
   # match, error text, multiple lines) is reported as "not an edge".
   # Bit 4096 of the Services value is the one tested for edge servers;
   # '10#' forces decimal interpretation.
   if [[ "$ServicesData" =~ $NumberRegex ]] && (( (10#$ServicesData & 4096) != 0 )); then
      echo YES
   else
      echo NO
   fi
}

#------------------------------------------------------------------------------
# is_standby ($ServerID, $RootDir)
#
# Determine if a given ServerID is a standby server or not, checking a given
# database root dir (e.g. $P4ROOT or $OFFLINE_DB).
#
# Input:
# $1 - ServerID (required)
# $2 - RootDir (optional, defaults to $P4ROOT)
#
# Output YES if a standby server, NO otherwise.
#
# Uses $P4DBIN, $GREP and $CUT from the environment.
#------------------------------------------------------------------------------
is_standby () {
   local ServerID="${1:-Unset}"
   local RootDir="${2:-$P4ROOT}"
   local Services=
   local Answer=NO

   # Dump db.server, keep the record for this ServerID, and pull out the
   # Services field (13th '@'-delimited field) with blanks removed.
   Services=$("$P4DBIN" -r "$RootDir" -J off -L /dev/null -k db.server -jd - 2>&1 |\
      "$GREP" "@db.server@ @${ServerID}@" | "$CUT" -d '@' -f 13 | tr -d ' ')

   # A standby is recognized by one exact Services value; an empty result
   # (no matching record) means "not a standby".
   if [[ -n "$Services" && "$Services" -eq '35141' ]]; then
      Answer=YES
   fi

   echo "$Answer"
}

#------------------------------------------------------------------------------
# Set variables for use in various scripts:
# RC=path to the init scripts
# OFFLINE_DB=path to offline db directory
# CKPTMPDIR=path to the temporary dir under the checkpoints directory
# EDGESERVER=1 if this is an edge server, 0 otherwise (exported).
# STANDBYSERVER=1 if this is a standby server, 0 otherwise (exported).
# SDP_ADMIN_PASSWORD_FILE=existing value if set, else the SDP standard path
#    (exported).
#
# This must be called after loading the standard shell environment by
# doing:
# source /p4/common/bin/p4_vars N
#
# This sets P4HOME, SERVERID, etc. needed by this function.
#------------------------------------------------------------------------------
set_vars () {
   # SERVERID may legitimately be unset or empty; read it through ':-' so that
   # 'set -u' does not abort and the "no ServerID" defaults below apply.
   local ServerID="${SERVERID:-}"

   RC="$P4HOME/bin/p4d_${SDP_INSTANCE}_init"
   OFFLINE_DB="${P4HOME}/offline_db"
   CKPTMPDIR="${CHECKPOINTS}/ckp_tmp"

   # Without a ServerID the server is treated as neither edge nor standby.
   export EDGESERVER=0
   export STANDBYSERVER=0

   if [[ -n "$ServerID" ]]; then
      if [[ "$(is_edge "$ServerID")" == YES ]]; then
         EDGESERVER=1
      fi
      if [[ "$(is_standby "$ServerID")" == YES ]]; then
         STANDBYSERVER=1
      fi
   fi

   # Ensure that SDP_ADMIN_PASSWORD_FILE is set, using existing value if set (e.g.
   # in p4_vars), otherwise set it to the SDP standard value.
   export SDP_ADMIN_PASSWORD_FILE="${SDP_ADMIN_PASSWORD_FILE:-Unset}"
   if [[ "$SDP_ADMIN_PASSWORD_FILE" == Unset ]]; then
      export SDP_ADMIN_PASSWORD_FILE="$P4CCFG/.p4passwd.${P4SERVER}.admin"
   fi
}

#------------------------------------------------------------------------------
# Check if user is running as required OS user.
#
# Compares the name reported by 'id -un' with $OSUSER and calls die() when
# they differ.
#------------------------------------------------------------------------------
check_uid () {
   # Keep the lookup result local so it does not leak into (or clobber a
   # variable named 'user' in) the calling script.
   local user=
   user=$(id -un)
   if [[ "${user}" != "${OSUSER}" ]]; then
      die "Must be run by user: ${OSUSER}. Abort!"
   fi
}

#------------------------------------------------------------------------------
# Function log() - echo message to logfile or stdout.
#
# If $LOGFILE is defined, append the message to the log file only; nothing
# goes to stdout.  Each entry is prefixed with the output of 'date' and the
# script name ($0).
# If $LOGFILE isn't defined, just echo to stdout, without the datestamp or
# script name.
# In all cases, support '-e' formatting (backslash escapes are interpreted).
# Input:
# $1 - message to log (must be quoted).
#------------------------------------------------------------------------------
log () {
   local Stamp=

   # No log file configured: plain echo to stdout and done.
   if [[ "${LOGFILE:-Unset}" == Unset ]]; then
      echo -e "$@"
      return
   fi

   # Datestamp first (no newline), then the script name and message.
   Stamp=$(date)
   echo -n "$Stamp" >> "$LOGFILE" 2>&1
   echo -e " $0: $*" >> "$LOGFILE" 2>&1
}

#------------------------------------------------------------------------------
# Decide, depending on our mail utility, how to specify sender (if we need to).
# Mail on some platforms sets sender by default.
#
# Runs '$SDPMAIL -V' and inspects the output:
# * If it mentions "GNU Mailutils", use the '-aFrom:<sender>' form.
# * If it starts with what looks like a version identifier (digits, a dot,
#   a digit), use the '-S from=<sender>' form.
# * Otherwise, don't set a mail sender option.
#
# Output: the sender option on stdout; an empty line if $MAILFROM is empty or
# no option applies.
#------------------------------------------------------------------------------
get_mail_sender_opt () {
   local SenderOpt=
   local MailVersion=
   local VersionRegex='^[0-9]+\.[0-9]'

   # No sender configured: nothing to work out.
   if [[ -z "$MAILFROM" ]]; then
      echo "$SenderOpt"
      return
   fi

   MailVersion=$($SDPMAIL -V 2>&1)

   if [[ "$MailVersion" == *"GNU Mailutils"* ]]; then
      SenderOpt="-aFrom:$MAILFROM"
   elif [[ "$MailVersion" =~ $VersionRegex ]]; then
      SenderOpt="-S from=$MAILFROM"
   fi

   echo "$SenderOpt"
}

#------------------------------------------------------------------------------
# Email the log file named by $LOGFILE.
#
# Input:
# $1 - subject line for the email.
#
# Sends the contents of $LOGFILE to $MAILTO using $SDPMAIL, adding the sender
# option reported by get_mail_sender_opt() when there is one.
#------------------------------------------------------------------------------
mail_log_file () {
   local subject=$1
   local mail_sender_opt=
   local -a sender_args=()

   mail_sender_opt=$(get_mail_sender_opt)

   # Turn the option string into proper argument words.  Passing it as one
   # quoted word would hand the mail utility an empty argument when there is
   # no sender option, and would glue '-S' to its value when there is one.
   case "$mail_sender_opt" in
      '')
         ;;
      '-S '*)
         sender_args=(-S "${mail_sender_opt#-S }")
         ;;
      *)
         sender_args=("$mail_sender_opt")
         ;;
   esac

   # The '+' expansion keeps an empty array safe under 'set -u' on older bash.
   "$SDPMAIL" -s "$subject" ${sender_args[@]+"${sender_args[@]}"} "$MAILTO" < "$LOGFILE"
}

#------------------------------------------------------------------------------
# Function die() - log message, send email, and exit.
# If $LOGFILE is defined, write message to the log file, email log,
# and exit.
# If $LOGFILE is not defined, write message to the stdout, and skip
# email.
# If in terminal session, display message to stderr as well.
#
# Always removes the $LOGS/ckp_running.txt semaphore and exits with status 1.
#------------------------------------------------------------------------------
die () {
   # One message serves as both the log entry and the mail subject (a more
   # helpful subject line than cron would produce).
   local Message="ERROR!!! - $HOSTNAME $P4SERVER $0: $*"

   log "$Message"

   # The log can only be mailed when there is a log file.
   if [[ "${LOGFILE:-Unset}" != Unset ]]; then
      mail_log_file "$Message"
   fi

   # If running from a terminal, also send the raw message to stderr.
   if tty >/dev/null; then
      echo -e "$@" >&2
   fi

   rm -f "${LOGS}/ckp_running.txt"
   exit 1
}

#------------------------------------------------------------------------------
# Convert various size values (bytes, K, M, G, T, %) to bytes.
# Pass in values such as 4096, 1024K, 512M, 1G, 2T or 10%.
#
# Input:
# $1 - size value: a whole number optionally followed by one unit character.
#      Units K, M, G and T are accepted in upper or lower case and are
#      powers of 1024.  No unit means the number is already in bytes.
# $2 - total size in bytes; only used (and required) for '%' values.
#
# Output: the size in bytes on stdout.  Nothing is printed, and the return
# status is still 0, if the value is malformed, the unit is not recognized,
# or a '%' value is given without a numeric total.
#------------------------------------------------------------------------------
convert_to_bytes () {
   local value=${1:-}
   local totalsize=${2:-Undefined}
   local size=
   local unit=
   local ValueRegex='^([0-9]+)([[:alpha:]%]?)$'
   local NumberRegex='^[0-9]+$'

   # Break up value into size (numeric) and unit (K,M,G,T,% or nothing).
   [[ "$value" =~ $ValueRegex ]] || return 0

   # '10#' forces decimal so a leading zero is not read as octal.
   size=$((10#${BASH_REMATCH[1]}))
   unit=${BASH_REMATCH[2]}

   # Based on unit, convert to bytes.
   case "$unit" in
      '')
         echo "$size"
         ;;
      K|k)
         echo $((size * 1024))
         ;;
      M|m)
         echo $((size * 1024**2))
         ;;
      G|g)
         echo $((size * 1024**3))
         ;;
      T|t)
         echo $((size * 1024**4))
         ;;
      %)
         # A percentage is meaningless without a numeric total.
         if [[ "$totalsize" =~ $NumberRegex ]]; then
            echo $((10#$totalsize * size / 100))
         fi
         ;;
   esac
}

#------------------------------------------------------------------------------
# Write a semaphore file, $LOGS/ckp_running.txt.  This file is written at
# the start of processing, and removed upon successful completion.  It
# prevents multiple concurrent operations from being launched accidentally
# e.g. by multiple human admins, or a human inadvertently competing with a
# cron job.
#
# It is also intended to get human admins to determine the root cause of
# checkpoint failures.
#
# If the semaphore already exists, die() is called instead.
# NOTE(review): die() as defined in this file removes ckp_running.txt before
# exiting, so the semaphore that triggered the abort is cleared by it --
# confirm that is intended.
#------------------------------------------------------------------------------
ckp_running() {
   local Semaphore="${LOGS}/ckp_running.txt"

   # An existing semaphore means a previous run did not finish cleanly (or
   # another run is in progress).
   [[ ! -f "$Semaphore" ]] || \
      die "Last checkpoint not complete. Check the backup process or contact support."

   echo "Checkpoint running." > "$Semaphore"
}

#------------------------------------------------------------------------------
# Remove the $LOGS/ckp_running.txt semaphore file when checkpoint processing
# is complete.  It is not an error if the file is already gone.
#------------------------------------------------------------------------------
ckp_complete() {
   local Semaphore="${LOGS}/ckp_running.txt"

   rm -f "$Semaphore"
}

#------------------------------------------------------------------------------
# Ensure key directories are writable. Abort if they are not.
#
# Checks $OFFLINE_DB, $CHECKPOINTS and $LOGS, plus the edge-specific
# checkpoints directory (${CHECKPOINTS}.<ServerID without 'p4d_' prefix>)
# when $EDGESERVER is 1.  Every failing directory is logged before die() is
# called once at the end.
#------------------------------------------------------------------------------
check_dirs () {
   local -i dirsOK=1
   # An array keeps each path intact even if it contains whitespace.
   local -a dirList=("$OFFLINE_DB" "$CHECKPOINTS" "$LOGS")
   local dir=

   if [[ "$EDGESERVER" -eq 1 ]]; then
      dirList+=("${CHECKPOINTS}.${SERVERID#p4d_}")
   fi

   # Check that key dirs exist and are writable.
   for dir in "${dirList[@]}"; do
      if [[ ! -d "$dir" || ! -w "$dir" ]]; then
         log "Error: Dir $dir does not exist or is not writable."
         dirsOK=0
      fi
   done

   [[ "$dirsOK" -eq 1 ]] || die "Some expected dirs are missing or not writable. Aborting."
}

#------------------------------------------------------------------------------
# Add the output of the 'p4 diskspace' command to the log file.
#
# Runs "$P4BIN diskspace" and appends both its stdout and stderr to $LOGFILE.
#------------------------------------------------------------------------------
check_disk_space () {
   log "Checking disk space..."
   # P4BIN is quoted, as everywhere else in this file, so a path containing
   # whitespace is still run as a single command.
   "$P4BIN" diskspace >> "$LOGFILE" 2>&1
}

#------------------------------------------------------------------------------
# Check value of journal; ensure it is an integer.
#
# Input:
# $1 - journal counter value to validate.
#
# Calls die() unless the value consists solely of decimal digits.  A missing
# or empty argument is reported as 'Unset'.
#------------------------------------------------------------------------------
check_journalnum () {
   local JNLNUM=${1:-Unset}
   # Local, so the pattern does not leak into the calling script's namespace.
   local re='^[0-9]+$'

   if ! [[ $JNLNUM =~ $re ]] ; then
      die "The journal counter value [$JNLNUM] is invalid. It must be numeric."
   fi
}

#------------------------------------------------------------------------------
# Check the checkpoints directory for the oldest checkpoint.
#
# Looks in $CHECKPOINTS, or in ${CHECKPOINTS}.<ServerID without 'p4d_' prefix>
# when $EDGESERVER is not 0, for regular files named like '*.ckp.*' (ignoring
# any with 'md5' in the name) and picks the one with the oldest modification
# time.
#
# Sets the global OLDESTCHECKPOINT to that file's checkpoint number, i.e. the
# text after '.ckp.' with a trailing '.gz' removed.  It is set to an empty
# string if no checkpoint file is found.
#------------------------------------------------------------------------------
get_ckpnum () {
   local CheckpointsDir=
   local Candidate=
   local Oldest=
   local Name=

   if [[ "$EDGESERVER" -eq 0 ]]; then
      CheckpointsDir="${CHECKPOINTS}"
   else
      CheckpointsDir="${CHECKPOINTS}.${SERVERID#p4d_}"
   fi

   # Glob for checkpoint files rather than grepping 'ls' output for "ckp",
   # which would also pick up the 'ckp_tmp' directory.
   for Candidate in "$CheckpointsDir"/*.ckp.*; do
      # Skips directories and the literal pattern left by an unmatched glob.
      [[ -f "$Candidate" ]] || continue
      Name="${Candidate##*/}"
      [[ "$Name" == *md5* ]] && continue
      if [[ -z "$Oldest" || "$Candidate" -ot "$Oldest" ]]; then
         Oldest="$Candidate"
      fi
   done

   # Reduce e.g. 'p4_1.ckp.123.gz' to '123'.
   Name="${Oldest##*/}"
   Name="${Name#*.ckp.}"
   # shellcheck disable=SC2034
   OLDESTCHECKPOINT="${Name%.gz}"
}

#------------------------------------------------------------------------------
# Determine journal counter by checking counter in db.counters.
#
# Sets two globals:
# JOURNALNUM    - the 'journal' counter read from $P4ROOT/db.counters, or 0
#                 when db.counters is missing/unreadable or has no such
#                 counter; reduced by 1 on an edge or standby server.
# CHECKPOINTNUM - JOURNALNUM + 1.
#
# The counter value is validated with check_journalnum().
#------------------------------------------------------------------------------
get_journalnum () {
   local Counter=
   # Default covers both special cases of a new/empty data set: db.counters
   # does not exist, or it exists but holds no journal counter.
   local JournalCounter=0

   # Get the current journal counter from the live databases.
   if [[ -r "$P4ROOT/db.counters" ]]; then
      Counter=$("$P4DBIN" -r "$P4ROOT" -k db.counters -jd - 2>&1 | grep @journal@ | cut -d '@' -f 8)

      if [[ -n "$Counter" ]]; then
         check_journalnum "$Counter"
         JournalCounter="$Counter"
      fi
   fi

   # If we are on an edge server, the journal has already rotated, so we have to decrement the value
   # so that we replay the correct journal file and create the correct checkpoint number on the
   # edge server.
   #
   # In the case of a standby server, the journal rotation occurs on the master server,
   # so we don't need to increment the journal number again, so we decrement by 1.
   # Also, when replaying the journals to the offline db, we don't want to play to the live journal
   # because it is still being replicated.
   if [[ "$EDGESERVER" -eq 1 || "$STANDBYSERVER" -eq 1 ]]; then
      JournalCounter=$((JournalCounter - 1))
   fi

   JOURNALNUM="$JournalCounter"
   CHECKPOINTNUM=$((JOURNALNUM + 1))
}

#------------------------------------------------------------------------------
# Determine journal space usage and minimum disk space requirement.
#
# Sets three globals from the output of "$P4BIN":
# P4JOURNALMIN   - value of the filesys.P4JOURNAL.min configurable.
# P4JOURNALFREE  - free bytes reported by 'diskspace P4JOURNAL'.
# P4JOURNALTOTAL - total bytes reported by 'diskspace P4JOURNAL'.
#
# Uses $AWK and $CUT from the environment.
#------------------------------------------------------------------------------
get_journal_stats () {
   # Get minimum disk space required on server journal filesystem before server rejects commands.
   # This will return the configured and default value, but grab the configured value which shows first.
   # If a configured value is not present, it will use the default value.
   # The pipeline keeps the first word of each line, takes what follows '=',
   # and keeps only the first line.  CUT is quoted like the other tool
   # variables in this file.
   # shellcheck disable=SC2034 disable=SC2016
   P4JOURNALMIN=$("$P4BIN" configure show filesys.P4JOURNAL.min | "$AWK" '{ print $1 }' | "$CUT" -d'=' -f2 | head -1)
   # Get current journal free disk space
   # shellcheck disable=SC2034
   P4JOURNALFREE=$("$P4BIN" -ztag -F "%freeBytes%" diskspace P4JOURNAL)
   # Get total available disk space for journal
   # shellcheck disable=SC2034
   P4JOURNALTOTAL=$("$P4BIN" -ztag -F "%totalBytes%" diskspace P4JOURNAL)
}

#------------------------------------------------------------------------------
# Verify that the offline databases are usable by checking the existence
# of an 'offline_db_usable.txt' file that is written only when databases
# are in a known-good state, following successful recovery from a checkpoint.
#
# Also requires $OFFLINE_DB/db.counters to exist.  Calls die() if either
# file is missing.
#------------------------------------------------------------------------------
check_offline_db_usable () {
   # The marker file says the last recovery into offline_db completed.
   [[ -f "$OFFLINE_DB/offline_db_usable.txt" ]] || \
      die "Offline database not in a usable state. Check the backup process."

   # The databases themselves must actually be there.
   [[ -f "$OFFLINE_DB/db.counters" ]] || \
      die "Offline database not found. Consider creating it with live_checkpoint.sh. Be aware that it locks the live system and can take a long time! Abort!"
}

#------------------------------------------------------------------------------
# Determine journal counter in offline databases.
#
# Verifies the offline databases are usable, then reads the 'journal' counter
# from $OFFLINE_DB/db.counters.
#
# Sets the global OFFLINEJNLNUM and logs its value.  Calls die() if no counter
# value can be read; the value is also validated with check_journalnum().
# Anything p4d writes to stderr is appended to $LOGFILE.
#------------------------------------------------------------------------------
get_offline_journal_num () {
   # Get the journal number of the offline database
   check_offline_db_usable

   OFFLINEJNLNUM=$("$P4DBIN" -r "$OFFLINE_DB" -jd - db.counters 2>> "$LOGFILE" | grep '@journal@' | cut -d "@" -f 8)

   # The exit status of the pipeline above is that of 'cut', which succeeds
   # even when p4d fails or no journal counter is found, so test the result
   # itself rather than the status.
   [[ -n "$OFFLINEJNLNUM" ]] || die "Cannot get the offline journal number. Abort!"

   check_journalnum "$OFFLINEJNLNUM"
   log "Offline journal number is: $OFFLINEJNLNUM"
}

#------------------------------------------------------------------------------
# Helper for remove_old_checkpoints_and_journals():
# delete all but the newest N of the given files, logging each removal.
#
# Input:
# $1 - number of most recently modified files to keep.
# $2.. - candidate files (typically the expansion of a glob; an unmatched
#        glob pattern is harmless, as 'ls' errors are discarded).
#------------------------------------------------------------------------------
_remove_all_but_newest () {
   local KeepCount=$1
   shift
   local StaleFile=

   # 'ls -t' orders newest first; awk passes through everything after the
   # first KeepCount lines.  Reading line by line keeps paths that contain
   # spaces intact.
   # shellcheck disable=SC2012
   ls -1t -- "$@" 2>/dev/null | "$AWK" -v keep="$KeepCount" 'NR > keep' |
      while IFS= read -r StaleFile; do
         log "rm -f $StaleFile"
         rm -f -- "$StaleFile"
      done
}

#------------------------------------------------------------------------------
# Remove old checkpoint and journal files.
#
# Does nothing (other than log) if $KEEPCKPS is 0.  Otherwise, keeps the
# newest $KEEPCKPS checkpoints (with their md5 files) and the newest $KEEPJNLS
# rotated journals, both in the checkpoints directory used by this server and
# in $P4HOME/journals.rep if that directory exists, and removes the rest.
#------------------------------------------------------------------------------
remove_old_checkpoints_and_journals () {
   local CheckpointsDir=
   local StandbyReplicaJournalsDir=
   local FilePrefix=
   local Dir=

   if [[ "$KEEPCKPS" -eq 0 ]]; then
      log "Skipping cleanup of old checkpoints because KEEPCKPS is set to 0."
      return
   fi

   log "Deleting obsolete checkpoints and journals. Keeping latest $KEEPCKPS  per KEEPCKPS setting in p4_vars."

   if [[ "$EDGESERVER" -eq 0 ]]; then
      # Refer to ckp/jnl files starting like: /p4/N/checkpoints/p4_N
      CheckpointsDir="${CHECKPOINTS}"
      FilePrefix="${P4SERVER}"
   else
      # Refer to ckp/jnl files starting like (example ServerID=p4d_edge_nyc):
      # /p4/N/checkpoints.edge_nyc/p4_N.edge_nyc
      CheckpointsDir="${CHECKPOINTS}.${SERVERID#p4d_}"
      FilePrefix="${P4SERVER}.${SERVERID#p4d_}"
   fi

   StandbyReplicaJournalsDir="${P4HOME}/journals.rep"

   for Dir in "$CheckpointsDir" "$StandbyReplicaJournalsDir"; do
      [[ -d "$Dir" ]] || continue

      # Remove selected checkpoint files based on the KEEPCKPS setting
      # regardless of whether compressed or not.  KEEPCKPS is multiplied by 2
      # because each checkpoint has a companion md5 file.
      _remove_all_but_newest "$((KEEPCKPS * 2))" "${Dir}/${FilePrefix}".ckp.*

      # Use KEEPJNLS to allow for separate journal rotation at a higher
      # frequency.
      _remove_all_but_newest "$KEEPJNLS" "${Dir}/${FilePrefix}".jnl.*
   done
}

#------------------------------------------------------------------------------
# Shutdown p4d using the standard init script. Log the shutdown activity.
#
# Runs "$RC stop", appending its output to $LOGFILE.  The exit status of the
# init script is not checked.
#------------------------------------------------------------------------------
stop_p4d () {
   log "Shutting down the p4 server"
   # RC is quoted, as in start_p4d(), so the init script path is always run
   # as a single command word.
   "$RC" stop >> "$LOGFILE" 2>&1
   log "p4 stop finished -- p4 is down now."
}

#------------------------------------------------------------------------------
# Start p4d using the standard init script. Log the startup activity.
#
# Runs "$RC start" (output appended to $LOGFILE), waits 3 seconds, then probes
# the server with 'p4 info -s' as $P4USER against $P4PORT.
#
# Return status indicates whether the server started successfully or not:
# 0 if the probe succeeded, 1 otherwise.
#------------------------------------------------------------------------------
start_p4d () {
   local ServerUp=0

   log "Starting the p4 server"
   "$RC" start >> "$LOGFILE" 2>&1

   # Give it a few seconds to start up before probing.
   sleep 3

   # A successful 'info' confirms that the server is answering.
   "$P4BIN" -u "$P4USER" -p "$P4PORT" info -s >/dev/null 2>&1 && ServerUp=1

   if [[ "$ServerUp" -ne 1 ]]; then
      log "Error: Server does not appear to have started."
      return 1
   fi

   log "Server restarted successfully - p4 should be back up now."
   return 0
}

#------------------------------------------------------------------------------
# Call 'p4 admin journal' against $P4MASTERPORT to rotate the current/active
# journal file on the master server, starting a fresh new P4JOURNAL file.
#
# In a distributed topology with replicas/edge servers, this function must
# be called on the master/commit server.  On an edge server ($EDGESERVER not
# 0) it only logs a warning.
#
# Calls die() if the checkpoint or rotated journal file for the current
# numbers ($CHECKPOINTNUM, $JOURNALNUM) already exists, or if the rotation
# command fails.  After rotating, waits (checking every 5 seconds, with no
# upper limit) until the rotated journal file appears in $CHECKPOINTS.
#------------------------------------------------------------------------------
truncate_journal () {
   local CheckpointFile="${CHECKPOINTS}/${P4SERVER}.ckp.${CHECKPOINTNUM}.gz"
   local JournalFile="${CHECKPOINTS}/${P4SERVER}.jnl.${JOURNALNUM}"

   if [[ "$EDGESERVER" -ne 0 ]]; then
      log "Warning: truncate_journal () function should not be called on an edge server. Ignoring."
      return
   fi

   if [[ -f "$CheckpointFile" ]]; then
      die "Checkpoint $CheckpointFile already exists, check the backup process."
   fi
   if [[ -f "$JournalFile" ]]; then
      die "Journal $JournalFile already exists, check the backup process."
   fi

   log "Truncating journal..."
   # During journal rotation, either by a front-door 'p4 admin journal' or a
   # back-door 'p4d -jj', p4d does a copy-then-delete rather than an mv at
   # the OS level.  During rotation, the perforce server will pause
   # responses to clients (just as with a checkpoint), but this should be
   # for a short period of time even for large data sets, as the journal
   # typically represents a single day of metadata.
   # Curly braces capture output of 'time'.
   "$P4CBIN"/p4login -p "$P4MASTERPORT"
   { time "$P4BIN" -p "$P4MASTERPORT" admin journal; } >> "$LOGFILE" 2>&1 || { die "Journal rotation failed. Abort!"; }

   # Wait until the journal file exists in the checkpoints directory before
   # proceeding.  The first check happens after an initial 5 second pause.
   # No loop flag is used, so nothing leaks into the caller's namespace
   # (the previous flag was a global named 'test').
   while true; do
      sleep 5
      [[ -f "$JournalFile" ]] && break
   done

   "$P4CBIN"/p4login
}

#------------------------------------------------------------------------------
# Call 'p4 admin journal' against $P4MASTERPORT to rotate the current/active
# journal file on the master server from an edge or standby server, starting
# a fresh new P4JOURNAL file.
#
# In a distributed topology with edge and standby servers, this function can be
# used to trigger a journal rotation on master/commit server. It's not meant to
# be used from the master server itself; when neither $EDGESERVER nor
# $STANDBYSERVER is 1 it only logs a warning.
#
# Calls die() if the expected rotated journal file already exists locally, or
# if the rotation command fails.  After rotating, waits (checking every 5
# seconds, with no upper limit) until the expected journal file appears.
#------------------------------------------------------------------------------
truncate_journal_on_master () {
   # Increment Edge journal number since the journal will increment on the master after calling journal rotation
   local EdgeJournalNum=$((JOURNALNUM + 1))
   local StandbyJournalNum=$((JOURNALNUM + 2)) # If using journalcopy, have to add 2 since live journal is in checkpoints folder
   local JournalFile=

   # Work out which local file will show that the rotation has arrived here.
   if [[ "$EDGESERVER" -eq 1 ]]; then
      # Refer to ckp/jnl files starting like (example ServerID=p4d_edge_nyc):
      # /p4/N/checkpoints.edge_nyc/p4_N.edge_nyc
      JournalFile="${CHECKPOINTS}.${SERVERID#p4d_}/${P4SERVER}.${SERVERID#p4d_}.jnl.${EdgeJournalNum}"
   elif [[ "$STANDBYSERVER" -eq 1 ]]; then
      JournalFile="${CHECKPOINTS}/${P4SERVER}.jnl.${StandbyJournalNum}"
   fi

   if [[ "$EDGESERVER" -eq 1 || "$STANDBYSERVER" -eq 1 ]]; then
      [[ -f "$JournalFile" ]] && \
         die "Journal $JournalFile already exists, check the backup process."

      log "Truncating journal on ${P4MASTERPORT}."
      # Journal rotation does a copy-then-delete, instead of a simple mv.
      # During rotation the perforce server will hang the responses to clients,
      # this should be for a very short period of time even for large data
      # sets, as the journal represents a single day of metadata.
      # Curly braces capture output of 'time'.
      # Log in to the master first, since the rotation command is sent there.
      "$P4CBIN"/p4login -p "$P4MASTERPORT"
      { time "$P4BIN" -p "$P4MASTERPORT" admin journal; } >> "$LOGFILE" 2>&1 || { die "Journal rotation failed. Abort!"; }
      # The test below waits until the journal file exists in the checkpoints directory before proceeding.
      # The first check happens only after an initial 5 second pause.
      # NOTE(review): 'test' is not declared local, so it is left set in the
      # calling script's namespace.
      test=1
      while [[ $test != 0 ]]; do
         sleep 5
         if [[ -f "$JournalFile" ]]; then
            test=0
         fi
      done
      # NOTE(review): presumably this refreshes the local service user's
      # login -- confirm against the p4login script.
      "$P4CBIN"/p4login -service
   else
      log "Warning: truncate_journal_on_master () function should not be called on a master server. Ignoring."
   fi
}

#------------------------------------------------------------------------------
# Similar to truncate_journal() above, p4d_truncate_journal() is intended to be
# usable from the p4d_base init script, to allow journal rotation on p4d
# start.
#
# Rotates the journal with a direct 'p4d -jj' against $P4ROOT and $P4JOURNAL.
# Does nothing when $EDGESERVER is not 0.  Calls die() if the rotated journal
# file for $JOURNALNUM already exists or if the rotation fails.
#------------------------------------------------------------------------------
p4d_truncate_journal () {
   local RotatedJournal="${CHECKPOINTS}/${P4SERVER}.jnl.${JOURNALNUM}"

   # Nothing to do on an edge server.
   [[ "$EDGESERVER" -eq 0 ]] || return 0

   if [[ -f "$RotatedJournal" ]]; then
      die "Journal $RotatedJournal already exists, check the backup process."
   fi

   log "Rotating journal prior to starting p4d."
   if ! "$P4DBIN" -r "$P4ROOT" -J "$P4JOURNAL" -jj >> "$LOGFILE" 2>&1; then
      die "Failed to rotate journal. Aborting p4d server start."
   fi
}

#------------------------------------------------------------------------------
# Replay any and all numbered journal files into the offline databases.
#
# Replays journals numbered $OFFLINEJNLNUM through $JOURNALNUM (inclusive), in
# order, into $OFFLINE_DB, taking them from the checkpoints directory used by
# this server.  Output of each replay is appended to $LOGFILE.  Calls die()
# on the first replay that fails.
#------------------------------------------------------------------------------
replay_journals_to_offline_db () {
   local CheckpointsDir=
   local FilePrefix=
   local NumberedJournal=
   # Local loop counter, so a caller's variable named 'j' is left alone.
   local j=

   log "Replay any unreplayed journals to the offline database."

   if [[ "$EDGESERVER" -eq 0 ]]; then
      # Refer to ckp/jnl files starting like: /p4/N/checkpoints/p4_N
      CheckpointsDir="${CHECKPOINTS}"
      FilePrefix="${P4SERVER}"
   else
      # Refer to ckp/jnl files starting like (example ServerID=p4d_edge_nyc):
      # /p4/N/checkpoints.edge_nyc/p4_N.edge_nyc
      CheckpointsDir="${CHECKPOINTS}.${SERVERID#p4d_}"
      FilePrefix="${P4SERVER}.${SERVERID#p4d_}"
   fi

   for (( j=OFFLINEJNLNUM; j <= JOURNALNUM; j++ )); do
      NumberedJournal="${CheckpointsDir}/${FilePrefix}.jnl.${j}"
      log "Replay journal $NumberedJournal to offline db."
      # Curly braces capture output of 'time'.
      { time "$P4DBIN" -r "$OFFLINE_DB" -jr -f "$NumberedJournal"; } >> "$LOGFILE" 2>&1 || { die "Offline journal replay failed. Abort!"; }
   done
}

#------------------------------------------------------------------------------
# Replay the live, active P4JOURNAL file into the offline databases.
#
# Output of the replay (including timing) is appended to $LOGFILE.  Calls
# die() if the replay fails.
#------------------------------------------------------------------------------
replay_active_journal_to_offline_db () {
   log "Replay active journal to offline db."

   # The braces put the output of 'time' under the same redirection as the
   # output of p4d itself.
   if ! { time "$P4DBIN" -r "$OFFLINE_DB" -jr -f "${P4JOURNAL}"; } >> "$LOGFILE" 2>&1; then
      die "Active Journal replay failed. Abort!"
   fi
}

#------------------------------------------------------------------------------
# Recreate offline databases from the latest checkpoint.
#
# Finds the most recently modified checkpoint md5 file in the checkpoints
# directory used by this server, derives the checkpoint file name from it,
# removes the existing db.* files (and the 'usable' marker) from $OFFLINE_DB,
# and replays the checkpoint into $OFFLINE_DB.  On success the
# 'offline_db_usable.txt' marker is written.
#
# The checkpoint is located and verified readable BEFORE anything is removed,
# so a missing checkpoint does not cost the existing offline databases.
#
# Calls die() if no checkpoint can be found or the replay fails.
#------------------------------------------------------------------------------
recreate_offline_db_files () {
   local CheckpointsDir=
   local FilePrefix=
   local LastCheckpointMD5=
   local LastCheckpoint=

   if [[ "$EDGESERVER" -eq 0 ]]; then
      # Refer to ckp/jnl files starting like: /p4/N/checkpoints/p4_N
      CheckpointsDir="${CHECKPOINTS}"
      FilePrefix="${P4SERVER}"
   else
      # Refer to ckp/jnl files starting like (example ServerID=p4d_edge_nyc):
      # /p4/N/checkpoints.edge_nyc/p4_N.edge_nyc
      CheckpointsDir="${CHECKPOINTS}.${SERVERID#p4d_}"
      FilePrefix="${P4SERVER}.${SERVERID#p4d_}"
   fi

   # On the master, give a more specific hint when there are no checkpoints.
   if [[ "$SERVERID" == "$P4MASTER_ID" ]]; then
      if [[ -z "$(find "${CHECKPOINTS}/" -maxdepth 1 -type f -name "${P4SERVER}.ckp.*.gz" -printf 1 -quit)" ]]; then
         ckp_complete
         die "No checkpoints found in $CHECKPOINTS.  Consider running 'live_checkpoint.sh $SDP_INSTANCE'."
      fi
   fi

   # Errors from 'ls' (e.g. no md5 files at all) go to the log, not the
   # terminal; the empty result is handled just below.
   # shellcheck disable=SC2012
   LastCheckpointMD5=$(ls -t "${CheckpointsDir}/${FilePrefix}".ckp.*.md5 2>> "$LOGFILE" | head -1)
   [[ -n "$LastCheckpointMD5" ]] || \
      die "Could not find *.md5 file for latest checkpoint. Abort!"

   # Account for the idiosyncrasy that MD5 files for checkpoints may look
   # like p4_N.ckp.gz.md5 or p4_N.ckp.md5.
   if [[ "$LastCheckpointMD5" == *".gz.md5" ]]; then
      LastCheckpoint="${LastCheckpointMD5%.md5}"
   else
      LastCheckpoint="${LastCheckpointMD5%.md5}.gz"
   fi

   [[ -r "$LastCheckpoint" ]] || \
      die "Missing last checkpoint file: $LastCheckpoint. Abort!"

   # Only now remove the old offline databases.  The ':?' guard aborts rather
   # than expanding to '/db.*' should OFFLINE_DB ever be empty.
   # shellcheck disable=SC2129
   rm -f "${OFFLINE_DB:?}"/offline_db_usable.txt >> "$LOGFILE" 2>&1
   rm -f "${OFFLINE_DB:?}"/db.* >> "$LOGFILE" 2>&1
   rm -f "${OFFLINE_DB:?}"/save/db.* >> "$LOGFILE" 2>&1

   log "Recovering from last full checkpoint, $LastCheckpoint."
   # Curly braces capture output of 'time'.
   { time "$P4DBIN" -r "$OFFLINE_DB" -jr -z "${LastCheckpoint}"; } >> "$LOGFILE" 2>&1 || { die "Restore of checkpoint to $OFFLINE_DB failed!"; }
   echo "Offline db file restored successfully." > "${OFFLINE_DB}/offline_db_usable.txt"
}

#------------------------------------------------------------------------------
# Take a live checkpoint from db.* files in P4ROOT.
#
# Runs 'p4d -jc -Z' against $P4ROOT with $CHECKPOINTS/$P4SERVER as the file
# prefix; output (including timing) is appended to $LOGFILE.  Calls die() on
# an edge server ($EDGESERVER not 0) or if the checkpoint fails.
#------------------------------------------------------------------------------
checkpoint () {
   local TargetDir=
   local TargetPrefix=

   log "Create a new checkpoint from live db files in $P4ROOT."

   if [[ "$EDGESERVER" -ne 0 ]]; then
      die "Checkpoints may not be run on an edge server."
   fi

   # Refer to ckp/jnl files starting like: /p4/N/checkpoints/p4_N
   TargetDir="${CHECKPOINTS}"
   TargetPrefix="${P4SERVER}"

   # The braces put the output of 'time' under the same redirection as the
   # output of p4d itself.
   if ! { time "$P4DBIN" -r "$P4ROOT" -jc -Z "${TargetDir}/${TargetPrefix}"; } >> "$LOGFILE" 2>&1; then
      die "ERROR - New checkpoint failed!"
   fi
}

#------------------------------------------------------------------------------
# Take a checkpoint from the ROOTDIR, typically either /p4/N/root or
# /p4/N/offline_db.
#
# Dumps the databases in $ROOTDIR with 'p4d -jd -z' to checkpoint number
# $CHECKPOINTNUM in the checkpoints directory used by this server; output
# (including timing) is appended to $LOGFILE.
#
# If both that checkpoint file and its md5 file are already readable, the
# dump is skipped with a logged warning.  Calls die() if the dump fails.
#------------------------------------------------------------------------------
dump_checkpoint () {
   local TargetDir=
   local TargetPrefix=
   local TargetCheckpoint=
   local TargetMD5=

   # shellcheck disable=SC2153
   log "Dump out new checkpoint from db files in $ROOTDIR."

   if [[ "$EDGESERVER" -eq 0 ]]; then
      # Refer to ckp/jnl files starting like: /p4/N/checkpoints/p4_N
      TargetDir="${CHECKPOINTS}"
      TargetPrefix="${P4SERVER}"
   else
      # Refer to ckp/jnl files starting like (example ServerID=p4d_edge_nyc):
      # /p4/N/checkpoints.edge_nyc/p4_N.edge_nyc
      TargetDir="${CHECKPOINTS}.${SERVERID#p4d_}"
      TargetPrefix="${P4SERVER}.${SERVERID#p4d_}"
   fi

   TargetCheckpoint="${TargetDir}/${TargetPrefix}.ckp.${CHECKPOINTNUM}.gz"
   TargetMD5="${TargetCheckpoint}.md5"

   # A checkpoint that is already there together with its md5 file is not
   # generated again.
   if [[ -r "$TargetCheckpoint" && -r "$TargetMD5" ]]; then
      log "\nWarning: Skipping generation of existing checkpoint $TargetCheckpoint.\nVerified MD5 file exists: $TargetMD5."
      return
   fi

   # The braces put the output of 'time' under the same redirection as the
   # output of p4d itself.
   if ! { time "$P4DBIN" -r "$ROOTDIR" -jd -z "$TargetCheckpoint"; } >> "$LOGFILE" 2>&1; then
      die "New checkpoint dump failed!"
   fi
}

#------------------------------------------------------------------------------
# Compare journal numbers between live and offline databases, to ensure
# they can be safely swapped out.
#------------------------------------------------------------------------------
compare_journal_numbers () {
   # Read the '@journal@' counter from db.counters in both $OFFLINE_DB and
   # $P4ROOT (field 8 of the '@'-delimited '-jd' output), pass each value to
   # check_journalnum, and call 'die' if the $P4ROOT value is greater than the
   # $OFFLINE_DB value. Also dies when the offline_db_usable.txt marker or
   # either db.counters file is missing.
   local OfflineJnl=
   local LiveJnl=

   # Get the journal number of the offline database
   [[ -f "$OFFLINE_DB/offline_db_usable.txt" ]] ||
      die "Offline database not in a usable state. Check the backup process."

   [[ -f "$OFFLINE_DB/db.counters" ]] ||
      die "Offline database not found. Consider creating it with live_checkpoint.sh. Be aware that it locks the live system and can take a long time! Abort!"

   OfflineJnl=$("$P4DBIN" -r "$OFFLINE_DB" -jd - db.counters 2>&1 | grep '@journal@' | cut -d "@" -f 8 2>> "$LOGFILE") ||
      die "Cannot get $OFFLINE_DB journal number. Abort!"
   check_journalnum "$OfflineJnl"

   # Get the journal number of the root database
   [[ -f "$P4ROOT/db.counters" ]] ||
      die "$P4ROOT database not found. Something is seriously wrong since the server was just running a minute ago! Contact support@perforce.com"

   LiveJnl=$("$P4DBIN" -r "$P4ROOT" -jd - db.counters 2>&1 | grep '@journal@' | cut -d "@" -f 8 2>> "$LOGFILE") ||
      die "Cannot get $P4ROOT journal number. Abort!"
   check_journalnum "$LiveJnl"

   # Switching is refused only when the live database is ahead of the offline one.
   if [[ "$LiveJnl" -gt "$OfflineJnl" ]]; then
      log "$P4ROOT journal number is: $LiveJnl"
      log "$OFFLINE_DB journal number is: $OfflineJnl"
      die "$OFFLINE_DB journal number is less than $P4ROOT, cannot switch."
   fi
}

#------------------------------------------------------------------------------
# Swap out live db.* database files in P4ROOT with those in offline_db.
#------------------------------------------------------------------------------
switch_db_files () {
   # Exchange the symlink targets of $P4ROOT and $OFFLINE_DB so that the
   # directory currently linked as offline_db becomes the live root and
   # vice versa. Output of the file operations is appended to $LOGFILE.
   # Sets the globals OLDBLNK and ROOTLNK (they are not declared local).

   # Abort (inside compare_journal_numbers) if the live journal number is
   # greater than the offline one, or if the offline database is unusable.
   compare_journal_numbers
   log "Switching root and offline_db links..."

   # Keep the current live db.* files in ${P4ROOT}/save, replacing any db.*
   # files saved there by an earlier run.
   [[ -d "${P4ROOT}"/save ]] || mkdir -p "${P4ROOT}"/save
   # shellcheck disable=SC2129
   rm -f "${P4ROOT}"/save/db.* >> "$LOGFILE" 2>&1
   mv "${P4ROOT}"/db.* "${P4ROOT}"/save >> "$LOGFILE" 2>&1

   # Carry the non-db files over to the directory that is about to become
   # the live root. The exit status of these moves is not checked; any error
   # (for example, a file that does not exist) only shows up in $LOGFILE.
   mv "${P4ROOT}"/license* "${OFFLINE_DB}" >> "$LOGFILE" 2>&1
   mv "${P4ROOT}"/rdb.lbr "${OFFLINE_DB}" >> "$LOGFILE" 2>&1
   mv "${P4ROOT}"/state* "${OFFLINE_DB}" >> "$LOGFILE" 2>&1
   mv "${P4ROOT}"/server.id "${OFFLINE_DB}" >> "$LOGFILE" 2>&1

   # Drop the "usable" marker from the directory that is about to be linked
   # as $P4ROOT.
   rm -f "${OFFLINE_DB}/offline_db_usable.txt"

   # Record where each symlink currently points, remove both links, then
   # recreate them with the targets exchanged.
   # NOTE(review): this assumes both paths are symlinks; the readlink and
   # unlink results are not checked, only the final 'ln -s' calls are.
   OLDBLNK="$(readlink "$OFFLINE_DB")"
   ROOTLNK="$(readlink "$P4ROOT")"
   unlink "$OFFLINE_DB"
   unlink "$P4ROOT"
   ln -s "$OLDBLNK" "$P4ROOT" >> "$LOGFILE" 2>&1 || die "Link of $OLDBLNK to $P4ROOT failed."
   ln -s "$ROOTLNK" "$OFFLINE_DB" >> "$LOGFILE" 2>&1 || die "Link of $ROOTLNK to $OFFLINE_DB failed."
}

#------------------------------------------------------------------------------
# Rotate specified log files, and compress with gzip.
#------------------------------------------------------------------------------
rotate_log_file () {
   # Rotate one log file inside $LOGS.
   #   $1 - log file name; renamed to "<name>.<YYYY-MM-DD_HH-MM-SS>" if it is
   #        a regular file, otherwise nothing happens.
   #   $2 - optional; when non-empty the renamed file is also gzipped.
   # Sets the globals ROTATE_LOGNAME, GZ_EXT and LOGID. Changes into $LOGS
   # for the duration of the call and returns to the previous directory.
   local RotatedName=

   cd "$LOGS" || die "Could not cd to: $LOGS"

   ROTATE_LOGNAME=$1
   GZ_EXT=${2:-}
   LOGID=$(date +'%Y-%m-%d_%H-%M-%S')
   RotatedName="${ROTATE_LOGNAME}.${LOGID}"

   if [[ -f "${ROTATE_LOGNAME}" ]]; then
      mv -f "${ROTATE_LOGNAME}" "$RotatedName" >> "$LOGFILE" 2>&1
      if [[ -n "$GZ_EXT" ]]; then
         gzip "$RotatedName" >> "$LOGFILE" 2>&1
      fi
   fi

   cd - > /dev/null || die "Could not cd to: $OLDPWD"
}

#------------------------------------------------------------------------------
# At the start of each run for live_checkpoint.sh, daily_checkpoint.sh, and
# recreate_db_checkpoint.sh, before *any* logging activity occurs, rotate the
# logs from the most recent prior run, always named "checkpoint.log" or "log".
#------------------------------------------------------------------------------
rotate_last_run_logs () {
   # Rotate the logs left behind by the previous run: the current script's
   # own $LOGFILE (renamed only), followed by the server, broker and audit
   # logs (renamed and gzipped).
   local PriorLog=

   # Prior log file for the current script; no compression requested.
   rotate_log_file "$LOGFILE"

   # Prior server log, broker log and audit log, in that order.
   for PriorLog in "log" "p4broker.log" "audit.log"; do
      rotate_log_file "$PriorLog" ".gz"
   done
}

#------------------------------------------------------------------------------
# Remove log files matching a specified name prefix, preserving a specified
# number of the recent logs.
#------------------------------------------------------------------------------
remove_log_files () {
   # Delete all but the newest files whose names start with a given prefix.
   #   $1 - file name prefix (matched as "<prefix>*"; relative prefixes are
   #        resolved against the current directory).
   #   $2 - number of most recently modified matches to keep; must be a
   #        non-negative integer.
   # Uses the external awk named by $AWK. Sets the globals REMOVE_LOGNAME,
   # KEEPNUM and I_LOGFILE, as before.
   # Returns 1 without deleting anything if the prefix is empty or the keep
   # count is not a non-negative integer.
   REMOVE_LOGNAME=$1
   KEEPNUM=$2

   # A non-numeric count must never reach awk: an unknown word there would be
   # an uninitialized awk variable (0), which would select every file.
   if [[ -z "$REMOVE_LOGNAME" || ! "$KEEPNUM" =~ ^[0-9]+$ ]]; then
      log "Warning: Not removing files for prefix [$REMOVE_LOGNAME]; invalid keep count [$KEEPNUM]."
      return 1
   fi

   # Read names line by line so that a name containing spaces or glob
   # characters is removed as exactly that one file. 'ls -d' keeps a matching
   # directory from being expanded into its contents. The list arrives on
   # descriptor 3 so that 'log' and 'rm' cannot consume it from stdin.
   # $AWK is left unquoted, exactly as it was invoked before.
   # shellcheck disable=SC2012,SC2086
   while IFS= read -r -u 3 I_LOGFILE; do
      [[ -n "$I_LOGFILE" ]] || continue
      log "rm -f $I_LOGFILE"
      rm -f -- "$I_LOGFILE"
   done 3< <(ls -td -- "${REMOVE_LOGNAME}"* 2>/dev/null | $AWK -v keep="$KEEPNUM" 'NR > keep')
}

#------------------------------------------------------------------------------
# Remove old logs.
#------------------------------------------------------------------------------
remove_old_logs () {
   # Prune rotated logs inside $LOGS via remove_log_files.
   # Checkpoint logs are pruned to $KEEPJNLS files and the other listed logs
   # to $KEEPLOGS files; a setting of 0 skips the corresponding cleanup.
   # Changes into $LOGS for the duration of the call.
   local LogPrefix=
   local -a ServerLogPrefixes=(
      "log"
      "p4broker.log"
      "audit.log"
      "sync_replica.log"
      "recreate_offline_db.log"
      "upgrade.log"
      "p4login"
      "p4verify.log"
      "journal_watch.log"
      "purge_revisions.log"
   )

   cd "$LOGS" || die "Could not cd to: $LOGS"

   # Checkpoint logs follow KEEPJNLS rather than KEEPLOGS, so that as many
   # checkpoint logs are kept as checkpoints.
   if ! [[ "$KEEPJNLS" -eq 0 ]]; then
      log "Deleting old checkpoint logs.  Keeping latest $KEEPJNLS, per KEEPJNLS setting in p4_vars."
      remove_log_files "checkpoint.log" "$KEEPJNLS"
   else
      log "Skipping cleanup of old checkpoint logs because KEEPJNLS is set to 0."
   fi

   if ! [[ "$KEEPLOGS" -eq 0 ]]; then
      log "Deleting old server logs.  Keeping latest $KEEPLOGS, per KEEPLOGS setting in p4_vars."
      for LogPrefix in "${ServerLogPrefixes[@]}"; do
         remove_log_files "$LogPrefix" "$KEEPLOGS"
      done
   else
      log "Skipping cleanup of old server logs because KEEPLOGS is set to 0."
   fi

   cd - > /dev/null || die "Could not cd to: $OLDPWD"
}

#------------------------------------------------------------------------------
# Set the SDP Checkpoint counter to indicate last successful SDP checkpoint
# operation. For edge and standby servers, the counter is set on the master
# (via P4MASTERPORT); otherwise it is set via P4PORT.
#------------------------------------------------------------------------------
set_counter() {
   # Run "$P4CBIN/p4login", then set the counter LastSDPCheckpoint.$SERVERID
   # to the current time, formatted as "<epoch> (<Y/m/d H:M:S zone-offset zone>)".
   # The command is sent to $P4MASTERPORT when $EDGESERVER or $STANDBYSERVER
   # is 1, and to $P4PORT otherwise. Its stdout is discarded.
   local TargetPort=

   "$P4CBIN/p4login"

   if [[ "$EDGESERVER" -eq 1 || "$STANDBYSERVER" -eq 1 ]]; then
      TargetPort="$P4MASTERPORT"
   else
      TargetPort="$P4PORT"
   fi

   "$P4BIN" -u "$P4USER" -p "$TargetPort" counter "LastSDPCheckpoint.$SERVERID" "$(date +'%s (%Y/%m/%d %H:%M:%S %z %Z)')" > /dev/null
}

#------------------------------------------------------------------------------
# This is the function that is called to run the individual checkpoint
# dump or restores during a parallel run.
#------------------------------------------------------------------------------
function parallel_checkpoint_cmd ()
{
  # Run the command given as "$@", appending its stdout and stderr to the
  # file named by the global $cmd_log, bracketed by start/completion banners.
  #
  # Globals written: status (exit code of the command) and, on failure,
  # CkpFailed=1.
  # Returns: the exit code of the command.
  #
  # The return value matters when this function is started with '&': the
  # assignment to CkpFailed then happens in a background subshell and is
  # invisible to the parent, so the exit status (collected with 'wait') is
  # the only way the parent can learn about a failure.
  echo "=== Running $* on $(date)." >> "$cmd_log"
  "$@" >> "$cmd_log" 2>&1
  status=$?
  if [[ "$status" -ne 0 ]]; then
    CkpFailed=1
    echo "=== Error: $* exited with status $status." >> "$cmd_log"
  fi
  echo "=== $* completed on $(date)." >> "$cmd_log"
  return "$status"
}

#------------------------------------------------------------------------------
# This function checks for running processes as part of the parallel dump and restore
#------------------------------------------------------------------------------
function check_running ()
{
   # Sleep 30 seconds, then drop finished processes from the global array
   # 'ids' and decrement the global counter 'thread' once per dropped entry.
   # Also rewrites the globals 'spot' (number of entries still running) and
   # 'run' (those entries).
   #
   # An entry counts as running only if it consists solely of one or more
   # whitespace-separated numeric PIDs and at least one of them answers
   # 'kill -0'. Empty or non-numeric entries count as finished. Liveness is
   # decided per entry on every pass, so one entry can never inherit the
   # result computed for the previous one, and a PID can never match merely
   # as a substring of unrelated 'ps' output.
   # Note: 'kill -0' only succeeds for processes this user may signal.
   local p=
   local candidate=
   local alive=

   sleep 30
   #loop thread process id's and see if any have finished.
   spot=0
   run=()
   # The ${ids[@]+...} form keeps an empty or unset array from tripping
   # 'set -u' on older bash versions.
   for p in ${ids[@]+"${ids[@]}"}; do
      alive=0
      if [[ -n "$p" && "$p" != *[!0-9[:space:]]* ]]; then
         # Only digits and whitespace remain, so unquoted splitting is safe.
         for candidate in $p; do
            if kill -0 "$candidate" 2>/dev/null; then
               alive=1
               break
            fi
         done
      fi

      if [[ "$alive" -eq 1 ]]; then
         run[$spot]=$p
         spot=$((spot+1))
      else
         thread=$((thread-1))
      fi
   done

   if [[ "$spot" -ne 0 ]]; then
      ids=("${run[@]}")
   else
      ids=()
   fi
}

#------------------------------------------------------------------------------
# Dump db files in parallel from offline_db
#------------------------------------------------------------------------------
dump_parallel_ckp ()
{
   # Dump every db.* file in $OFFLINE_DB to its own checkpoint file,
   # ${CKPTMPDIR}/${P4SERVER}.ckp.<dbfile>, running at most $Threads dumps at
   # a time through parallel_checkpoint_cmd.
   #
   # Each job is tracked by the PID the shell reports in $! and collected with
   # 'wait', so this function does not continue until every dump has really
   # finished, and a non-zero exit of any job marks the run as failed.
   #
   # Globals read:    OFFLINE_DB CKPTMPDIR P4SERVER P4DBIN Threads LOG LOGFILE
   # Globals written: db_files cmd_log CkpFailed StatusMessage ExitCode
   # Leaves the current directory at $CKPTMPDIR.
   # Calls 'die' if there is nothing to dump or any dump failed.
   local -a JobPids=()
   local -i MaxJobs=0
   local -i Started=0
   local -i Reaped=0
   local f=

   db_files=() # Clear array

   cd "$OFFLINE_DB" || die "Could not cd to: $OFFLINE_DB"

   [[ -d "${CKPTMPDIR}" ]] || mkdir "${CKPTMPDIR}" || die "Could not create: $CKPTMPDIR"
   rm -f "${CKPTMPDIR:?}"/*

   # Build array of db_files in offline_db. The existence test skips the
   # literal pattern that the shell leaves behind when nothing matches.
   for f in db.*; do
      [[ -e "$f" ]] || continue
      db_files+=( "$f" ) # Append db file to the array
   done

   [[ "${#db_files[@]}" -gt 0 ]] || die "No db.* files found in $OFFLINE_DB. New checkpoint dump failed!"

   # A job limit below 1 would block forever; fall back to one job at a time.
   # shellcheck disable=SC2154
   MaxJobs=${Threads:-1}
   (( MaxJobs >= 1 )) || MaxJobs=1

   for f in "${db_files[@]}"; do
      # At the job limit: wait for the oldest outstanding job before starting
      # another one.
      while (( Started - Reaped >= MaxJobs )); do
         wait "${JobPids[Reaped]}" || CkpFailed=1
         Reaped+=1
      done

      cmd_log="${LOG}-${f}.log"
      (
         # Turn either failure signal of parallel_checkpoint_cmd (its return
         # status or the CkpFailed flag it sets) into this job's exit status,
         # since variable assignments made here never reach the parent shell.
         CkpFailed=0
         parallel_checkpoint_cmd "${P4DBIN}" -r "${OFFLINE_DB}" -jd "${CKPTMPDIR}/$P4SERVER.ckp.${f}" "$f" || CkpFailed=1
         exit "$CkpFailed"
      ) &
      JobPids[Started]=$!
      Started+=1
   done

   # Wait until all of the remaining jobs have finished before continuing.
   while (( Started > Reaped )); do
      wait "${JobPids[Reaped]}" || CkpFailed=1
      Reaped+=1
   done

   cd "$CKPTMPDIR" || die "Could not cd to: $CKPTMPDIR"
   rm -f ./*.md5

   # Now that the jobs have finished, combine the per-file logs into $LOGFILE.
   for f in "${db_files[@]}"; do
      if [[ -f "${LOG}-${f}.log" ]]; then
         cat "${LOG}-${f}.log" >> "$LOGFILE"
         rm -f "${LOG}-${f}.log"
      fi
   done

   if [[ "${CkpFailed:-0}" -ne 0 ]]; then
      # shellcheck disable=SC2034
      StatusMessage="Error: Checkpoint failed.  Review the log [$LOGFILE]."
      ExitCode=1
   fi

   if [[ "${ExitCode:-0}" -ne 0 ]]; then
      die "New checkpoint dump failed!"
   fi

   msg "Completed parallel checkpoint at $(date)."
}

#------------------------------------------------------------------------------
# Restore from db files that have been extracted from a parallel checkpoint tgz file.
#------------------------------------------------------------------------------
restore_parallel_ckp ()
{
   # Replay every file found in $CKPTMPDIR into $OFFLINE_DB with
   # "$P4DBIN -jr", running at most $Threads replays at a time through
   # parallel_checkpoint_cmd, and write the offline_db_usable.txt marker only
   # if every replay succeeded.
   #
   # Each job is tracked by the PID the shell reports in $! and collected with
   # 'wait', so the marker is never written while replays are still running,
   # and a non-zero exit of any job marks the run as failed.
   #
   # Globals read:    OFFLINE_DB CKPTMPDIR P4DBIN Threads LOG LOGFILE
   # Globals written: db_files cmd_log CkpFailed StatusMessage ExitCode
   # Leaves the current directory at $CKPTMPDIR.
   # Calls 'die' if there is nothing to restore or any replay failed.
   local -a JobPids=()
   local -i MaxJobs=0
   local -i Started=0
   local -i Reaped=0
   local f=

   db_files=() # Clear array

   [[ -d "${CKPTMPDIR}" ]] || die "$CKPTMPDIR doesn't exist! Restore failed."
   cd "$CKPTMPDIR" || die "Could not cd to: $CKPTMPDIR"

   # Build array of files in the checkpoint temp dir. The existence test skips
   # the literal '*' that the shell leaves behind when the directory is empty.
   for f in *; do
      [[ -e "$f" ]] || continue
      db_files+=( "$f" ) # Append db file to the array
   done

   # Check this before touching offline_db, so that an empty temp dir does
   # not cost us the existing offline database.
   [[ "${#db_files[@]}" -gt 0 ]] || die "No checkpoint files found in $CKPTMPDIR! Restore failed."

   rm -f "${OFFLINE_DB}"/offline_db_usable.txt >> "$LOGFILE" 2>&1
   rm -f "${OFFLINE_DB}"/db.* >> "$LOGFILE" 2>&1

   # A job limit below 1 would block forever; fall back to one job at a time.
   # shellcheck disable=SC2154
   MaxJobs=${Threads:-1}
   (( MaxJobs >= 1 )) || MaxJobs=1

   for f in "${db_files[@]}"; do
      # At the job limit: wait for the oldest outstanding job before starting
      # another one.
      while (( Started - Reaped >= MaxJobs )); do
         wait "${JobPids[Reaped]}" || CkpFailed=1
         Reaped+=1
      done

      cmd_log="${LOG}-${f}.log"
      (
         # Turn either failure signal of parallel_checkpoint_cmd (its return
         # status or the CkpFailed flag it sets) into this job's exit status,
         # since variable assignments made here never reach the parent shell.
         CkpFailed=0
         parallel_checkpoint_cmd "${P4DBIN}" -r "${OFFLINE_DB}" -jr "${CKPTMPDIR}/${f}" || CkpFailed=1
         exit "$CkpFailed"
      ) &
      JobPids[Started]=$!
      Started+=1
   done

   # Wait until all of the remaining jobs have finished before continuing.
   while (( Started > Reaped )); do
      wait "${JobPids[Reaped]}" || CkpFailed=1
      Reaped+=1
   done

   # Now that the jobs have finished, combine the per-file logs into $LOGFILE.
   for f in "${db_files[@]}"; do
      if [[ -f "${LOG}-${f}.log" ]]; then
         cat "${LOG}-${f}.log" >> "$LOGFILE"
         rm -f "${LOG}-${f}.log"
      fi
   done

   if [[ "${CkpFailed:-0}" -ne 0 ]]; then
      # shellcheck disable=SC2034
      StatusMessage="Error: Checkpoint Restore failed.  Review the log [$LOGFILE]."
      ExitCode=1
   fi

   if [[ "${ExitCode:-0}" -ne 0 ]]; then
      die "Restore of checkpoint dump failed!"
   fi

   echo "Offline db file restored successfully." > "${OFFLINE_DB}/offline_db_usable.txt"
   msg "Completed parallel checkpoint restore at $(date)."
}

#------------------------------------------------------------------------------
# Create a tgz of the temporary checkpoint folder that contains individually dumped database checkpoints.
#------------------------------------------------------------------------------
create_tar_ckp ()
{
   # From inside $CHECKPOINTS, pack $CKPTMPDIR into
   # ${P4SERVER}.ckp.parallel.${CHECKPOINTNUM}.tgz (name stored in the global
   # Ckptgz), with tar output and timing appended to $LOGFILE, then empty
   # $CKPTMPDIR. Calls 'die' if the archive already exists or tar fails.
   cd "$CHECKPOINTS" || die "Could not cd to: $CHECKPOINTS"

   Ckptgz="${P4SERVER}.ckp.parallel.${CHECKPOINTNUM}.tgz"

   # Never overwrite an archive from an earlier run.
   if [[ -f "$Ckptgz" ]]; then
      die "$Ckptgz file already exists. Check the backup process!"
   fi

   # The brace group makes the redirection capture the output of 'time' too.
   if ! { time tar cvzf "$Ckptgz" "${CKPTMPDIR}"; } >> "$LOGFILE" 2>&1; then
      die "Failed to create tgz checkpoint file!"
   fi

   rm -rf "${CKPTMPDIR:?}"/*
}

#------------------------------------------------------------------------------
# Extract a tgz of the temporary checkpoint folder that contains individually dumped database checkpoints.
#------------------------------------------------------------------------------
extract_tar_ckp ()
{
   # From inside $CHECKPOINTS, extract the tgz archive named by $1 (stored in
   # the global Ckptgz), with tar output and timing appended to $LOGFILE.
   # Calls 'die' if the archive is not a regular file or tar fails.
   cd "$CHECKPOINTS" || die "Could not cd to: $CHECKPOINTS"

   Ckptgz=$1

   if [[ ! -f $Ckptgz ]]; then
      die "$Ckptgz doesn't exist!"
   fi

   # The brace group makes the redirection capture the output of 'time' too.
   if ! { time tar xvzf "$Ckptgz"; } >> "$LOGFILE" 2>&1; then
      die "Failed to extract $Ckptgz checkpoint file!"
   fi
}
