#!/bin/bash
#==============================================================================
# Copyright and license info is available in the LICENSE file included with
# the Server Deployment Package (SDP), and also available online:
# https://swarm.workshop.perforce.com/projects/perforce-software-sdp/view/main/LICENSE
#------------------------------------------------------------------------------

set -u

# Global Variables.
export P4DInitScript=
export P4DSystemdServiceFile=
export P4BrokerInitScript=
export P4BrokerSystemdServiceFile=
export P4ProxyInitScript=
export P4ProxySystemdServiceFile=
export CKPTMPDIR=
export OFFLINE_DB=
export EDGESERVER=
export STANDBYSERVER=

# Common functions used in various SDP scripts.

#------------------------------------------------------------------------------
# Verify key variables in the shell environment exist, or else abort.
#
# If checks in this function fail, this function does an 'echo' and 'exit 1'
# rather than calling 'log' or 'die', as this function is generally called
# early in processing, before the log is initialized.
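#
# Example usage, early in an SDP script (instance name as first argument):
#    source /p4/common/bin/p4_vars "$1"
#    check_vars
#    set_vars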
#------------------------------------------------------------------------------
function check_vars () {

   local CheckVarsPreflightOK=1
   CommonVars="SDP_INSTANCE P4HOME P4PORT P4ROOT P4JOURNAL P4BIN P4DBIN P4TICKETS P4TRUST KEEPCKPS KEEPJNLS KEEPLOGS CHECKPOINTS LOGS OSUSER"
   InstanceVars="P4MASTER_ID P4MASTERPORT"

   # First, check vars that should be set in /p4/common/bin/p4_vars.
   for var in $CommonVars; do
      # Detect unset variables, using ':-' to avoid 'unbound variable' errors.
      # shellcheck disable=SC1083
      if [[ -z "$(eval echo \${"$var":-})" ]]; then
         echo "Error: Required variable \$$var is NOT set. It should be set in /p4/common/bin/p4_vars."
         CheckVarsPreflightOK=0
      fi
   done

   # Next, check vars that should be set in /p4/common/config/p4_N.instance.
   # For some variables, provide additional details that help users correct
   # the problem.
   for var in $InstanceVars; do
      # shellcheck disable=SC1083
      if [[ -z "$(eval echo \${"$var":-})" ]]; then
         echo "Error: Required variable \$$var is NOT set. It should be set in /p4/common/config/p4_N.vars, where N is the SDP instance name."

         if [[ "$var" == "P4MASTER_ID" ]]; then
            echo "The value for P4MASTER_ID should be the name of the ServerID of the master server."
         fi

         CheckVarsPreflightOK=0
      fi
   done

   if [[ "$CheckVarsPreflightOK" -eq 0 ]]; then
      echo "Use p4master_run or source p4_vars before calling this script."
      echo "Aborting to to errors in shell environment preflight checks."
      exit 1
   fi
}

#------------------------------------------------------------------------------
#  is_edge ($ServerID, $RootDir)
#
# Determine if a given ServerID is an edge server or not, checking a given
# database root dir (e.g. $P4ROOT or $OFFLINE_DB).
#
# Input:
# $1 - ServerID (required)
# $2 - RootDir (optional, defaults to $P4ROOT)
#
# Output YES if an edge server, NO otherwise.
#
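# Example (using the sample ServerID from truncate_journal_on_master() below):
#    if [[ "$(is_edge p4d_edge_nyc)" == YES ]]; then
#       echo "p4d_edge_nyc is an edge server."
#    fi
#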
#------------------------------------------------------------------------------
function is_edge () {
   local ServerID=${1:-Unset}
   local RootDir=${2:-$P4ROOT}
   local ServicesData=
   local EdgeCheck=

   # Extract a slice of db.server referencing the given ServerID,
   # and then grab the field containing Services data.
   ServicesData=$("$P4DBIN" -r "$RootDir" -J off -L /dev/null -k db.server -jd - 2>&1 |\
      $GREP "@db.server@ @${ServerID}@" | "$CUT" -d '@' -f 13)

   # Do a bitwise operation to determine if the ServicesData value indicates
   # this is an edge server (the 4096 bit is set for edge Services data).
   if [[ -n "$ServicesData" ]]; then
      EdgeCheck=$((ServicesData & 4096))

      if [[ "$EdgeCheck" -gt 0 ]]; then
         echo YES
      else
         echo NO
      fi
   else
      echo NO
   fi
}

#------------------------------------------------------------------------------
# is_standby ($ServerID, $RootDir)
#
# Determine if a given ServerID is a standby server or not, checking a given
# database root dir (e.g. $P4ROOT or $OFFLINE_DB).
#
# Input:
# $1 - ServerID (required)
# $2 - RootDir (optional, defaults to $P4ROOT)
#
# Output YES if a standby server, NO otherwise.
#
#------------------------------------------------------------------------------
function is_standby () {
   local ServerID="${1:-Unset}"
   local RootDir="${2:-$P4ROOT}"
   local ServicesData=

   # Extract a slice of db.server referencing the given ServerID,
   # and then grab the field containing Services data.
   ServicesData=$("$P4DBIN" -r "$RootDir" -J off -L /dev/null -k db.server -jd - 2>&1 |\
      "$GREP" "@db.server@ @${ServerID}@" | "$CUT" -d '@' -f 13 | tr -d ' ')

   # Do a check to see if the ServicesData value indicates
   # this is a standby server.
   if [[ -n "$ServicesData" ]]; then
      if [[ "$ServicesData" -eq '35141' ]]; then
         echo YES
      else
         echo NO
      fi
   else
      echo NO
   fi
}

#------------------------------------------------------------------------------
# Set variables for use in various scripts:
# OFFLINE_DB=path to offline db directory
# EDGESERVER=1 if this is an edge server, 0 otherwise.
# STANDBYSERVER=1 if this is a standby server, 0 otherwise.
#
# This must be called after loading the standard shell environment by
# doing:
# source /p4/common/bin/p4_vars N
#
# This sets P4HOME, SERVERID, etc. needed by this function.
#------------------------------------------------------------------------------
function set_vars () {

   P4DInitScript="$P4HOME/bin/p4d_${SDP_INSTANCE}_init"
   P4DSystemdServiceFile="/etc/systemd/system/p4d_${SDP_INSTANCE}.service"
   P4BrokerInitScript="$P4HOME/bin/p4broker_${SDP_INSTANCE}_init"
   P4BrokerSystemdServiceFile="/etc/systemd/system/p4broker_${SDP_INSTANCE}.service"
   P4ProxyInitScript="$P4HOME/bin/p4p_${SDP_INSTANCE}_init"
   P4ProxySystemdServiceFile="/etc/systemd/system/p4p_${SDP_INSTANCE}.service"

   OFFLINE_DB="${P4HOME}/offline_db"
   CKPTMPDIR="${CHECKPOINTS}/ckp_tmp"

   # shellcheck disable=SC2153
   if [[ -n "$SERVERID" ]]; then
      if [[ "$(is_edge "$SERVERID")" == YES ]]; then
         export EDGESERVER=1
      else
         export EDGESERVER=0
      fi
   else
      export EDGESERVER=0
   fi

   if [[ -n "$SERVERID" ]]; then
      if [[ "$(is_standby "$SERVERID")" == YES ]]; then
         export STANDBYSERVER=1
         # Get commit server from P4TARGET setting in database
      else
         export STANDBYSERVER=0
      fi
   else
      export STANDBYSERVER=0
   fi

   # Ensure that SDP_ADMIN_PASSWORD_FILE is set, using the existing value if set
   # (e.g. in p4_vars); otherwise set it to the SDP standard value.
   export SDP_ADMIN_PASSWORD_FILE="${SDP_ADMIN_PASSWORD_FILE:-Unset}"
   if [[ "$SDP_ADMIN_PASSWORD_FILE" == Unset ]]; then
      export SDP_ADMIN_PASSWORD_FILE="$P4CCFG/.p4passwd.${P4SERVER}.admin"
   fi
}

#------------------------------------------------------------------------------
# Check if user is running as required OS user.
#------------------------------------------------------------------------------
function check_uid () {
   user=$(id -un)
   if [[ "${user}" != "${OSUSER}" ]]; then
      die "Must be run by user: ${OSUSER}. Abort!"
   fi
}

#------------------------------------------------------------------------------
# Function log() - echo message to logfile or stdout.
#
# If $LOGFILE is defined, write message to the log file only; nothing goes to
# stdout.  Prepend a datestamp.
# If $LOGFILE isn't defined, just echo to stdout, w/o timestamp or script name.
# In all cases, support '-e' formatting.
# Input:
# $1 - message to log (must be quoted).
#------------------------------------------------------------------------------
function log () {
   if [[ "${LOGFILE:-Unset}" != Unset ]]; then
      echo -n "$(date)" >> "$LOGFILE" 2>&1
      echo -e " $0: $*" >> "$LOGFILE" 2>&1
   else
      echo -e "$@"
   fi
}

#------------------------------------------------------------------------------
# Decide, depending on our mail utility, how to specify the sender (if we
# need to). Mail on some platforms sets the sender by default.
# If the mail utility returns what looks like a version identifier
# when given the '-V' flag, use a '-S' flag. If it does not return a
# version identifier, don't set a mail sender option.
# Allow the GNU Mailutils alternative flag instead.
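#
# Example (hypothetical MAILFROM value): with GNU Mailutils this echoes
# '-aFrom:admin@example.com'; with a mailer reporting a numeric version
# identifier, it echoes '-S from=admin@example.com'.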
#------------------------------------------------------------------------------
function get_mail_sender_opt () {
   local mail_sender_opt=
   local mail_ver=
   if [[ -n "$MAILFROM" ]]; then
      mail_ver=$($SDPMAIL -V 2>&1)
      # shellcheck disable=SC2076
      if [[ "$mail_ver" =~ "GNU Mailutils" ]]; then
         mail_sender_opt="-aFrom:$MAILFROM"
      elif  [[ "$mail_ver" =~ ^[0-9]+\.[0-9] ]]; then
         mail_sender_opt="-S from=$MAILFROM"
      fi
   fi
   echo "$mail_sender_opt"
}

#------------------------------------------------------------------------------
# Email the log file specified by $LOGFILE.
#------------------------------------------------------------------------------
function mail_log_file () {
   local subject=$1
   local mail_sender_opt
   mail_sender_opt=$(get_mail_sender_opt)
   $SDPMAIL -s "$subject" "$mail_sender_opt" "$MAILTO" < "$LOGFILE"
}

#------------------------------------------------------------------------------
# Function die() - log message, send email, and exit.
# If $LOGFILE is defined, write message to the log file, email log,
# and exit.
# If $LOGFILE is not defined, write message to the stdout, and skip
# email.
# If in a terminal session, display the message to stderr as well.
#------------------------------------------------------------------------------
function die () {
   # Mail the error (with a more helpful subject line than cron's default).
   log "ERROR!!! - $HOSTNAME $P4SERVER $0: $*"

   if [[ "${LOGFILE:-Unset}" != Unset ]]; then
      mail_log_file "ERROR!!! - $HOSTNAME $P4SERVER $0: $*"
   fi

   # if running from terminal, also send to stderr
   if tty >/dev/null; then
      echo -e "$@" >&2
   fi

   rm -f "${LOGS}/ckp_running.txt"

   exit 1
}

#------------------------------------------------------------------------------
# Convert various byte values (K,M,G,%) to bytes
# Pass in values such as 1024K, 512M, 1G or 10%
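#
# Examples:
#    convert_to_bytes 512M         # echoes 536870912
#    convert_to_bytes 10% 1000000  # echoes 100000 (10% of the given total size)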
#------------------------------------------------------------------------------
function convert_to_bytes () {
   local value=$1
   local totalsize=${2:-Undefined}
   local size=
   local unit=

   # Break up value into size (numeric) and unit (K,M,G)
   size=$("$GREP" -Eo '[[:alpha:]%]+|[0-9]+' <<< "$value" | head -1)
   unit=$("$GREP" -Eo '[[:alpha:]%]+|[0-9]+' <<< "$value" | tail -1)

   # Based on unit, convert to bytes
   case "$unit" in
      K)
         echo $((size * 1024))
         ;;
      M)
         echo $((size * 1024**2))
         ;;
      G)
         echo $((size * 1024**3))
         ;;
      %)
         echo $((totalsize * size / 100))
         ;;
   esac
}

#------------------------------------------------------------------------------
# Write a semaphore file, $LOGS/ckp_running.txt.  This file is written at
# the start of processing, and removed upon successful completion.  It
# prevents multiple concurrent operations from being launched accidentally
# e.g. by multiple human admins, or a human inadvertently competing with a
# cron job.
#
# It is also intended to prompt human admins to determine the root cause of
# checkpoint failures.
#------------------------------------------------------------------------------
function ckp_running() {
   if [[ -f "${LOGS}/ckp_running.txt" ]]; then
      die "Last checkpoint not complete. Check the backup process or contact support."
   fi
   echo "Checkpoint running." > "${LOGS}/ckp_running.txt"
}

#------------------------------------------------------------------------------
# Remove the ckp_running.txt semaphore file when checkpoint processing is
# complete.
#------------------------------------------------------------------------------
function ckp_complete() {
   rm -f "${LOGS}/ckp_running.txt"
}

#------------------------------------------------------------------------------
# Ensure key directories are writable. Abort if they are not.
#------------------------------------------------------------------------------
function check_dirs () {
   # Check that key dirs are writable
   declare -i dirsOK=1
   dirList="$OFFLINE_DB $CHECKPOINTS $LOGS"
   for dir in $dirList; do
      if [[ ! -d "$dir" || ! -w "$dir" ]]; then
         log "Error: Dir $dir does not exist or is not writable."
         dirsOK=0
      fi
   done
   [[ "$dirsOK" -eq 1 ]] || die "Some expected dirs are missing or not writable. Aborting."
}

#------------------------------------------------------------------------------
# Add the results of 'p4 diskspace' to the log file.
#------------------------------------------------------------------------------
function check_disk_space () {
   log "Checking disk space..."
   $P4BIN diskspace >> "$LOGFILE" 2>&1
}

#------------------------------------------------------------------------------
# Check the value of the journal counter; ensure it is an integer.
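#
# Example: 'check_journalnum 42' passes; 'check_journalnum ""' or
# 'check_journalnum abc' calls die().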
#------------------------------------------------------------------------------
function check_journalnum () {
   local JNLNUM=${1:-Unset}
   re='^[0-9]+$'
   if ! [[ $JNLNUM =~ $re ]] ; then
      die "The journal counter value [$JNLNUM] is invalid. It must be numeric."
   fi
}

#------------------------------------------------------------------------------
# Check the checkpoints directory for the oldest checkpoint
#------------------------------------------------------------------------------
function get_ckpnum () {
   if [[ "$EDGESERVER" -eq 0 ]]; then
      # shellcheck disable=SC2034 disable=SC2012 disable=SC2016
      OLDESTCHECKPOINT=$(ls -1tr "${CHECKPOINTS}/" | "$GREP" ckp | "$GREP" -v md5 | head -n 1 | "$AWK" -F '.ckp.' '{ print $(2) }' | tr -d '.gz')
   else
      # shellcheck disable=SC2034 disable=SC2012 disable=SC2016
      OLDESTCHECKPOINT=$(ls -1tr "${CHECKPOINTS}.${SERVERID#p4d_}/" | "$GREP" ckp | "$GREP" -v md5 | head -n 1 | "$AWK" -F '.ckp.' '{ print $(2) }' | tr -d '.gz')
   fi
}

#------------------------------------------------------------------------------
# Determine journal counter by checking counter in db.counters.
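#
# Example: if the @journal@ counter in db.counters is 42, this sets
# JOURNALNUM=42 and CHECKPOINTNUM=43 on the master; on an edge or standby
# server, the same counter yields JOURNALNUM=41 and CHECKPOINTNUM=42.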
#------------------------------------------------------------------------------
get_journalnum () {
   # get the current journal and checkpoint serial numbers.
   local nextCheckpointNum
   if [[ -r "$P4ROOT/db.counters" ]]; then
      nextCheckpointNum=$("$P4DBIN" -r "$P4ROOT" -k db.counters -jd - 2>&1 | grep @journal@ | cut -d '@' -f 8)

      if [[ -n "$nextCheckpointNum" ]]; then
         check_journalnum "$nextCheckpointNum"
         JOURNALNUM="$nextCheckpointNum"
      else
         # Special case: If db.counters is empty, then we have a new/empty data
         # set, so just set the value to 0.
         JOURNALNUM=0
      fi
   else
      # Special case: If db.counters doesn't exist, then we have a new/empty
      # data set, so just set the value to 0.
      JOURNALNUM=0
   fi

   # If we are on an edge server, the journal has already rotated, so we decrement the
   # value so that we replay the correct journal file and create the correct checkpoint
   # number on the edge server.
   #
   # In the case of a standby server, the journal rotation occurs on the master server,
   # so the counter is already one ahead of the last completed journal; we likewise
   # decrement by 1. Also, when replaying journals to the offline db, we don't want to
   # replay the live journal because it is still being replicated.
   if [[ "$EDGESERVER" -eq 1 || "$STANDBYSERVER" -eq 1 ]]; then
      JOURNALNUM=$((JOURNALNUM - 1))
   fi
   CHECKPOINTNUM=$((JOURNALNUM + 1))
}

#------------------------------------------------------------------------------
# Determine journal space usage and minimum disk space requirement
#------------------------------------------------------------------------------
get_journal_stats () {
   # Get the minimum disk space required on the server journal filesystem before the
   # server rejects commands. 'p4 configure show' returns the configured value first,
   # followed by the default; grab the configured value. If no configured value is
   # present, the default value is used.
   # shellcheck disable=SC2034 disable=SC2016
   P4JOURNALMIN=$("$P4BIN" configure show filesys.P4JOURNAL.min | "$AWK" '{ print $1 }' | $CUT -d'=' -f2 | head -1)
   # Get current journal free disk space
   # shellcheck disable=SC2034
   P4JOURNALFREE=$("$P4BIN" -ztag -F "%freeBytes%" diskspace P4JOURNAL)
   # Get total available disk space for journal
   # shellcheck disable=SC2034
   P4JOURNALTOTAL=$("$P4BIN" -ztag -F "%totalBytes%" diskspace P4JOURNAL)
}

#------------------------------------------------------------------------------
# Verify that the offline databases are usable by checking the existence
# of an 'offline_db_usable.txt' file that is written only when databases
# are in a known-good state, following successful recovery from a checkpoint.
#------------------------------------------------------------------------------
check_offline_db_usable () {
   # Check it is OK
   if [[ ! -f "$OFFLINE_DB/offline_db_usable.txt" ]]; then
      die "Offline database not in a usable state. Check the backup process."
   fi

   if [[ ! -f "$OFFLINE_DB/db.counters" ]]; then
      die "Offline database not found. Consider creating it with live_checkpoint.sh. Be aware that live_checkpoint.sh locks the live system and may take a long time. Aborting."
   fi
}

#------------------------------------------------------------------------------
# Determine journal counter in offline databases.
#------------------------------------------------------------------------------
get_offline_journal_num () {
   # Get the journal number of the offline database
   check_offline_db_usable
   OFFLINEJNLNUM=$("$P4DBIN" -r "$OFFLINE_DB" -jd - db.counters 2>&1 | grep '@journal@' | cut -d "@" -f 8 2>> "$LOGFILE") || die "Cannot get the offline journal number. Abort!"
   check_journalnum "$OFFLINEJNLNUM"
   log "Offline journal number is: $OFFLINEJNLNUM"
}

#------------------------------------------------------------------------------
# Clean up old checkpoint and journal files.
#------------------------------------------------------------------------------
remove_old_checkpoints_and_journals () {
   declare CheckpointsDir=
   declare StandbyReplicaJournalsDir=
   declare FilePrefix=
   declare JournalPrefix=

   if [[ "$KEEPCKPS" -eq 0 ]]; then
      log "Skipping cleanup of old checkpoints because KEEPCKPS is set to 0."
   else
      log "Deleting obsolete checkpoints and journals. Keeping latest $KEEPCKPS  per KEEPCKPS setting in p4_vars."
      # For the master server, we can safely rely on the SDP standard that the journalPrefix
      # is of the form '/p4/N/checkpoints/p4_N'. For replicas and edge servers, determine
      # the values dynamically based on the current journalPrefix value for the given ServerID.
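      # For example, a journalPrefix of '/p4/1/checkpoints.edge_nyc/p4_1.edge_nyc'
      # yields CheckpointsDir='/p4/1/checkpoints.edge_nyc' and
      # FilePrefix='p4_1.edge_nyc'.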
      if [[ "$SERVERID" == "$P4MASTER_ID" ]]; then
         # Refer to ckp/jnl files starting like: /p4/N/checkpoints/p4_N
         CheckpointsDir="${CHECKPOINTS}"
         FilePrefix="${P4SERVER}"
      else
         JournalPrefix="$("$P4DBIN" -r "$P4ROOT" -k db.config -jd - | grep "@${SERVERID}@ @journalPrefix@" | cut -d '@' -f 10)"
         if [[ -n "$JournalPrefix" ]]; then
            CheckpointsDir="${JournalPrefix%/*}"
            FilePrefix="${JournalPrefix##*/}"
         else
            log "Warning: Could not determine journalPrefix for ServerID $SERVERID."
            CheckpointsDir="${CHECKPOINTS}"
            FilePrefix="${P4SERVER}"
         fi
      fi

      if [[ -d "$CheckpointsDir" ]]; then
         # Remove selected checkpoint and journal files based on the KEEPCKPS
         # setting regardless of whether compressed or not.
         # We multiply KEEPCKPS by 2 for the ckp files because of the md5 files.
         # shellcheck disable=SC2012
         for I_LOGFILE in $(ls -t "${CheckpointsDir}/${FilePrefix}".ckp.* 2>/dev/null | "$AWK" "NR > ($KEEPCKPS * 2)"); do
            log "rm -f $I_LOGFILE"
            rm -f "$I_LOGFILE"
         done

         # Use KEEPJNLS to allow for separate journal rotation at a higher
         # frequency.
         # shellcheck disable=SC2012
         for I_LOGFILE in $(ls -t "${CheckpointsDir}/${FilePrefix}".jnl.* 2>/dev/null | "$AWK" "NR > $KEEPJNLS"); do
            log "rm -f $I_LOGFILE"
            rm -f "$I_LOGFILE"
         done
      fi

      StandbyReplicaJournalsDir="${P4HOME}/journals.rep"
      if [[ -d "$StandbyReplicaJournalsDir" ]]; then
         # shellcheck disable=SC2012
         for I_LOGFILE in $(ls -t "$StandbyReplicaJournalsDir/${FilePrefix}".ckp.* 2>/dev/null | "$AWK" "NR > ($KEEPCKPS * 2)"); do
            log "rm -f $I_LOGFILE"
            rm -f "$I_LOGFILE"
         done

         # shellcheck disable=SC2012
         for I_LOGFILE in $(ls -t "${StandbyReplicaJournalsDir}/${FilePrefix}".jnl.* 2>/dev/null | "$AWK" "NR > $KEEPJNLS"); do
            log "rm -f $I_LOGFILE"
            rm -f "$I_LOGFILE"
         done
      fi
   fi
}

#------------------------------------------------------------------------------
# Function: is_server_up ($server)
#
# Input:
# $1 - server, one of 'p4d', 'p4p', or 'p4broker'
#
# Output: None
#
# Return Codes:
# 0: Server is up.
# 1: Server is down.
# 2: Bad usage.
#
# Server up/down status is checked using the appropriate init script.
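#
# Example:
#    if is_server_up p4broker; then
#       log "The broker is up."
#    fi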
#------------------------------------------------------------------------------
function is_server_up () {
   local server="${1:-Unset}"

   case "$server" in
      (p4d)
         "$P4DInitScript" status > /dev/null 2>&1
         return $?
      ;;
      (p4broker)
         "$P4BrokerInitScript" status > /dev/null 2>&1
         return $?
      ;;
      (p4p)
        "$P4ProxyInitScript" status > /dev/null 2>&1
         return $?
      ;;
      (Unset)
         log "Internal Error: is_server_up(): No server type specified."
         return  2
      ;;
      (*)
         log "Internal Error: is_server_up(): Unknown server specified: $server"
         return 2
      ;;
   esac
}

#------------------------------------------------------------------------------
# Shutdown p4d using systemd if configured for systemd. Otherwise call the
# underlying init script directly.
#
# Log the shutdown activity.
#
#------------------------------------------------------------------------------
stop_p4d () {
   log "Shutting down the ${P4DBIN##*/} server."
   local -i maxStopDelay=${SDP_MAX_STOP_DELAY_P4D:-43200}
   local -i stopVerified=0
   local -i i=0

   if [[ -r "$P4DSystemdServiceFile" ]]; then
      { sudo systemctl stop "${P4DBIN##*/}"; } >> "$LOGFILE" 2>&1 ||\
         die "Failed to execute: sudo systemctl stop ${P4DBIN##*/}"

      # With systemd, we must independently confirm service stop,
      # waiting if needed.
      stopVerified=0
      i=0; while [[ "$i" -lt "$maxStopDelay" ]]; do
         if is_server_up p4d; then
            sleep 1
         else
            stopVerified=1
            break
         fi
         i+=1
      done
   else
      "$P4DInitScript" stop >> "$LOGFILE" 2>&1
      stopVerified=1
   fi

   if [[ "$stopVerified" -eq 1 ]]; then
      log "Stopped ${P4DBIN##*/} server."
      return 0
   else
      log "Error: Server ${P4DBIN##*/} did not stop after $maxStopDelay seconds. Tailing $P4LOG:"
      tail "$P4LOG" >> "$LOGFILE" 2>&1
      die "Aborting due to failed p4d stop."
   fi
}

#------------------------------------------------------------------------------
# Shutdown p4broker using systemd if configured for systemd. Otherwise call the
# underlying init script directly.
#
# Log the shutdown activity.
#
#------------------------------------------------------------------------------
stop_p4broker () {
   log "Shutting down the ${P4BROKERBIN##*/} server."
   local -i maxStopDelay=${SDP_MAX_STOP_DELAY_P4BROKER:-600}
   local -i stopVerified=0
   local -i i=0

   if [[ -r "$P4BrokerSystemdServiceFile" ]]; then
      { sudo systemctl stop "${P4BROKERBIN##*/}"; } >> "$LOGFILE" 2>&1 ||\
         die "Failed to execute: sudo systemctl stop ${P4BROKERBIN##*/}"

      # With systemd, we must independently confirm service stop,
      # waiting if needed.
      stopVerified=0
      i=0; while [[ "$i" -lt "$maxStopDelay" ]]; do
         if is_server_up p4broker; then
            sleep 1
         else
            stopVerified=1
            break
         fi
         i+=1
      done
   else
      "$P4BrokerInitScript" stop >> "$LOGFILE" 2>&1
      stopVerified=1
   fi

   if [[ "$stopVerified" -eq 1 ]]; then
      log "Stopped ${P4BROKERBIN##*/} server."
      return 0
   else
      die "Server ${P4BROKERBIN##*/} did not stop after $maxStartDelay seconds."
   fi
}

#------------------------------------------------------------------------------
# Shutdown p4p using systemd if configured for systemd. Otherwise call the
# underlying init script directly.
#
# Log the shutdown activity.
#
#------------------------------------------------------------------------------
stop_p4p () {
   log "Shutting down the ${P4PBIN##*/} server."
   local -i maxStopDelay=${SDP_MAX_STOP_DELAY_P4P:-600}
   local -i stopVerified=0
   local -i i=0

   if [[ -r "$P4ProxySystemdServiceFile" ]]; then
      { sudo systemctl stop "${P4PBIN##*/}"; } >> "$LOGFILE" 2>&1 ||\
         die "Failed to execute: sudo systemctl stop ${P4PBIN##*/}"

      # With systemd, we must independently confirm service stop,
      # waiting if needed.
      stopVerified=0
      i=0; while [[ "$i" -lt "$maxStopDelay" ]]; do
         if is_server_up p4p; then
            sleep 1
         else
            stopVerified=1
            break
         fi
         i+=1
      done
   else
      "$P4ProxyInitScript" stop >> "$LOGFILE" 2>&1
      stopVerified=1
   fi

   if [[ "$stopVerified" -eq 1 ]]; then
      log "Stopped ${P4PBIN##*/} server."
      return 0
   else
      die "Server ${P4PBIN##*/} did not stop after $maxStopDelay seconds."
   fi
}

#------------------------------------------------------------------------------
# Start p4d using systemd if configured for systemd. Otherwise call the
# underlying init script directly.
#
# Log the startup activity.
#
# This is a do-or-die function. It returns success upon successful server
# startup, or else dies.
#------------------------------------------------------------------------------
function start_p4d () {
   log "Starting the ${P4DBIN##*/} server."
   local -i maxStartDelay=${SDP_MAX_START_DELAY_P4D:-120}
   local -i startVerified=0
   local -i i=0

   if [[ -r "$P4DSystemdServiceFile" ]]; then
      { sudo systemctl start "${P4DBIN##*/}"; } ||\
         die "Failed to execute: sudo systemctl start ${P4DBIN##*/}"
   else
      "$P4DInitScript" start >> "$LOGFILE" 2>&1
   fi

   # Confirm that p4d started, waiting if needed.
   startVerified=0
   i=0; while [[ "$i" -lt "$maxStartDelay" ]]; do
      if is_server_up p4d; then
         startVerified=1
         break
      else
         sleep 1
      fi
      i+=1
   done

   if [[ "$startVerified" -eq 1 ]]; then
      log "Server ${P4DBIN##*/} started successfully."
      return 0
   else
      log "Error: Server ${P4DBIN##*/} did not start after $maxStartDelay seconds. Tailing $P4LOG:"
      tail "$P4LOG" >> "$LOGFILE" 2>&1
      die "Aborting due to failed p4d start."
   fi
}

#------------------------------------------------------------------------------
# Start p4broker using systemd if configured for systemd. Otherwise call the
# underlying init script directly.
#
# Log the startup activity.
#
# This is a do-or-die function. It returns success upon successful server
# startup, or else dies.
#------------------------------------------------------------------------------
function start_p4broker () {
   log "Starting the ${P4BROKERBIN##*/} server."
   local -i maxStartDelay=${SDP_MAX_START_DELAY_P4BROKER:-60}
   local -i startVerified=0
   local -i i=0

   if [[ -r "$P4BrokerSystemdServiceFile" ]]; then
      { sudo systemctl start "${P4BROKERBIN##*/}"; } ||\
         die "Failed to execute: sudo systemctl start ${P4BROKERBIN##*/}"
   else
      "$P4BrokerInitScript" start >> "$LOGFILE" 2>&1
   fi

   # Confirm that p4broker started, waiting if needed.
   startVerified=0
   i=0; while [[ "$i" -lt "$maxStartDelay" ]]; do
      if is_server_up p4broker; then
         startVerified=1
         break
      else
         sleep 1
      fi
      i+=1
   done

   if [[ "$startVerified" -eq 1 ]]; then
      log "Server ${P4BROKERBIN##*/} started successfully."
      return 0
   else
      die "Server ${P4BROKERBIN##*/} did not start after $maxStartDelay seconds."
   fi
}

#------------------------------------------------------------------------------
# Start p4p using systemd if configured for systemd. Otherwise call the
# underlying init script directly.
#
# Log the startup activity.
#
# This is a do-or-die function. It returns success upon successful server
# startup, or else dies.
#------------------------------------------------------------------------------
function start_p4p () {
   log "Starting the ${P4PBIN##*/} server."
   local -i maxStartDelay=${SDP_MAX_START_DELAY_P4P:-60}
   local -i startVerified=0
   local -i i=0

   if [[ -r "$P4ProxySystemdServiceFile" ]]; then
      { sudo systemctl start "${P4PBIN##*/}"; } ||\
         die "Failed to execute: sudo systemctl start ${P4PBIN##*/}"
   else
      "$P4ProxyInitScript" start >> "$LOGFILE" 2>&1
   fi

   # Confirm that p4p started, waiting if needed.
   startVerified=0
   i=0; while [[ "$i" -lt "$maxStartDelay" ]]; do
      if is_server_up p4p; then
         startVerified=1
         break
      else
         sleep 1
      fi
      i+=1
   done

   if [[ "$startVerified" -eq 1 ]]; then
      log "Server ${P4PBIN##*/} started successfully."
      return 0
   else
      die "Server ${P4PBIN##*/} did not start after $maxStartDelay seconds."
   fi
}

#------------------------------------------------------------------------------
# Do a front-door 'p4d admin journal' command to rotate the current/active
# journal file on the master server, starting a fresh new P4JOURNAL file.
#
# In a distributed topology with replicas/edge servers, this function must
# be called only on the master/commit server.
#------------------------------------------------------------------------------
function truncate_journal () {
   declare CheckpointFile="${CHECKPOINTS}/${P4SERVER}.ckp.${CHECKPOINTNUM}.gz"
   declare JournalFile="${CHECKPOINTS}/${P4SERVER}.jnl.${JOURNALNUM}"

   if [[ "$SERVERID" == "$P4MASTER_ID" ]]; then
      [[ -f "$CheckpointFile" ]] && \
         die "Checkpoint $CheckpointFile already exists, check the backup process."
      [[ -f "$JournalFile" ]] && \
         die "Journal $JournalFile already exists, check the backup process."

      log "Truncating journal..."
      # During journal rotation, either by a front-door 'p4 admin journal' or a
      # back-door 'p4d -jj', p4d does a copy-then-delete rather than an mv at
      # the OS level.  During rotation, the perforce server will pause
      # responses to clients (just as with a checkpoint), but this should be
      # for a short period of time even for large data sets, as the journal
      # typically represents a single day of metadata.
      # Curly braces capture output of 'time'.
      "$P4CBIN"/p4login -p "$P4MASTERPORT"
      { time "$P4BIN" -p "$P4MASTERPORT" admin journal; } >> "$LOGFILE" 2>&1 || { die "Journal rotation failed. Abort!"; }
      # The test below waits until the journal file exists in the checkpoints directory before proceeding.
      test=1
      while [[ $test != 0 ]]; do
         sleep 5
         if [[ -f "$JournalFile" ]]; then
            test=0
         fi
      done
      "$P4CBIN"/p4login
   else
      log "Warning: The truncate_journal () function should only be called on the master server. It is ignored on edge and replica replica servers."
   fi
}

#------------------------------------------------------------------------------
# Do a front-door 'p4 admin journal' command to rotate the current/active
# journal file on the master server from an edge or standby server, starting
# a fresh new P4JOURNAL file.
#
# In a distributed topology with edge and standby servers, this function can be
# used to trigger a journal rotation on the master/commit server. It's not meant
# to be used from the master server itself.
#------------------------------------------------------------------------------
function truncate_journal_on_master () {
   # Increment Edge journal number since the journal will increment on the master after calling journal rotation
   local EdgeJournalNum=$((JOURNALNUM + 1))
   local StandbyJournalNum=$((JOURNALNUM + 2)) # If using journalcopy, have to add 2 since live journal is in checkpoints folder
   local JournalFile=

   if [[ "$EDGESERVER" -eq 1 ]]; then
      # Refer to ckp/jnl files starting like (example ServerID=p4d_edge_nyc):
      # /p4/N/checkpoints.edge_nyc/p4_N.edge_nyc
      JournalFile="${CHECKPOINTS}.${SERVERID#p4d_}/${P4SERVER}.${SERVERID#p4d_}.jnl.${EdgeJournalNum}"
   elif [[ "$STANDBYSERVER" -eq 1 ]]; then
      JournalFile="${CHECKPOINTS}/${P4SERVER}.jnl.${StandbyJournalNum}"
   fi

   if [[ "$SERVERID" != "$P4MASTER_ID" ]]; then
      [[ -f "$JournalFile" ]] && \
         die "Journal $JournalFile already exists, check the backup process."

      log "Truncating journal on ${P4MASTERPORT}."
      # During journal rotation, p4d does a copy-then-delete rather than a
      # simple mv at the OS level. During rotation, the perforce server will
      # pause responses to clients, but this should be for a very short period
      # of time even for large data sets, as the journal typically represents
      # a single day of metadata.
      # Curly braces capture output of 'time'.
      "$P4CBIN"/p4login -p "$P4MASTERPORT"
      { time "$P4BIN" -p "$P4MASTERPORT" admin journal; } >> "$LOGFILE" 2>&1 || { die "Journal rotation failed. Abort!"; }
      # The test below waits until the journal file exists in the checkpoints directory before proceeding.
      test=1
      while [[ $test != 0 ]]; do
         sleep 5
         if [[ -f "$JournalFile" ]]; then
            test=0
         fi
      done
      "$P4CBIN"/p4login -service
   else
      log "Warning: truncate_journal_on_master () function should not be called on a master server. Ignoring."
   fi
}

#------------------------------------------------------------------------------
# Similar to truncate_journal() above, p4d_truncate_journal() is intended to be
# usable from the p4d_base init script, to allow journal rotation on p4d
# start.  As it may be called from the init script, it may be called on the
# master, a replica, or an edge server. However, it will only do the journal
# rotation if called on the master.
#------------------------------------------------------------------------------
function p4d_truncate_journal () {
   declare JournalFile="${CHECKPOINTS}/${P4SERVER}.jnl.${JOURNALNUM}"

   if [[ "$SERVERID" == "$P4MASTER_ID" ]]; then
      [[ -f "$JournalFile" ]] && \
         die "Journal $JournalFile already exists, check the backup process."
      log "Rotating journal prior to starting p4d."
      "$P4DBIN" -r "$P4ROOT" -J "$P4JOURNAL" -jj >> "$LOGFILE" 2>&1 ||\
         die "Failed to rotate journal. Aborting p4d server start."
   else
      log "Warning: The p4d_truncate_journal() function has no effect if called on a server other than the master. Ignoring."
   fi
}

#------------------------------------------------------------------------------
# Replay any and all numbered journal files into the offline databases.
#------------------------------------------------------------------------------
function replay_journals_to_offline_db () {
   local CheckpointsDir=
   local FilePrefix=
   local NumberedJournal=
   local JournalPrefix=

   log "Replay any unreplayed journals to the offline database."
   check_offline_db_usable

   # For the master server, we can safely rely on the SDP standard that the
   # journalPrefix is of the form '/p4/N/checkpoints/p4_N'. For replicas and
   # edge servers, determine the values dynamically based on the current
   # journalPrefix value for the given ServerID.
   if [[ "$SERVERID" == "$P4MASTER_ID" ]]; then
      # Refer to ckp/jnl files starting like: /p4/N/checkpoints/p4_N
      CheckpointsDir="${CHECKPOINTS}"
      FilePrefix="${P4SERVER}"
   else
      JournalPrefix="$("$P4DBIN" -r "$P4ROOT" -k db.config -jd - | grep "@${SERVERID}@ @journalPrefix@" | cut -d '@' -f 10)"
      if [[ -n "$JournalPrefix" ]]; then
         CheckpointsDir="${JournalPrefix%/*}"
         FilePrefix="${JournalPrefix##*/}"
      else
         log "Warning: Could not determine journalPrefix for ServerID $SERVERID."
         CheckpointsDir="${CHECKPOINTS}"
         FilePrefix="${P4SERVER}"
      fi
   fi

   for (( j=OFFLINEJNLNUM; j <= JOURNALNUM; j++ )); do
      NumberedJournal="${CheckpointsDir}/${FilePrefix}.jnl.${j}"
      log "Replay journal $NumberedJournal to offline db."
      rm -f "${OFFLINE_DB}/offline_db_usable.txt" >> "$LOGFILE" 2>&1
      # Curly braces capture output of 'time'.
      { time "$P4DBIN" -r "$OFFLINE_DB" -jr -f "$NumberedJournal"; } >> "$LOGFILE" 2>&1 || { die "Offline journal replay failed. Abort!"; }
      echo "Offline db file restored successfully." > "${OFFLINE_DB}/offline_db_usable.txt"
   done
}

#------------------------------------------------------------------------------
# Replay the live, active P4JOURNAL file into the offline databases.
#------------------------------------------------------------------------------
function replay_active_journal_to_offline_db () {
   log "Replay active journal to offline db."

   declare ActiveJournal=

   # On a standby server, the current/active journal is named /p4/N/logs/journal.<jnlNum>.
   # On the master and other server types, the active journal is $P4JOURNAL.
   if [[ "$STANDBYSERVER" -eq 1 ]]; then
      local _JNLNUM
      _JNLNUM=$("$P4DBIN" -r "$P4ROOT" -jd - db.counters 2>&1 | grep '@journal@' | cut -d "@" -f 8 2>> "$LOGFILE") || die "Cannot get $P4ROOT journal number. Abort!"
      ActiveJournal="$LOGS/journal.$_JNLNUM"
   else
      ActiveJournal="$P4JOURNAL"
   fi

   # Curly braces capture output of 'time'.
   { time "$P4DBIN" -r "$OFFLINE_DB" -jr -f "${ActiveJournal}"; } >> "$LOGFILE" 2>&1 || { die "Active Journal replay failed. Abort!"; }

}

#------------------------------------------------------------------------------
# Recreate offline databases from the latest checkpoint.
#------------------------------------------------------------------------------
function recreate_offline_db_files () {
   local CheckpointsDir=
   local FilePrefix=
   local LastCheckpointMD5=
   local LastCheckpoint=

   # For the master server, we can safely rely on the SDP standard that the journalPrefix
   # is of the form '/p4/N/checkpoints/p4_N'. For replicas and edge servers, determine
   # the values dynamically based on the current journalPrefix value for the given ServerID.
   if [[ "$SERVERID" == "$P4MASTER_ID" ]]; then
      # Refer to ckp/jnl files starting like: /p4/N/checkpoints/p4_N
      CheckpointsDir="${CHECKPOINTS}"
      FilePrefix="${P4SERVER}"
   else
      JournalPrefix="$("$P4DBIN" -r "$P4ROOT" -k db.config -jd - | grep "@${SERVERID}@ @journalPrefix@" | cut -d '@' -f 10)"
      if [[ -n "$JournalPrefix" ]]; then
         CheckpointsDir="${JournalPrefix%/*}"
         FilePrefix="${JournalPrefix##*/}"
      else
         log "Warning: Could not determine journalPrefix for ServerID $SERVERID."
         CheckpointsDir="${CHECKPOINTS}"
         FilePrefix="${P4SERVER}"
      fi
   fi

   if [[ -z "$(ls "${CheckpointsDir}/${FilePrefix}".ckp.*.md5)" ]]; then
      ckp_complete

      if [[ "$SERVERID" == "$P4MASTER_ID" ]]; then
         die "No checkpoints found in $CheckpointsDir with prefix $FilePrefix.  Consider running 'live_checkpoint.sh $SDP_INSTANCE'."
      else
         die "No checkpoints found in $CheckpointsDir with prefix $FilePrefix."
      fi
   fi

   # shellcheck disable=SC2012
   LastCheckpointMD5=$(ls -t "${CheckpointsDir}/${FilePrefix}".ckp.*.md5 | head -1)
   [[ -n "$LastCheckpointMD5" ]] || \
      die "Could not find *.md5 file for latest checkpoint. Abort!"

   # shellcheck disable=SC2129
   rm -f "${OFFLINE_DB}"/offline_db_usable.txt >> "$LOGFILE" 2>&1
   rm -f "${OFFLINE_DB}"/db.* >> "$LOGFILE" 2>&1
   rm -f "${OFFLINE_DB}"/save/db.* >> "$LOGFILE" 2>&1

   # Account for the idiosyncrasy that MD5 files for checkpoints may look
   # like p4_N.ckp.gz.md5 or p4_N.ckp.md5.
   if [[ "$LastCheckpointMD5" == *".gz.md5" ]]; then
      LastCheckpoint="${LastCheckpointMD5%.md5}"
   else
      LastCheckpoint="${LastCheckpointMD5%.md5}.gz"
   fi

   [[ -r "$LastCheckpoint" ]] || \
      die "Missing last checkpoint file: $LastCheckpoint. Abort!"

   log "Recovering from last full checkpoint, $LastCheckpoint."
   # Curly braces capture output of 'time'.
   { time "$P4DBIN" -r "$OFFLINE_DB" -jr -z "${LastCheckpoint}"; } >> "$LOGFILE" 2>&1 || { die "Restore of checkpoint to $OFFLINE_DB failed!"; }
   echo "Offline db file restored successfully." > "${OFFLINE_DB}/offline_db_usable.txt"
}

#------------------------------------------------------------------------------
# Take a live checkpoint from db.* files in P4ROOT.
#------------------------------------------------------------------------------
function checkpoint () {
   local CheckpointsDir=
   local FilePrefix=

   log "Create a new checkpoint from live db files in $P4ROOT."

   if [[ "$SERVERID" == "$P4MASTER_ID" ]]; then
      # Refer to ckp/jnl files starting like: /p4/N/checkpoints/p4_N
      CheckpointsDir="${CHECKPOINTS}"
      FilePrefix="${P4SERVER}"
   else
      die "Live checkpoints can only be run on the master server."
   fi

   # Curly braces capture output of 'time'.
   { time "$P4DBIN" -r "$P4ROOT" -jc -Z "${CheckpointsDir}/${FilePrefix}"; } >> "$LOGFILE" 2>&1 || { die "ERROR - New live checkpoint failed!"; }
}

#------------------------------------------------------------------------------
# Take a checkpoint from the ROOTDIR, typically either /p4/N/root or
# /p4/N/offline_db.
#------------------------------------------------------------------------------
function dump_checkpoint () {
   declare CheckpointsDir=
   declare NewCheckpoint=
   declare NewCheckpointMD5=
   declare FilePrefix=
   declare JournalPrefix=
   declare -i DoSnapshot=0
   declare -i SnapshotOK=1
   declare -i CheckpointOK=1

   # shellcheck disable=SC2153
   log "Dump out new checkpoint from db files in $ROOTDIR."

   # For the master server, we can safely rely on the SDP standard that the journalPrefix
   # is of the form '/p4/N/checkpoints/p4_N'. For replicas and edge servers, determine
   # the values dynamically based on the current journalPrefix value for the given ServerID.
   if [[ "$SERVERID" == "$P4MASTER_ID" ]]; then
      # Refer to ckp/jnl files starting like: /p4/N/checkpoints/p4_N
      CheckpointsDir="${CHECKPOINTS}"
      FilePrefix="${P4SERVER}"
   else
      JournalPrefix="$("$P4DBIN" -r "$P4ROOT" -k db.config -jd - | grep "@${SERVERID}@ @journalPrefix@" | cut -d '@' -f 10)"
      if [[ -n "$JournalPrefix" ]]; then
         CheckpointsDir="${JournalPrefix%/*}"
         FilePrefix="${JournalPrefix##*/}"
      else
         log "Warning: Could not determine journalPrefix for ServerID $SERVERID."
         CheckpointsDir="${CHECKPOINTS}"
         FilePrefix="${P4SERVER}"
      fi
   fi

   NewCheckpoint="${CheckpointsDir}/${FilePrefix}.ckp.${CHECKPOINTNUM}.gz"
   NewCheckpointMD5="${NewCheckpoint}.md5"

   if [[ -r "$NewCheckpoint" && -r "$NewCheckpointMD5" ]]; then
      log "Warning: Skipping generation of existing checkpoint $NewCheckpoint.\\nVerified MD5 file exists: $NewCheckpointMD5."
      return
   fi

   # Curly braces capture output of 'time'.
   if { time "$P4DBIN" -r "$ROOTDIR" -jd -z "${CheckpointsDir}/${FilePrefix}.ckp.${CHECKPOINTNUM}.gz"; } >> "$LOGFILE" 2>&1; then
      CheckpointOK=1
   else
      CheckpointOK=0
   fi

   if [[ -n "${SNAPSHOT_SCRIPT:-}" ]]; then
      DoSnapshot=1
      log "Calling site-specific snapshot script: $SNAPSHOT_SCRIPT"
      if "$SNAPSHOT_SCRIPT" >> "$LOGFILE" 2>&1; then
         SnapshotOK=1
      else
         SnapshotOK=0
      fi
   fi

   if [[ "$DoSnapshot" -eq 0 ]]; then
      if [[ "$CheckpointOK" -eq 1 ]]; then
         log "New checkpoint dump succeeded."
      else
         die "New checkpoint dump FAILED."
      fi
   else
      if [[ "$CheckpointOK" -eq 0 && "$SnapshotOK" -eq 0 ]]; then
         die "Both checkpoint dump and snapshot FAILED."
      elif [[ "$CheckpointOK" -eq 1 && "$SnapshotOK" -eq 0 ]]; then
         die "New checkpoint dump succeeded, but snapshot FAILED."
      elif [[ "$CheckpointOK" -eq 0 && "$SnapshotOK" -eq 1 ]]; then
         die "New checkpoint dump FAILED, but snapshot succeeded."
      else
         log "New checkpoint dump and snapshot succeeded."
      fi
   fi
}

#------------------------------------------------------------------------------
# Compare journal numbers between live and offline databases, to ensure
# they can be safely swapped out.
#------------------------------------------------------------------------------
function compare_journal_numbers () {

   local _OFFLINEJNLNUM
   _OFFLINEJNLNUM=$("$P4DBIN" -r "$OFFLINE_DB" -jd - db.counters 2>&1 | grep '@journal@' | cut -d "@" -f 8 2>> "$LOGFILE") || die "Cannot get $OFFLINE_DB journal number. Abort!"
   check_journalnum "$_OFFLINEJNLNUM"

   # Get the journal number of the root database
   if [[ ! -f "$P4ROOT/db.counters" ]]; then
      die "$P4ROOT database not found. Something is seriously wrong since the server was just running a minute ago! Contact support@perforce.com"
   fi
   local _JNLNUM
   _JNLNUM=$("$P4DBIN" -r "$P4ROOT" -jd - db.counters 2>&1 | grep '@journal@' | cut -d "@" -f 8 2>> "$LOGFILE") || die "Cannot get $P4ROOT journal number. Abort!"
   check_journalnum "$_JNLNUM"

   if [[ "$_JNLNUM" -gt "$_OFFLINEJNLNUM" ]]; then
      log "$P4ROOT journal number is: $_JNLNUM"
      log "$OFFLINE_DB journal number is: $_OFFLINEJNLNUM"
      die "$OFFLINE_DB journal number is less than $P4ROOT, cannot switch."
   fi
}

#------------------------------------------------------------------------------
# Swap out live db.* database files in P4ROOT with those in offline_db.
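#
# For example, with the typical SDP layout where /p4/N/root and
# /p4/N/offline_db are symlinks to separate db1/db2 metadata directories,
# the swap re-points root at the former offline_db target and vice versa.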
#------------------------------------------------------------------------------
function switch_db_files () {
   log "Calling 'verify_sdp.sh -L off' before swapping db.* files."
   "$P4CBIN"/verify_sdp.sh -skip excess,crontab -L off >> "$LOGFILE" 2>&1 ||\
      die "Error: Cannot confirm all is well with $P4CBIN/verify_sdp.sh. Aborting"

   # Compare the Offline and Master journal numbers before switching to make
   # sure they match.
   compare_journal_numbers

   log "Switching root and offline_db links."
   [[ -d "${P4ROOT}"/save ]] || mkdir -p "${P4ROOT}"/save >> "$LOGFILE" 2>&1

   # shellcheck disable=SC2129
   echo "P4ROOT is not available during switch_db_files() processing." > "$P4ROOT/P4ROOT_not_usable.txt" 2>> "$LOGFILE"
   echo "P4ROOT is not available during switch_db_files() processing." > "$OFFLINE_DB/P4ROOT_not_usable.txt" 2>> "$LOGFILE"

   # shellcheck disable=SC2129
   rm -f "${P4ROOT}"/save/db.* >> "$LOGFILE" 2>&1
   rm -rf "${P4ROOT}"/server.locks >> "$LOGFILE" 2>&1
   mv "${P4ROOT}"/db.* "${P4ROOT}"/save/. >> "$LOGFILE" 2>&1

   if [[ -r "$P4ROOT"/license ]]; then
      mv "${P4ROOT}"/license "${OFFLINE_DB}/." >> "$LOGFILE" 2>&1
   fi

   if [[ -n "$(ls "$P4ROOT"/license* 2>/dev/null)" ]]; then
      mv "${P4ROOT}"/license* "${OFFLINE_DB}/." >> "$LOGFILE" 2>&1
   fi

   if [[ -r "${P4ROOT}"/rdb.lbr ]]; then
      mv "${P4ROOT}"/rdb.lbr "${OFFLINE_DB}/." >> "$LOGFILE" 2>&1
   fi

   if [[ -n "$(ls "$P4ROOT"/state* 2>/dev/null)" ]]; then
      mv "${P4ROOT}"/state* "${OFFLINE_DB}/." >> "$LOGFILE" 2>&1
   fi

   if [[ -r "${P4ROOT}"/server.id ]]; then
      mv "${P4ROOT}"/server.id "${OFFLINE_DB}/." >> "$LOGFILE" 2>&1
   fi

   if [[ -n "$(ls "$P4ROOT"/server.id* 2>/dev/null)" ]]; then
      mv "${P4ROOT}"/server.id* "${OFFLINE_DB}/." >> "$LOGFILE" 2>&1
   fi

   rm -f "${OFFLINE_DB}/offline_db_usable.txt" >> "$LOGFILE" 2>&1
   LinkOfflineDB="$(readlink "$OFFLINE_DB")"
   LinkP4ROOT="$(readlink "$P4ROOT")"
   unlink "$OFFLINE_DB"
   unlink "$P4ROOT"

   ln -s "$LinkOfflineDB" "$P4ROOT" >> "$LOGFILE" 2>&1 ||\
      die "Link of $LinkOfflineDB to $P4ROOT failed."

   ln -s "$LinkP4ROOT" "$OFFLINE_DB" >> "$LOGFILE" 2>&1 ||\
      die "Link of $LinkP4ROOT to $OFFLINE_DB failed."

   rm -f "$P4ROOT/P4ROOT_not_usable.txt" >> "$LOGFILE" 2>&1
   rm -f "$OFFLINE_DB/P4ROOT_not_usable.txt" >> "$LOGFILE" 2>&1
}

#------------------------------------------------------------------------------
# Function: trim_log_file ($LogToTrim, $MaxLines)
#
# For log files expected to be short, keep them at a max size.
#
# When they get too big, trim them from the top first, so the most recent
# output is retained.
#
# This function treats its processing as non-essential; most errors are
# silently ignored and output discarded. Only error output related to replacing
# the original log is retained in LOGFILE or displayed.
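#
# Example (hypothetical log path): keep a p4login log to at most 1000 lines:
#    trim_log_file "$LOGS/p4login.log" 1000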
#------------------------------------------------------------------------------
function trim_log_file () {
   local LogToTrim="${1:-}"
   local MaxLines="${2:-5000}"
   local TmpFile=
   local Lines=

   [[ -w "$LogToTrim" ]] || return

   # Abort if MaxLines isn't numeric.
   [[ "$MaxLines" =~ ^[0-9]+$ ]] || return

   TmpFile="${LogToTrim}.trimming.$$.$RANDOM"
   Lines=$(wc -l "$LogToTrim")
   Lines=${Lines%% *}

   # Confirm Lines is a number, else just abort.
   [[ "$Lines" =~ ^[0-9]+$ ]] || return

   # If the file isn't big enough to need trimming, abort.
   [[ "$Lines" -gt "$MaxLines" ]] || return

   log "Trimming log $LogToTrim from $Lines to $MaxLines lines."

   # If the trimming fails, discard output and just return.
   if tail -"$MaxLines" "$LogToTrim" > "$TmpFile" 2>/dev/null; then
      if [[ -n "${LOGFILE:-}" ]]; then
         mv -f "$TmpFile" "$LogToTrim" >> "$LOGFILE" 2>&1
      else
         mv -f "$TmpFile" "$LogToTrim"
      fi
   else
      return
   fi
}

#------------------------------------------------------------------------------
# Rotate specified log files, and compress with gzip.
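#
# Examples (as used in rotate_last_run_logs() below):
#    rotate_log_file "$LOGFILE"       # Rotate the current script's log.
#    rotate_log_file "log" ".gz"      # Rotate the server log, then gzip it.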
#------------------------------------------------------------------------------
function rotate_log_file () {
   local LogToRotate="${1:-}"
   local GzExt="${2:-}"
   local -i i=1
   local Datestamp=
   local RotatedLog=
   local RotatedZippedLog=

   [[ -n "$LogToRotate" ]] || return

   if [[ -n "${LOGFILE:-}" ]]; then
      pushd "$LOGS" > /dev/null 2>> "$LOGFILE" || die "Could not cd to: $LOGS"
   else
      pushd "$LOGS" > /dev/null || die "Could not cd to: $LOGS"
   fi

   Datestamp=$(date +'%Y-%m-%d_%H-%M-%S')
   RotatedLog="${LogToRotate}.${Datestamp}"

   if [[ -f "${LogToRotate}" ]]; then
      if [[ -n "${LOGFILE:-}" ]]; then
         mv -f "${LogToRotate}" "${RotatedLog}" >> "$LOGFILE" 2>&1

         if [[ -n "$GzExt" ]]; then
            RotatedZippedLog="${RotatedLog}${GzExt}"

            # If needed, move existing zipped log aside.
            if [[ -e "$RotatedZippedLog" ]]; then
               while [[ -e "${LogToRotate}.${Datestamp}.${i}${GzExt}" ]]; do
                  i+=1
               done
               log "Moving pre-existing $RotatedZippedLog aside to ${LogToRotate}.${Datestamp}.${i}${GzExt}" >> "$LOGFILE" 2>&1
               mv -f "$RotatedZippedLog" "${LogToRotate}.${Datestamp}.${i}${GzExt}" >> "$LOGFILE" 2>&1
            fi

            gzip "$RotatedLog" >> "$LOGFILE" 2>&1
         fi
      else
         mv -f "${LogToRotate}" "${RotatedLog}"

         if [[ -n "$GzExt" ]]; then
            RotatedZippedLog="${RotatedLog}${GzExt}"

            # If needed, move existing zipped log aside.
            if [[ -e "$RotatedZippedLog" ]]; then
               while [[ -e "${LogToRotate}.${Datestamp}.${i}${GzExt}" ]]; do
                  i+=1
               done
               log "Moving pre-existing $RotatedZippedLog aside to ${LogToRotate}.${Datestamp}.${i}${GzExt}"
               mv -f "$RotatedZippedLog" "${LogToRotate}.${Datestamp}.${i}${GzExt}"
            fi

            gzip "$RotatedLog"
         fi
      fi
   fi

   if [[ -n "${LOGFILE:-}" ]]; then
      popd > /dev/null 2>> "$LOGFILE" || die "Could not cd to: $OLDPWD"
   else
      popd > /dev/null || die "Could not cd to: $OLDPWD"
   fi
}

#------------------------------------------------------------------------------
# At the start of each run for live_checkpoint.sh, daily_checkpoint.sh, and
# recreate_db_checkpoint.sh, before *any* logging activity occurs, rotate the
# logs from the most recent prior run, always named "checkpoint.log" or "log".
#------------------------------------------------------------------------------
function rotate_last_run_logs () {
   # Rotate prior log file for the current script.
   rotate_log_file "$LOGFILE"

   # Rotate prior server log.
   rotate_log_file "log" ".gz"

   # Rotate prior broker log.
   rotate_log_file "p4broker.log" ".gz"

   # Rotate prior audit log.
   rotate_log_file "audit.log" ".gz"
}

#------------------------------------------------------------------------------
# Remove log files matching a specified name prefix, preserving a specified
# number of the recent logs.
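#
# Example (as used in remove_old_logs() below):
#    remove_log_files "checkpoint.log" "$KEEPJNLS"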
#------------------------------------------------------------------------------
function remove_log_files () {
   REMOVE_LOGNAME=$1
   KEEPNUM=$2

   # shellcheck disable=SC2012
   for I_LOGFILE in $(ls -t "${REMOVE_LOGNAME:?}"* 2>/dev/null | $AWK "NR > $KEEPNUM"); do
      log "rm -f $I_LOGFILE"
      rm -f "$I_LOGFILE"
   done
}

#------------------------------------------------------------------------------
# Remove old logs.
#------------------------------------------------------------------------------
function remove_old_logs () {
   # Remove old Checkpoint Logs
   # Use KEEPJNLS rather than KEEPLOGS, so we keep the same number
   # of checkpoint logs as we keep checkpoints.
   pushd "$LOGS" > /dev/null 2>> "$LOGFILE" || die "Could not cd to: $LOGS"

   if [[ "$KEEPJNLS" -eq 0 ]]; then
      log "Skipping cleanup of old checkpoint logs because KEEPJNLS is set to 0."
   else
      log "Deleting old checkpoint logs.  Keeping latest $KEEPJNLS, per KEEPJNLS setting in p4_vars."
      remove_log_files "checkpoint.log" "$KEEPJNLS"
   fi

   if [[ "$KEEPLOGS" -eq 0 ]]; then
      log "Skipping cleanup of old server logs because KEEPLOGS is set to 0."
   else
      log "Deleting old server logs.  Keeping latest $KEEPLOGS, per KEEPLOGS setting in p4_vars."
      remove_log_files "log" "$KEEPLOGS"
      remove_log_files "p4broker.log" "$KEEPLOGS"
      remove_log_files "audit.log" "$KEEPLOGS"
      remove_log_files "sync_replica.log" "$KEEPLOGS"
      remove_log_files "recreate_offline_db.log" "$KEEPLOGS"
      remove_log_files "upgrade.log" "$KEEPLOGS"
      remove_log_files "p4login" "$KEEPLOGS"
      remove_log_files "p4verify.log" "$KEEPLOGS"
      remove_log_files "journal_watch.log" "$KEEPLOGS"
      remove_log_files "refresh_P4ROOT_from_offline_db.log" "$KEEPLOGS"
      remove_log_files "purge_revisions.log" "$KEEPLOGS"
   fi
   popd > /dev/null 2>>"$LOGFILE" || die "Could not cd to: $OLDPWD"
}

#------------------------------------------------------------------------------
# Set the SDP Checkpoint counter to indicate last successful SDP checkpoint
# operation. For standby servers, set the SDP Checkpoint counter on the master.
#------------------------------------------------------------------------------
function set_counter() {
   "$P4CBIN/p4login"

   if [[ "$EDGESERVER" -eq 1 || "$STANDBYSERVER" -eq 1 ]]; then
      "$P4BIN" -u "$P4USER" -p "$P4MASTERPORT" counter "LastSDPCheckpoint.$SERVERID" "$(date +'%s (%Y/%m/%d %H:%M:%S %z %Z)')" > /dev/null
   else
      "$P4BIN" -u "$P4USER" -p "$P4PORT" counter "LastSDPCheckpoint.$SERVERID" "$(date +'%s (%Y/%m/%d %H:%M:%S %z %Z)')" > /dev/null
   fi
}

#------------------------------------------------------------------------------
# This is the function that is called to run the individual checkpoint
# dump or restores during a parallel run.
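#
# Example (as used in dump_parallel_ckp() below; $cmd_log must be set first):
#    cmd_log="${LOG}-db.have.log"
#    parallel_checkpoint_cmd "$P4DBIN" -r "$OFFLINE_DB" -jd "$CKPTMPDIR/$P4SERVER.ckp.db.have" db.have &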
#------------------------------------------------------------------------------
function parallel_checkpoint_cmd ()
{
  echo "=== Running $* on $(date)." >> "$cmd_log"
  "$@" >> "$cmd_log" 2>&1
  status=$?
  if [[ "$status" -ne 0 ]]; then
    CkpFailed=1
  fi
  echo "=== $* completed on $(date)." >> "$cmd_log"
}

#------------------------------------------------------------------------------
# Check which processes from the parallel dump or restore are still running,
# updating the 'ids' array and decrementing 'thread' for each one that finished.
#------------------------------------------------------------------------------
function check_running ()
{
   sleep 30
   # Loop over the thread process IDs and see if any have finished.
   spot=0
   run=()
   for p in "${ids[@]}"; do
      # Reset 'running' each pass so a value left over from a previous
      # iteration is not reused when "$p" is empty.
      running=
      if [[ -n "$p" ]]; then
        # shellcheck disable=SC2009
        running=$(ps cax | grep "$p")
      fi
      if [[ -n "$running" ]]; then
         run[$spot]=$p
         spot=$((spot+1))
      else
         thread=$((thread-1))
      fi
   done

   if [[ "$spot" -ne 0 ]]; then
      ids=("${run[@]}")
   else
      ids=()
   fi
}
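
#------------------------------------------------------------------------------
# Note (illustrative alternative, not used by this script): when a stored PID
# is known to be exact, 'kill -0' is a lighter-weight liveness test than
# grepping 'ps' output:
#
#   if kill -0 "$p" 2>/dev/null; then
#      run[$spot]=$p; spot=$((spot+1))
#   else
#      thread=$((thread-1))
#   fi
#------------------------------------------------------------------------------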

#------------------------------------------------------------------------------
# Dump db files in parallel from offline_db
#------------------------------------------------------------------------------
function dump_parallel_ckp ()
{
   db_files=() # Clear array
   thread=0  # Set current threads to 0

   cd "$OFFLINE_DB" || die "Could not cd to: $OFFLINE_DB"

   [[ -d "${CKPTMPDIR}" ]] || mkdir "${CKPTMPDIR}"
   rm -f "${CKPTMPDIR:?}"/*

   # Build array of db_files in offline_db
   for f in db.*; do
      db_files+=( "$f" ) # Append db file to the array
   done

   # Loop over db_files, dumping up to $Threads of them in parallel, as
   # specified on the command line.
   for f in "${db_files[@]}"; do
      # If we are at the thread limit, wait until a running dump finishes.
      # shellcheck disable=SC2154
      while [[ $thread -ge "$Threads" ]]; do
         check_running
      done

      CkpCmd="${P4DBIN} -r ${OFFLINE_DB} -jd ${CKPTMPDIR}/$P4SERVER.ckp.${f} $f"
      echo "$CkpCmd" > greppattern.txt
      cmd_log="${LOG}-${f}.log"
      parallel_checkpoint_cmd "${P4DBIN}" -r "${OFFLINE_DB}" -jd "${CKPTMPDIR}/$P4SERVER.ckp.${f}" "$f" &
      sleep 1
      # The PID is the second field of 'ps -ef' output.
      # shellcheck disable=SC2009
      pid=$(ps -ef | grep -F -f greppattern.txt | awk '{print $2;}')
      if [[ -n "$pid" ]]; then
         ids[$thread]=$pid # Add the process ID to the array of running processes.
         thread=$((thread+1)) # Add one to the thread count and start a new dump.
      fi
   done

   rm greppattern.txt

   # Now that all dumps have been started, wait for every process to finish before continuing.
   while [[ $thread -gt 0 ]]; do
      check_running
   done

   cd "$CKPTMPDIR" || die "Could not cd to: $CKPTMPDIR"
   rm -f ./*.md5

   # Now that the processes have finished, combine the per-file logs into the main log.
   for f in "${db_files[@]}"; do
      if [[ -f "${LOG}-${f}.log" ]]; then
         cat "${LOG}-${f}.log" >> "$LOGFILE"
         rm -f "${LOG}-${f}.log"
      fi
   done

   if [[ "$CkpFailed" -ne 0 ]]; then
      # shellcheck disable=SC2034
      StatusMessage="Error: Checkpoint failed.  Review the log [$LOGFILE]."
      ExitCode=1
   fi

   if [[ "$ExitCode" -ne 0 ]]; then
      die "New checkpoint dump failed!"
   fi

   msg "Completed parallel checkpoint at $(date)."
}
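
#------------------------------------------------------------------------------
# Example usage (illustrative sketch; Threads, LOG, CkpFailed, and ExitCode
# are normally set by the calling checkpoint script, and the values shown are
# hypothetical):
#
#   Threads=4
#   LOG="${LOGS}/checkpoint.log"
#   CkpFailed=0
#   ExitCode=0
#   dump_parallel_ckp
#------------------------------------------------------------------------------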

#------------------------------------------------------------------------------
# Restore from db files that have been extracted from a parallel checkpoint tgz file.
#------------------------------------------------------------------------------
function restore_parallel_ckp ()
{
   db_files=() # Clear array
   thread=0  # Set current threads to 0

   [[ -d "${CKPTMPDIR}" ]] || die "$CKPTMPDIR doesn't exist! Restore failed."
   cd "$CKPTMPDIR" || die "Could not cd to: $CKPTMPDIR"

   rm -f "${OFFLINE_DB}"/offline_db_usable.txt >> "$LOGFILE" 2>&1
   rm -f "${OFFLINE_DB}"/db.* >> "$LOGFILE" 2>&1 

   # Build array of db_files in checkpoint temp dir
   for f in *; do
      db_files+=( "$f" ) # Append db file to the array
   done

   # Loop over db_files, replaying up to $Threads of them in parallel, as
   # specified on the command line.
   for f in "${db_files[@]}"; do
      # If we are at the thread limit, wait until a running replay finishes.
      while [[ $thread -ge "$Threads" ]]; do
         check_running
      done

      CkpCmd="${P4DBIN} -r ${OFFLINE_DB} -jr ${CKPTMPDIR}/${f}"
      echo "$CkpCmd" > greppattern.txt
      cmd_log="${LOG}-${f}.log"
      parallel_checkpoint_cmd "${P4DBIN}" -r "${OFFLINE_DB}" -jr "${CKPTMPDIR}/${f}" &
      sleep 1
      # The PID is the second field of 'ps -ef' output.
      # shellcheck disable=SC2009
      pid=$(ps -ef | grep -F -f greppattern.txt | awk '{print $2;}')
      if [[ -n "$pid" ]]; then
         ids[$thread]=$pid # Add the process ID to the array of running processes.
         thread=$((thread+1)) # Add one to the thread count and start a new replay.
      fi
   done

   rm greppattern.txt

   # Now that all replays have been started, wait for every process to finish before continuing.
   while [[ $thread -gt 0 ]]; do
      check_running
   done

   # Now that the processes have finished, combine the per-file logs into the main log.
   for f in "${db_files[@]}"; do
      if [[ -f "${LOG}-${f}.log" ]]; then
         cat "${LOG}-${f}.log" >> "$LOGFILE"
         rm -f "${LOG}-${f}.log"
      fi
   done

   if [[ "$CkpFailed" -ne 0 ]]; then
      # shellcheck disable=SC2034
      StatusMessage="Error: Checkpoint Restore failed.  Review the log [$LOGFILE]."
      ExitCode=1
   fi

   if [[ "$ExitCode" -ne 0 ]]; then
      die "Restore of checkpoint dump failed!"
   fi

   echo "Offline db file restored successfully." > "${OFFLINE_DB}/offline_db_usable.txt"
   msg "Completed parallel checkpoint restore at $(date)."
}
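
#------------------------------------------------------------------------------
# Example usage (illustrative sketch; assumes the per-table checkpoint files
# have already been extracted into $CKPTMPDIR, e.g. by extract_tar_ckp below,
# and that Threads, LOG, CkpFailed, and ExitCode are set as for the dump):
#
#   Threads=4
#   restore_parallel_ckp
#------------------------------------------------------------------------------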

#------------------------------------------------------------------------------
# Create a tgz of the temporary checkpoint folder that contains individually dumped database checkpoints.
#------------------------------------------------------------------------------
function create_tar_ckp ()
{
   cd "$CHECKPOINTS" || die "Could not cd to: $CHECKPOINTS"
   Ckptgz=${P4SERVER}.ckp.parallel.${CHECKPOINTNUM}.tgz
   [[ -f "$Ckptgz" ]] && die "$Ckptgz file already exists. Check the backup process!"
   { time tar cvzf "$Ckptgz" "${CKPTMPDIR}"; } >> "$LOGFILE" 2>&1 || { die "Failed to create tgz checkpoint file!"; }
   rm -rf "${CKPTMPDIR:?}"/*
}
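
#------------------------------------------------------------------------------
# Example (illustrative; the checkpoint number 123 is hypothetical). After
# create_tar_ckp, the archive lands in $CHECKPOINTS:
#
#   CHECKPOINTNUM=123
#   create_tar_ckp
#   ls "$CHECKPOINTS/${P4SERVER}.ckp.parallel.123.tgz"
#------------------------------------------------------------------------------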

#------------------------------------------------------------------------------
# Extract a tgz of the temporary checkpoint folder that contains individually dumped database checkpoints.
#------------------------------------------------------------------------------
function extract_tar_ckp ()
{
   cd "$CHECKPOINTS" || die "Could not cd to: $CHECKPOINTS"
   Ckptgz=$1
   [[ -f $Ckptgz ]] || die "$Ckptgz doesn't exist!"
   { time tar xvzf "$Ckptgz"; } >> "$LOGFILE" 2>&1 || { die "Failed to extract $Ckptgz checkpoint file!"; }
}
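
#------------------------------------------------------------------------------
# Example usage (illustrative; the checkpoint number 123 is hypothetical):
#
#   extract_tar_ckp "${P4SERVER}.ckp.parallel.123.tgz"
#------------------------------------------------------------------------------
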
# Change User Description Committed
#83 31165 Russell C. Jackson (Rusty) Increased the number of checkpoint threads to 16
#82 30545 Russell C. Jackson (Rusty) Increased number of journals to leave in the journals directory to 5.
#81 30542 Russell C. Jackson (Rusty) Correct/Added the NumCheckPointThreads parameter in a few places.
#80 30234 Russell C. Jackson (Rusty) Changed comparison to double parenthesis
#79 30232 Russell C. Jackson (Rusty) Corrected typo for FIRSTJOURNALNUM
#78 30230 Russell C. Jackson (Rusty) Remove extra (
#77 30228 Russell C. Jackson (Rusty) Added check to only replay if first journal number is lower than the journal counter.
#76 30208 Russell C. Jackson (Rusty) Fixed journal cleanup to include the .gz extension.
#75 30081 Russell C. Jackson (Rusty) Updated for sync_replica.sh to reduce noise and handle multiple journal folders.
#74 30056 Russell C. Jackson (Rusty) Added multifile parallel checkpoint support.
#73 29979 Russell C. Jackson (Rusty) Working backup_functions.sh with new journal location and gzip features.
#72 29977 Russell C. Jackson (Rusty) Added missing quote.
#71 29965 Russell C. Jackson (Rusty) Changes to directories, commands and journalPrefix to implement new process to keep journal rotation to the same volume as the logs volume, but to still move journals to the checkpoints directory so they get backed up.
Also compresses journals to save space.
#70 29922 Russell C. Jackson (Rusty) Start of work on gzipping and moving journals.
#69 29772 Russell C. Jackson (Rusty) Added touch to make sure the ls command works.
#68 29762 Russell C. Jackson (Rusty) Changed method of getting journal numbers to work in both checkpoints and journals.rep
#67 29385 Russell C. Jackson (Rusty) Added db.checkpoint.threads and -N to the live checkpoint function.
#66 29383 Russell C. Jackson (Rusty) Correct typos
#65 29381 Russell C. Jackson (Rusty) Better clean up commands.
#64 29379 Russell C. Jackson (Rusty) Switched to more accurate method of finding the latest journal file.
#63 29375 Russell C. Jackson (Rusty) Changed everything in cleanup to use find commands to only find the types of files expected.
#62 29373 Russell C. Jackson (Rusty) Modified to clean both types of checkpoints, and to remove the OK file as well.
#61 29356 Russell C. Jackson (Rusty) Added number of threads to use for restoring a checkpoint to the offline database.
#60 29353 Russell C. Jackson (Rusty) Updated to handle new parallel checkpointing in 2022.2
#59 29340 Russell C. Jackson (Rusty) Changed ls to use -t instead of -r since the -r sorting messes up when
the journal numbers roll over from 9 to 10, 99 to 100, and so on.
#58 29149 Russell C. Jackson (Rusty) Changed ls -t to ls -r in replay_journals to get the correct journal number.
#57 29147 Russell C. Jackson (Rusty) Added logging to replay_journals_to_offlinedb
#56 29139 Russell C. Jackson (Rusty) Added check for 2GB of free space in offline db before allowing scripts to proceed.
#55 28947 Russell C. Jackson (Rusty) Added handling of gzipped journal files.
#54 28450 Russell C. Jackson (Rusty) Removed +1 on offlinecheckpointnum and moved function to run right before dump_checkpoint.
#53 28005 Russell C. Jackson (Rusty) Fixed missing OFFLINECHECKPOINTNUM issue and fixed entries in template.p4.crontab.
#52 28000 Russell C. Jackson (Rusty) Dropped the tr -d 015 because it was deleting characters it shouldn't.
#51 27970 Russell C. Jackson (Rusty) Added JOURNALNUM to log file names for better association with the journals, and added rotation of the replica_cleanup logs.
#50 27813 Russell C. Jackson (Rusty) Added a cp of sdp_server_type.txt from root to offline before switch.
#49 27787 Russell C. Jackson (Rusty) Added tr to the log to strip carriage returns from log files.
#48 27716 Russell C. Jackson (Rusty) Added get_offline_journalnum to replay_journals_to_offline_db to avoid missing a call to that function.
#47 27653 Russell C. Jackson (Rusty) Removed call to verify_sdp.sh
#46 27647 Russell C. Jackson (Rusty) Added check for forward-standby in is_standby function
#45 27645 Russell C. Jackson (Rusty) Removed -w check on dirs since it is not working on some storage.
#44 27447 Russell C. Jackson (Rusty) Changed truncate journal functions to use sdp_server_type.txt type for more accurate check of the master server.
#43 27244 Russell C. Jackson (Rusty) Added LASTJOURNALNUM back to backup_functions.sh
#42 27237 Russell C. Jackson (Rusty) Removed code that sets different checkpoint directories.
#41 27235 Russell C. Jackson (Rusty) Removed useless warning messages.
#40 27233 Russell C. Jackson (Rusty) Removed check for checkpoints.edgename dir
#39 27217 Russell C. Jackson (Rusty) Merged in some changes from the Perforce maintained SDP
#38 26941 Russell C. Jackson (Rusty) Added nice to dump and restore of checkpoints.
Added check for free space in cache clean. Only run when req.
#37 26934 Russell C. Jackson (Rusty) Modified replay journal to offline db to use the actual last journal rather than the journalnum.
#36 26844 Russell C. Jackson (Rusty) Add changes from Blizzard to work with their environment.
#35 25896 Russell C. Jackson (Rusty) Added existing checkpoint check to dump_checkpoint where it should actually be.
#34 25894 Russell C. Jackson (Rusty) Removed check for existing checkpoint because it was being called before we get the correct checkpoint number.
#33 25892 Russell C. Jackson (Rusty) Set checkpoint number to journal number since we are calling get_journal number after truncate journal.
Added logging and removed extra call to get_offline_journal_num.
#32 25888 Russell C. Jackson (Rusty) Renamed nextCheckpointNum to CURRENTJOURNAL to be more consistent, and accurate.
#31 25886 Russell C. Jackson (Rusty) Removed decrement of journal counter for edge servers since we check after journal rotation now.
#30 25859 Russell C. Jackson (Rusty) Fixes for issues with journals rotating during recreate_db_sync_replica.sh
#29 25660 Russell C. Jackson (Rusty) Modified check dir function to use touch instead of -w since that is failing on some NFS mounts.
#28 25633 Russell C. Jackson (Rusty) Add call to get offline journal number to replay journals to offline database.
#27 25607 Russell C. Jackson (Rusty) Added second check for unreplayed journals.
#26 25089 Russell C. Jackson (Rusty) Added replica_cleanup.log to logs to remove.
#25 24172 Russell C. Jackson (Rusty) Added a cd to the logs directory into the remove_log_files function for direct calls to the function.
#24 24101 Russell C. Jackson (Rusty) Added -t upgrade:1666 to xu so it would work on the offline_db without a license.
#23 24027 Russell C. Jackson (Rusty) Added more redirects to /dev/null
#22 24025 Russell C. Jackson (Rusty) Added redirect to get rid of edge check track output when track=1
#21 23693 Russell C. Jackson (Rusty) Added move of sdp_server_type.txt file to switch command.
#20 23546 Russell C. Jackson (Rusty) Added comments about the -xu.
#19 23545 Russell C. Jackson (Rusty) Added a -xu on the offline_db after a restore to make sure the database is up to date.
#18 23479 Russell C. Jackson (Rusty) Added redirect to log file for p4d_truncate_journal.
#17 23269 Russell C. Jackson (Rusty) Updated so that edge servers and replicas of edge server in a shareddata environment use a unique checkpoints folder.
Otherwise, everything is still just checkpoints to maintain compatibility.
#16 23267 Russell C. Jackson (Rusty) Update so that only edge servers use the modified checkpoints folder to allow scripts to work with older sdp layouts.
#15 23249 Russell C. Jackson (Rusty) Removed CHECKPOINTS from check_vars
#14 23247 Russell C. Jackson (Rusty) Modified the checkpoints folder to include an extension of the master or edge server.id
to support sharing the data volume between a master and a workspace edge server in the same
data center.

Updated recover_edge to use variables appropriately and fixed a bug in the naming of the
checkpoint file.
#13 23227 Russell C. Jackson (Rusty) Fixed a bug with compare journal numbers.
#12 23063 Russell C. Jackson (Rusty) Removed log redirect since a log isn't defined for the init script.
#11 23032 Russell C. Jackson (Rusty) Removed : in upgrade log name.
Removed extra spaces and some unnecessary logic from backup_functions.sh since it is in p4d_base now.
#10 22981 Russell C. Jackson (Rusty) Made files writeable so they are easier to update once on the server.
#9 22801 Russell C. Jackson (Rusty) Fixed the test to check the variable and do the bit mask test correctly.
#8 22798 Russell C. Jackson (Rusty) Put in a working version.
#7 22797 Russell C. Jackson (Rusty) Added missing $ in front of SERVERVAL variable, but this still isn't working.
#6 22757 Russell C. Jackson (Rusty) Added P4SVRPORT to use to make the server listen on just the port number.

Changed P4PORT to include the hostname to eliminate a bug where the ticket is not
issued by the master as it should be when rpl.forward.login is set up.

Created a p4login_master and updated appropriate scripts to use it. This actually
isn't necessary with the change to include the hostname in P4PORT since the ticket
will be valid on all servers in the group, but this covers configurations that aren't
using rpl.forward.login.
#5 22703 Russell C. Jackson (Rusty) Corrected handling of existing gzipped log.
#4 22697 Russell C. Jackson (Rusty) Handle existing rotated logs, and remove check_disk_space from log reports since people monitor disk space with monitoring systems.
#3 22696 Russell C. Jackson (Rusty) Updates to support using journalnum in rotate_last_run_logs.
#2 22695 Russell C. Jackson (Rusty) Changed rotated log names back to journalnum to match rotated journals.
#1 22693 Russell C. Jackson (Rusty) Branched a Unix only version of the SDP.
Removed extra items to create a cleaner tree.
Moved a few items around to make more sense without Windows in the mix.
//guest/perforce_software/sdp/dev/Server/Unix/p4/common/bin/backup_functions.sh
#58 22679 Russell C. Jackson (Rusty) Removed duplicate line that was producing a cron message.
#57 22658 Russell C. Jackson (Rusty) Added line to remove the ckp_running.txt file when the checkpoint fails through
the die function because the checkpoint is no longer running, and this file prevents
the next checkpoint from running successfully.
#56 22633 Russell C. Jackson (Rusty) Removed Debug and extra echo of journal number to eliminate cron messages.
#55 22387 Robert Cowham Fix journal rotation off-by-one error
Also make sure that -jj rotation specifies prefix.
#54 22345 C. Thomas Tyler Another tweak.
#53 22343 C. Thomas Tyler Fixed off-by-one error in new offline journal counter calculation
logic.

Bypassing pre-commit review until test suite runs clean again.

#review-22344
#52 22277 C. Thomas Tyler Debugging.
#51 22276 C. Thomas Tyler Debugging.
#50 22274 C. Thomas Tyler Fixed bug where detection of journal number fails for new/empty data
set.

Removed msg() and bail() functions, and changed approach to make the
existing log() and die() functions behave correctly regardless of
whether $LOGFILE is defined.

If $LOGFILE is defined, log() silently writes to the log file; otherwise it
writes to the screen (stdout).

If $LOGFILE is defined, die() writes to the log file and sends an
email; otherwise it writes to the screen (stdout). If on a tty, the error
is duplicated on stderr.

To Do: Improve in-code comments.

Bypassing pre-commit review until tests pass.

#review-22275
#49 22272 C. Thomas Tyler Enhanced error message in check_journalnum() in backup_functions.hs.

Bypassing pre-commit review until tests pass.

#review-22273
#48 22270 C. Thomas Tyler Attempting fix of build failure.
Bypassing pre-commit review.

#review-22271
#47 22250 C. Thomas Tyler Further refinements to the new 'rotate journal on p4d start' change:
* Fixed p4d_truncate_journal so it has less environment dependencies
(e.g. doesn't depend on LOGFILE, etc.) and doesn't try sending email.
* Introduced msg() and bail(), counterparts to log() and die() which
don't try to write to LOGFILE and don't try to send email.
* Added call to get_journalnum() before call to p4d_truncate_journal().
* Fixed logic in get_journalnum() so it gets the journal number w/o
needing p4d to be up.
* I think I fixed the syntax error in bitwise operator check when
setting EDGE_SERVER.  It works on a non-edge server (sets
EDGESERVER=0).  For now I have it doing an
'echo EDGESERVER=$EDGESERVER', but need to test that it correctly
sets EDGESERVER=1 on an edge server.

TO DO: Remove that 'echo EDGESERVER=$EDGESERVER' once we verify it
correctly sets the value for $EDGESERVER. (Or not?)
#46 22239 Russell C. Jackson (Rusty) Change set_vars to look up the edge server directly in the database so the server does
not have to be on-line to check.

Fix for Job: SDP-223
#45 22066 Russell C. Jackson (Rusty) Added rotate for p4verify.log instead of just deleting the prior one.
#44 21624 C. Thomas Tyler Fixed issue with mail sending a usage error on Ubuntu,
which does not accept the '-V' flag to check the version.
#43 21580 Russell C. Jackson (Rusty) Changed compare journal numbers function to only fail if root journal number is
greater than offline_db. The not equal check was preventing the recreate_db_sync_replica.sh
script from being used to fix a replica that was out of sync with the master.
#42 21322 Russell C. Jackson (Rusty) #review-21323
Forgot server.id
#41 21318 Russell C. Jackson (Rusty) #review-21319
Added commands to move license*, rdb.lbr and state* from P4ROOT to OFFLINE_DB before
switching the links.

Added command to remove the db.* files from offline_db/save as well before trying
to recreate the offline database.
#40 21178 Russell C. Jackson (Rusty) Change the SDP so that root and offline_db can be on different volumes and still accomplish
a fast database recovery using recreate_db_checkpoint.sh and recreate_db_sync_replica.sh.
This is done by switching the links now rather than moving the db files.
#39 20970 Russell C. Jackson (Rusty) Changed to use the standard remove log function on the p4login log.
We don't need to keep
any more than the KEEPLOGS-specified number of these logs around. It doesn't matter if they
are all in the last hour or the last seven days. The only need for a p4login log is for
debugging something not working. Anyone that needs long term tracking of logins can turn
on the auth structured log to track the logins.
#38 20964 adrian_waters Include removal of the p4login.*.log files in daily cleanup
#37 20940 Russell C. Jackson (Rusty) Drop JOURNALNUM from the rotated log names because it forces you to wait to rotate
the prior logs until you get the journal number and creates a problem where the error
that you couldn't get the journal number ends up at the end of the previous days log
file, and that is what gets email out. That causes confusion for the person trying
to see what the error is.

Moved all rotate_last_run_logs up to the point right after we set the environment.
#36 20822 C. Thomas Tyler Change logic to use p4d init script only from /p4/N/bin.

The current logic sets a variable essentially preferring the p4d
init script in /etc/init.d, using the one in /p4/N/bin only if the
one in /etc/init.d doesn't exist as a file (and would not be
selected if it was a symlink).

Reasons:
* Referencing the file/symlink in /etc/init.d introduces
potentially complex and confusing behavior.  If there were
a file in /etc/init.d rather than symlink'd, that could be
bad if it doesn't get updated with new versions of the SDP,
where stuff in /p4/N/bin should be reliably updated.

* I just expect the SDP to always use its own files in /p4/N/bin,
under direct control of the perforce user, rather than external
references to it.  In a proper SDP deployment on Linux,
/etc/init.d should contain symlinks for SDP init scripts
anyway.  But why trust that if there's no need?

* If there is a file in /etc/init.d and it's different than
/p4/N/bin for some reason, we should prefer the one in /p4/N/bin.

* The symlinks in /etc/init.d are outside the direct control of
the perforce user, and could point to who-knows-where.
#35 20749 C. Thomas Tyler Approved and committed, but I believe that the shared data setting is always set to false on the master and we should look at fixing that in another change.

Enhanced p4login again.

Improvements:
Default behavior with no arguments gives the desired results.
For example, if run on a master, we login on the super user P4USER to
P4PORT.  If run on a replica/edge and auth.id is set, we login P4USER
to the P4TARGET port of the replica.

All other login functionality, such as logging in the replication
service user on a replica, logging in supplemental automation users,
is now accessed via new flags.

A usage message is now available via '-h' and '-man' options.  The
new synopsis is:
p4login [<instance>] [-p <port> | -service] [-automation] [-all]

The <instance> parameter is the only non-flag positional parameter,
and can be omitted if SDP_INSTANCE is already defined (as is typical
when called by scripts).

With this change, several other scripts calling either the 'p4login'
script or 'p4 login' commands were normalized to call p4login as
appropriate given the new usage.

Reviewer Note:  Review p4login first, then other files.  Most changes
are in p4login.

In other scripts calling p4login, calls similar to:
$P4BIN -u $P4USER -p $P4PORT login < /path/to/pwd
are replaced with: $P4CBIN/p4login

In other scripts calling p4login, calls similar to:
$P4BIN -p $P4MASTERPORT login < /path/to/pwd
are replaced with: $P4CBIN/p4login -p $P4MASTERPORT

Note that, if auth.id is set, calling 'p4login' actually has the
same behavior as 'p4login -p $P4MASTERPORT', since p4login
called on a replica with auth.id set will just login to the master
port anyway.

Depending on intent, sometimes $P4BIN/p4login -service
is used.

== Misc Cleanup ==

In doing the cleanup:
* Fixed a hard-coding-to-instance-1 bug in broker_rotate.sh.
* Fixed an inconsistency in recreate_db_sync_replica.sh, where
it did just a regular login rather than a login -a as done in other
places (for compatibility with some multi-interface NIC card
configs).

== p4login Call Normalization ==
Code cleanup was done to normalize calls to p4login, such that:
1) the call starts with $P4CBIN/p4login (not the hard-coded path),
and 2) logic to redirect stdout/stderr to /dev/null was removed,
since it's not necessary with p4login.  (And if p4login ever
does generate any unwanted output, we only fix it in one place).

== Tweak to instance_vars.template ==
This change includes a tweak to set P4MASTERPORT dynamically
on a replica to ensure the value precisely matches P4TARGET
for the given replica.  This will reduce a source of problems
when SSL is used, as it is particularly sensitive to the precise
P4PORT values used, and will also help for environments which
have not yet set auth.id.  If the port cannot be determined
dynamically, we fall back to the old logic using the assigned
value.

== Tweak to SDP_ALWAYS_LOGIN behavior ==
This used to default to 1, now it defaults to 0.  At this
point we should no longer need to force logins, and in fact
doing so can get into a 'p4 login' hang situation with
auth.id set.  Best to avoid unnecessary logins if we
already have a valid ticket.  (I think the need to force a
login may have gone away with p4d patches).

== Obsolete Script ==
With this change, svclogin.sh is now obsolete.  All it was doing
was a few redundant 'p4 login' commands followed by a call to
p4login anyway.

== Testing ==
Our test suite doesn't fully cover this change, so additional
manual testing was done in the Battle School lab environment.
#34 20637 Russell C. Jackson (Rusty) Fixed the real cause of the problem and put the redirects to LOGFILE back.
The actual cause of the problem was that we were rotating the sync_replica.log
file twice within that function because of the call to rotate $LOGFILE and a
second call to rotate "sync_replica.log". I removed the 2nd call to rotate the
sync_replica.log.
#33 20636 Russell C. Jackson (Rusty) Changed mv and gzip in rotate log to go to /dev/null to avoid stomping on the file we just rotated.
#32 20170 Russell C. Jackson (Rusty) Moved password and users into the config directory to allow for instance specific
users and passwords. Ran into a case where two different teams were sharing the same
server hardware and needed this type of differentiation. Surprised that we haven't hit
this sooner.

Also defaulted mkdirs to use the numeric ports since this is the most common
installation.
#31 19851 Robert Cowham Check for usable offline_db before creating checkpoint work file.
This avoids an error right at the start locking out the utility which
will fix said error!
#30 19768 UnstoppableDrew @tom_tyler @russell_jackson
Bug fix for running p4master_run as root, and some comment header cleanup. Job 000543

p4master_run: Preserve original arguments list and use this when exec'ing as $OSUSER.

backup_functions.sh: Add text about sourcing p4_vars yourself instead of using p4master_run.

update_limits.py: Run p4login directly without p4master_run since p4login calls p4_vars now.

everything else: Remove comment block about needing to run with p4master_run. Reword comment
  about SDP_INSTANCE since it is not always an integer value.
#29 19523 Russell C. Jackson (Rusty) Added a KEEPJNLS variable to allow you to keep more journals than checkpoints
in case you rotate the journal more frequently than you run checkpoints.
#28 19113 Russell C. Jackson (Rusty) Changed name of daily_backup.sh to daily_checkpoint.sh
Changed name of weekly_backup.sh to recreate_db_checkpoint.sh

Updated crontabs with new names, and changed to run recreate_db_checkpoint
on the 1st Sat. of Jan. and July. For most companies, this is a better
practice than recreating weekly per discussion with Anton.

Remove solaris crontab since Solaris is pretty much dead, and we don't test on it.

Updated docs to reflect name changes, and did a little cleanup on other sections
while I was in there.
#27 19105 Russell C. Jackson (Rusty) This change uses p4 admin journal command against the master server to rotate the journal.
Added a p4d_truncate_journal to use in weekly_back that still rotates via p4d.

The purpose of this change is to allow you to run daily_backup.sh on a standby machine
where you have a shared depotdata volume. If you want to use daily on the standby machine,
you have to put offline_db on the shared depotdata volume which means you will NOT want
to run weekly_backup.sh on the master very often, but that is basically what Anton is
recommending now. I am currently testing this setup on a production environment, and if
it works well, I will change mkdirs.sh to put offline_db on the depotdata volume by
default and update the crontabs not to run weekly anymore.

#review-19083
#26 18934 C. Thomas Tyler Moved ckp_running.txt to $LOGS (/p4/n/logs) from /p4/n/checkpoints:
* Avoids it getting rsync'd by sync_replica.sh or by common
human admin rsyncs of the /p4/n/checkpoints dir.
* It should be in a volume that's not shared.
* Puts it in the logs directory where you go look when things break.
#25 18617 Russell C. Jackson (Rusty) #review-18610
Fixed a bug with check_journalnum where it was being called to check the offline journal
number, but the function was hard coded to JOURNALNUM.

Implemented a function to compare the journal numbers of P4ROOT and OFFLINE_DB before
switching the db files as an extra layer of protection to avoid data loss.
#24 18595 Russell C. Jackson (Rusty) Fixed a log rotation bug that has been around for a long time.
If you rotated the journal
more times than KEEPCKPS and KEEPLOGS, the old method would remove all of your logs and
checkpoints because it didn't actually look at how many were on disk. Found the bug
while reviewing the test harness with Robert.

Adjusted the test harness to account for the change. (Stole from Robert's shelf.)
#23 18590 Robert Cowham Fix failing tests.
Change log filename format to use - instead of : as separator for date/time component
#22 18587 Russell C. Jackson (Rusty) Reworked the log rotation stuff in backup_functions.sh to make it cleaner and
handle the new log from recreate_offline_db.sh.

Modified recreate_offline_db.sh to add comments about a bad checkpoint. Also
made it create its own log file since it isn't doing a checkpoint. Removed the
log rotation for the same reason.

Moved the LOGFILE setting out to all of scripts to make it more obvious for future
scripts that you need to set that variable in your script so that it doesn't just
default to checkpoint.log.

Moved the functions in weekly_backup.sh and recreate_offline_db.sh into backup_functions.sh
where they belong for consistency.

Modified backup_functions.sh to use a consistent naming convention for all the
rotated log files rather than checkpoint.log being unique.

Replaced all back ticks with the newer bash $() method.

Removed all of the line wrapping since I am pretty sure that none of us are working on an
80 character terminal these days and it is easier to read this way.
#21 18533 Robert Cowham Put a date/time suffix onto checkpoint.log.* files in case of any errors to avoid them being overwritten.
Make remove_old_logs tidy up appropriately.
#20 18532 Robert Cowham Correct log message regarding journals replays
#19 18484 Russell C. Jackson (Rusty) Added comment on WAITCOUNT to explain the value.
#18 18450 Russell C. Jackson (Rusty) Added a kill for the p4d_stop function in case p4d doesn't shut down.

In the process of testing this, I discovered that using $P4DBIN in this
case was a bug that didn't work when running in case insensitive mode because
the executable doesn't match what is actually running since we end up
calling p4d from /p4/common/bin. Corrected the grep so that it would match
in either case.

#review-18430
#17 16335 C. Thomas Tyler Routine Merge Down to dev from main using:
p4 merge -b perforce_software-sdp-dev
#16 16029 C. Thomas Tyler Routine merge to dev from main using:
p4 merge -b perforce_software-sdp-dev
#15 15797 C. Thomas Tyler Routine Merge Down to dev from main for SDP.
#14 15778 C. Thomas Tyler Routine Merge Down to dev from main.
#13 15376 adrian_waters formatting only - fix spacing;  there's inconsistent use of tabs/spaces throughout the file - needs cleanup at some point.
#12 15375 adrian_waters Routine merge-down from main->dev
#11 15374 adrian_waters - Ensure backup scripts are run as the OSUSER (to prevent accidental running as root); 
- in scripts where LOGFILE value is changed from the 'checkpoint.log'  set by set_vars, ensure the new assignment is before check_dirs is called, otherwise errors could be written to the 'wrong' log
- in 'die()' - detect if running from terminal & also send output to stderr
#10 13931 C. Thomas Tyler Routine merge-down to dev from main.
#9 13906 C. Thomas Tyler Normalized P4INSTANCE to SDP_INSTANCE to get Unix/Windows
implementations in sync.

Reasons:
1. Things that interact with SDP in both Unix and Windows
environments shouldn't have to account for this obscure
SDP difference between Unix and Windows.  (I came across
this doing CBD work).

2. The Windows and Unix scripts have different variable
names for defining the same concept, the SDP instance.
Unix uses P4INSTANCE, while Windows uses SDP_INSTANCE.

3. This instance tag, a data set identifier, is an SDP concept.
I prefer the SDP_INSTANCE name over P4INSTANCE, so I propose
to normalize to SDP_INSTANCE.

4. The P4INSTANCE name makes it look like a setting that might be
recognized by the p4d itself, which it is not.  (There are other
such things such as P4SERVER that could perhaps be renamed as
a separate task; but I'm not sure we want to totally disallow
the P4 prefix for variable names. It looks too right to be wrong
in some cases, like P4BIN and P4DBIN.  That's a discussion for
another day, outside the scope of this task).

Meanwhile:
* Fixed a bug in the Windows 2013.3 upgrade script that
was referencing undefined P4INSTANCE, as the Windows
environment defined only SDP_INSTANCE.

* Had P4INSTANCE been removed completely, this change would
likely cause trouble for users doing updates for existing
SDP installations.  So, though it involves slight technical debt,
I opted to keep a redundant definition of P4INSTANCE
in p4_vars.template, with comments indicating SDP_INSTANCE should be
used in favor of P4INSTANCE, with a warning that P4INSTANCE
may go away in a future release.  This should avoid unnecessary
upgrade pain.

* In mkdirs.sh, the variable name was INSTANCE rather than
SDP_INSTANCE.  I changed that as well.  That required manual
change rather than sub/replace to avoid corrupting other similar
variable names (e.g. MASTERINSTANCE).

This is a trivial change technically (a substitute/replace, plus
tweaks in p4_vars.template), but impacts many files.
#8 12169 Russell C. Jackson (Rusty) Updated copyright date to 2015

Updated shell scripts to require an instance parameter to eliminate the need
for calling p4master_run. Python and Perl still need it since you have to set
the environment for them to run in.

Incorporated comments from reviewers. Left the . instead of source as that seems
more common in the field and has the same functionality.
#7 12028 C. Thomas Tyler Refreshed SDP dev branch, merging down from main.
#6 11541 Russell C. Jackson (Rusty) Keeping dev up to date.
#5 11535 Russell C. Jackson (Rusty) Updated dev from main.
#4 11509 Russell C. Jackson (Rusty) Added sync_replica.log to backup function log rotations, and added rm on existing
gzipped logs with the same name in order to keep the script from hanging waiting
for a response to overwrite.

Added sync_shared_replica.sh and weekly_sync_shared_replica.sh to support replicas
with shared depotdata storage. No rsync is necessary. The logs volume must not be
a shared volume with these scripts though.
#3 11483 Russell C. Jackson (Rusty) Brought over changes from RCJ backup_functions.sh
#2 11463 Russell C. Jackson (Rusty) Updated dev to prepare for Summit agreed changes.
#1 10638 C. Thomas Tyler Populate perforce_software-sdp-dev.
//guest/perforce_software/sdp/main/Server/Unix/p4/common/bin/backup_functions.sh
#1 10148 C. Thomas Tyler Promoted the Perforce Server Deployment Package to The Workshop.