backup_functions.sh #7

#!/bin/bash
#==============================================================================
# Copyright and license info is available in the LICENSE file included with
# the Server Deployment Package (SDP), and also available online:
# https://swarm.workshop.perforce.com/projects/perforce-software-sdp/view/main/LICENSE
#------------------------------------------------------------------------------

set -u

# Common functions used in all backup scripts.

#------------------------------------------------------------------------------
# Verify key variables in the shell environment exist, or else abort.
#------------------------------------------------------------------------------
check_vars () {
   if [[ -z "$SDP_INSTANCE" || -z "$P4HOME" || -z "$P4PORT" || -z "$P4ROOT" || -z "$P4JOURNAL" || -z "$P4BIN" || -z "$P4DBIN" || -z "$P4TICKETS" || -z "$KEEPCKPS" || -z "$KEEPJNLS" || -z "$KEEPLOGS" || -z "$CHECKPOINTS" || -z "$LOGS" || -z "$OSUSER" ]]; then
      echo "Use p4master_run or source p4_vars when calling this script."
      echo "Required external variable not set. Abort!"
      exit 1
   fi
}

#------------------------------------------------------------------------------
# is_edge ($SeverID, $RootDir)
#
# Determine if a given ServerID is an edge server or not, checking a given
# database root dir (e.g. $P4ROOT or $OFFLINE_DB).
#
# Input:
# $1 - SeverID (required)
# $2 - RootDir (optional, defaults to $P4ROOT)
#
# Output YES if an edge server, NO otherwise.
#
#------------------------------------------------------------------------------
is_edge () {
   local ServerID=${1:-Unset}
   local RootDir=${2:-$P4ROOT}
   local ServicesData=
   local EdgeCheck=

   # Extract a slice of db.server referencing the given ServerID,
   # and then grab the field containing Services data.
   ServicesData=$($P4DBIN -r $RootDir -J off -L /dev/null -k db.server -jd - 2>&1 |\
      $GREP "@db.server@ @${ServerID}@" | $CUT -d '@' -f 13)

   # Do a bitwise operation to determine if the ServicesData value indicates
   # this is an edge server.
   if [[ -n "$ServicesData" ]]; then
      EdgeCheck=$(($ServicesData & 4096))

      if [[ "$EdgeCheck" -gt 0 ]]; then
         echo YES
      else
         echo NO
      fi
   else
      echo NO
   fi
}

#------------------------------------------------------------------------------
# is_standby ($SeverID, $RootDir)
#
# Determine if a given ServerID is an standby server or not, checking a given
# database root dir (e.g. $P4ROOT or $OFFLINE_DB).
#
# Input:
# $1 - SeverID (required)
# $2 - RootDir (optional, defaults to $P4ROOT)
#
# Output YES if an standby server, NO otherwise.
#
#------------------------------------------------------------------------------
is_standby () {
   local ServerID=${1:-Unset}
   local RootDir=${2:-$P4ROOT}
   local ServicesData=

   # Extract a slice of db.server referencing the given ServerID,
   # and then grab the field containing Services data.
   ServicesData=$($P4DBIN -r $RootDir -J off -L /dev/null -k db.server -jd - 2>&1 |\
      $GREP "@db.server@ @${ServerID}@" | $CUT -d '@' -f 13 | tr -d ' ')

   # Do a check to see if the ServicesData value indicates
   # this is an standby server.
   if [[ -n "$ServicesData" ]]; then
      if [[ "$ServicesData" -eq '35141' ]]; then
         echo YES
      else
         echo NO
      fi
   else
      echo NO
   fi
}

#------------------------------------------------------------------------------
# Set variables for use in various scripts:
# RC=path to the init scripts
# OFFLINE_DB=path to offline db directory
# EDGESERVER=1 if this is an edge server, 0 otherwise.
# STANDBYSERVER=1 if this is a standby server, 0 otherwise.
#
# This must be called after loading the standard shell environment by
# doing:
# source /p4/common/bin/p4_vars N
#
# This sets P4HOME, SERVERID, etc. needed by this function.
#------------------------------------------------------------------------------
set_vars () {
   RC=$P4HOME/bin/p4d_${SDP_INSTANCE}_init
   OFFLINE_DB=${P4HOME}/offline_db

   if [[ -n "$SERVERID" ]]; then
      if [[ "$(is_edge $SERVERID)" == YES ]]; then
         export EDGESERVER=1
         # Get commit server from P4TARGET setting in database
         P4COMMITSERVER=$($P4DBIN -r $P4ROOT -cshow | $GREP "${SERVERID}: P4TARGET" | $CUT -d ' ' -f 4)
      else
         export EDGESERVER=0
      fi
   else
      export EDGESERVER=0
   fi

   if [[ -n "$SERVERID" ]]; then
      if [[ "$(is_standby $SERVERID)" == YES ]]; then
         export STANDBYSERVER=1
         # Get commit server from P4TARGET setting in database
         P4COMMITSERVER=$($P4DBIN -r $P4ROOT -cshow | $GREP "${SERVERID}: P4TARGET" | $CUT -d ' ' -f 4)
      else
         export STANDBYSERVER=0
      fi
   else
      export STANDBYSERVER=0
   fi

   # Ensure that SDP_ADMIN_PASSWORD_FILE is set, using existing value if set (e.g.
   # in p4_vars), otherise set it to the SDP standard value.
   export SDP_ADMIN_PASSWORD_FILE="${SDP_ADMIN_PASSWORD_FILE:-Unset}"
   [[ "$SDP_ADMIN_PASSWORD_FILE" == Unset ]] && \
      export SDP_ADMINPASS_FILE="$P4CCFG/.p4passwd.${P4SERVER}.admin"
}

#------------------------------------------------------------------------------
# Check if user is running as required OS user.
#------------------------------------------------------------------------------
check_uid () {
   user=$(id -un)
   if [[ ${user} != ${OSUSER} ]]; then
      die "Must be run by user: ${OSUSER}. Abort!"
   fi
}

#------------------------------------------------------------------------------
# Function log() - echo message to logfile or stdout.
#
# If $LOGFILE is defined, write message to the log file only; nothing goes to
# stdout.  Prepend a datestamp.
# If $LOGFILE isn't defined, just echo to stdout, w/o timestamp or.
# In all cases, support '-e' formatting.
# Input:
# $1 - message to log (must be quoted).
#------------------------------------------------------------------------------
log () {
   if [[ ${LOGFILE:-Unset} != Unset ]]; then
      echo -n $(date)   2>&1 >> "$LOGFILE"
      echo -e " $0: $@" 2>&1 >> "$LOGFILE"
   else
      echo -e "$@"
   fi
}

#------------------------------------------------------------------------------
# Decide depending on our mail utility, how to specify sender (if we need to).
# Mail on some platforms sets sender by default.
# If the mail utility returns what looks like a version identifier
# when given the '-V' flag, use a '-S' flag.  If it does not return a
# verision identifier, don't set a mail sender option.
# Allow GNU Mailutils alternative flag instead.
#------------------------------------------------------------------------------
get_mail_sender_opt () {
   local mail_sender_opt=
   local mail_ver=
   if [[ -n "$MAILFROM" ]]; then
      mail_ver=$($SDPMAIL -V 2>&1)
      if [[ "$mail_ver" =~ "GNU Mailutils" ]]; then
         mail_sender_opt="-aFrom:$MAILFROM"
      elif  [[ "$mail_ver" =~ ^[0-9]+\.[0-9] ]]; then
         mail_sender_opt="-S from=$MAILFROM"
      fi
   fi
   echo "$mail_sender_opt"
}

#------------------------------------------------------------------------------
# Email the log file by $LOGFILE.
#------------------------------------------------------------------------------
mail_log_file () {
   local subject=$1
   local mail_sender_opt=$(get_mail_sender_opt)
   $SDPMAIL -s "$subject" $mail_sender_opt $MAILTO < "$LOGFILE"
}

#------------------------------------------------------------------------------
# Function die() - log message, send email, and exit.
# If $LOGFILE is defined, write message to the log file, email log,
# and exit.
# If $LOGFILE is not defined, write message to the stdout, and skip
# email.
# If in terminal session, display message to stderr as well.
#------------------------------------------------------------------------------
die () {
   # mail the error (with more helpful subject line than cron)
   log "ERROR!!! - $HOSTNAME $P4SERVER $0: $@"

   if [[ ${LOGFILE:-Unset} != Unset ]]; then
      mail_log_file "ERROR!!! - $HOSTNAME $P4SERVER $0: $@"
   fi

   # if running from terminal, also send to stderr
   tty >/dev/null
   if [[ $? -eq 0 ]]; then
      echo -e "$@" >&2
   fi
   rm -f ${LOGS}/ckp_running.txt

   exit 1
}

#------------------------------------------------------------------------------
# Convert various byte values (K,M,G,%) to bytes
# Pass in values such as 1024K, 512M, 1G or 10%
#------------------------------------------------------------------------------
convert_to_bytes () {
   local value=$1
   local totalsize=${2:-Undefined}
   local size=
   local unit=

   # Break up value into size (numeric) and unit (K,M,G)
   size=$($GREP -Eo '[[:alpha:]%]+|[0-9]+' <<<$value | head -1)
   unit=$($GREP -Eo '[[:alpha:]%]+|[0-9]+' <<<$value | tail -1)

   # Based on unit, convert to bytes
   case "$unit" in
      K)
         echo $(($size * 1024))
         ;;
      M)
         echo $(($size * 1024**2))
         ;;
      G)
         echo $(($size * 1024**3))
         ;;
      %)
         echo $(($totalsize * $size / 100))
         ;;
   esac
}

#------------------------------------------------------------------------------
# Write a semaphore file, $LOGS/ckp_running.txt.  This file is written at
# the start of processing, and removed upon successful completion.  It
# prevents multiple concurrent operations from being launched accidentally
# e.g. by multiple human admins, or a human inadvertantly competing with a
# cron job.
#
# It is also intended to get human admins to determine the root cause of
# checkpoint failues.
#------------------------------------------------------------------------------
ckp_running() {
   if [[ -f ${LOGS}/ckp_running.txt ]]; then
      die "Last checkpoint not complete. Check the backup process or contact support."
   fi
   echo "Checkpoint running." > ${LOGS}/ckp_running.txt
}

#------------------------------------------------------------------------------
# Remove the ckp_running.txt semaphore file when checkpoint processing is
# complete.
#------------------------------------------------------------------------------
ckp_complete() {
   rm -f ${LOGS}/ckp_running.txt
}

#------------------------------------------------------------------------------
# Ensure key directories are writable. Abort if they are not.
#------------------------------------------------------------------------------
check_dirs () {
   # Check that key dirs are writable
   declare -i dirsOK=1
   dirList="$OFFLINE_DB $CHECKPOINTS $LOGS"
   [[ $EDGESERVER -eq 1 ]] && dirList+=" ${CHECKPOINTS}.${SERVERID#p4d_}"
   for dir in $dirList; do
      if [[ ! -d "$dir" || ! -w "$dir" ]]; then
         log "Error: Dir $dir does not exist or is not writable."
         dirsOK=0
      fi
   done
   [[ $dirsOK -eq 1 ]] || "Some expected dirs are missing or not writable. Aborting."
}

#------------------------------------------------------------------------------
# Add the results of df -h or df -m to the log file.
#------------------------------------------------------------------------------
check_disk_space () {
   log "Checking disk space..."
   $P4BIN diskspace >> "$LOGFILE" 2>&1
}

#------------------------------------------------------------------------------
# Check value of journal; ensure it is an integer.
#------------------------------------------------------------------------------
check_journalnum () {
   local JNLNUM=${1:-Unset}
   re='^[0-9]+$'
   if ! [[ $JNLNUM =~ $re ]] ; then
      die "The journal counter value [$JNLNUM] is invalid. It must be numeric."
   fi
}

#------------------------------------------------------------------------------
# Check the checkpoints directory for the oldest checkpoint
#------------------------------------------------------------------------------
get_ckpnum () {
   if [[ $EDGESERVER -eq 0 ]]; then
      OLDESTCHECKPOINT=$(ls -1tr "${CHECKPOINTS}/" | $GREP ckp | $GREP -v md5 | head -n 1 | $AWK -F '.ckp.' '{ print $(2) }' | tr -d '.gz')
   else
      OLDESTCHECKPOINT=$(ls -1tr "${CHECKPOINTS}.${SERVERID#p4d_}/" | $GREP ckp | $GREP -v md5 | head -n 1 | $AWK -F '.ckp.' '{ print $(2) }' | tr -d '.gz')
   fi
}

#------------------------------------------------------------------------------
# Determine journal counter by checking counter in db.counters.
#------------------------------------------------------------------------------
get_journalnum () {
   # get the current journal and checkpoint serial numbers.
   local nextCheckpointNum
   if [[ -r "$P4ROOT/db.counters" ]]; then
      nextCheckpointNum=$($P4DBIN -r $P4ROOT -k db.counters -jd - 2>&1 | grep @journal@ | cut -d '@' -f 8)

      if [[ -n "$nextCheckpointNum" ]]; then
         check_journalnum "$nextCheckpointNum"
         JOURNALNUM="$nextCheckpointNum"
      else
         # Special case: If db.counters is empty, then we have a new/empty data
         # set, so just set the value to 0.
         JOURNALNUM=0
      fi
   else
      # Special case: If db.counters doesn't exist, then we have a new/empty
      # data set, so just set the value to 0.
      JOURNALNUM=0
   fi

   # If we are on an edge server, the journal has already rotated, so we have to decrement the value
   # so that we replay the correct journal file and create the correct checkpoint number on the
   # edge server.
   #
   # In the case of a standby server, the journal rotation occurs on the master server,
   # so we don't need to increment the journal number again, so we decrement by 1.
   # Also, when replaying the journals to the offline db, we don't want to play to the live journal
   # because it is still being replicated.
   if [[ $EDGESERVER -eq 1 || $STANDBYSERVER -eq 1 ]]; then
      JOURNALNUM=$(($JOURNALNUM - 1))
   fi
   CHECKPOINTNUM=$(($JOURNALNUM + 1))
}

#------------------------------------------------------------------------------
# Determine journal space usage and minimum disk space requirement
#------------------------------------------------------------------------------
get_journal_stats () {
   # Get minimum disk space required on server journal filesystem before server rejects commands
   # This will return the configured and default value, but grab the configured value which shows first
   # If a configured value is not present, it will use the default value
   P4JOURNALMIN=$($P4BIN configure show filesys.P4JOURNAL.min | $AWK '{ print $1 }' | $CUT -d'=' -f2 | head -1)
   # Get current journal free disk space
   P4JOURNALFREE=$($P4BIN -ztag -F "%freeBytes%" diskspace P4JOURNAL)
   # Get total available disk space for journal
   P4JOURNALTOTAL=$($P4BIN -ztag -F "%totalBytes%" diskspace P4JOURNAL)
}

#------------------------------------------------------------------------------
# Verify that the offline databases are usable by checking the existence
# of a 'offline_db_usable.txt' file that is written only when databases
# are in a known-good state, following successful recovery from a checkpoint.
#------------------------------------------------------------------------------
check_offline_db_usable () {
   # Check it is OK
   if [[ ! -f $OFFLINE_DB/offline_db_usable.txt ]]; then
      die "Offline database not in a usable state. Check the backup process."
   fi
   if [[ ! -f $OFFLINE_DB/db.counters ]]; then
      die "Offline database not found. Consider creating it with live_checkpoint.sh. Be aware that it locks the live system and can take a long time! Abort!"
   fi
}

#------------------------------------------------------------------------------
# Determine journal counter in offline databases.
#------------------------------------------------------------------------------
get_offline_journal_num () {
   # Get the journal number of the offline database
   check_offline_db_usable
   OFFLINEJNLNUM=$($P4DBIN -r $OFFLINE_DB -jd - db.counters 2>&1 | grep '@journal@' | cut -d "@" -f 8 2>> $LOGFILE) || die "Cannot get the offline journal number. Abort!"
   check_journalnum $OFFLINEJNLNUM
   log "Offline journal number is: $OFFLINEJNLNUM"
}

#------------------------------------------------------------------------------
# Cleanup old log files.
#------------------------------------------------------------------------------
remove_old_checkpoints_and_journals () {
   declare CheckpointsDir=
   declare StandbyReplicaJournalsDir=
   declare FilePrefix=

   if [[ $KEEPCKPS -eq 0 ]]; then
      log "Skipping cleanup of old checkpoints because KEEPCKPS is set to 0."
   else
      log "Deleting obsolete checkpoints and journals. Keeping latest $KEEPCKPS  per KEEPCKPS setting in p4_vars."
      if [[ $EDGESERVER -eq 0 ]]; then
         # Refer to ckp/jnl files starting like: /p4/N/checkpoints/p4_N
         CheckpointsDir="${CHECKPOINTS}"
         FilePrefix="${P4SERVER}"
      else
         # Refer to ckp/jnl files starting like (example ServerID=p4d_edge_nyc):
         # /p4/N/checkpoints.edge_nyc/p4_N.edge_nyc
         CheckpointsDir="${CHECKPOINTS}.${SERVERID#p4d_}"
         FilePrefix="${P4SERVER}.${SERVERID#p4d_}"
      fi

      if [[ -d "$CheckpointsDir" ]]; then
         # Remove selected checkpoint and journal files based on the KEEPCKPS
         # setting regardless of whether compressed or not.
         # We multiply KEEPCKP by 2 for the ckp files because of the md5 files.
         for I_LOGFILE in $(ls -t ${CheckpointsDir}/${FilePrefix}.ckp.* 2>/dev/null | $AWK "NR > ($KEEPCKPS * 2)"); do
            log "rm -f $I_LOGFILE"
            rm -f "$I_LOGFILE"
         done

         # Use KEEPJNLS to allow for separate journal rotation at a higher
         # frequency.
         for I_LOGFILE in $(ls -t ${CheckpointsDir}/${FilePrefix}.jnl.* 2>/dev/null | $AWK "NR > $KEEPJNLS"); do
            log "rm -f $I_LOGFILE"
            rm -f "$I_LOGFILE"
         done
      fi

      StandbyReplicaJournalsDir="${P4HOME}/journals.rep"
      if [[ -d "$StandbyReplicaJournalsDir" ]]; then
         for I_LOGFILE in $(ls -t $StandbyReplicaJournalsDir/${FilePrefix}.ckp.* 2>/dev/null | $AWK "NR > ($KEEPCKPS * 2)"); do
            log "rm -f $I_LOGFILE"
            rm -f "$I_LOGFILE"
         done

         for I_LOGFILE in $(ls -t ${StandbyReplicaJournalsDir}/${FilePrefix}.jnl.* 2>/dev/null | $AWK "NR > $KEEPJNLS"); do
            log "rm -f $I_LOGFILE"
            rm -f "$I_LOGFILE"
         done
      fi
   fi
}

#------------------------------------------------------------------------------
# Shutdown p4d using the standard init script. Log the shutdown activity.
#------------------------------------------------------------------------------
stop_p4d () {
   log "Shutting down the p4 server"
   $RC stop >> "$LOGFILE" 2>&1
   log "p4 stop finished -- p4 is down now."
}

#------------------------------------------------------------------------------
# Start p4d using the standard init script. Log the startup activity.
# Return status indicates whether the server started successfully or not.
#------------------------------------------------------------------------------
start_p4d () {
   log "Starting the p4 server"
   $RC start >> "$LOGFILE" 2>&1
   sleep 3 # Give it a few seconds to start up
   # Confirm that it started - success below means it did
   if $P4BIN -u $P4USER -p $P4PORT info >/dev/null 2>&1 ; then
      log "Server restarted successfully - p4 should be back up now."
      return 0
   else
      log "Error: Server does not appear to have started."
      return 1
   fi
}

#------------------------------------------------------------------------------
# Call 'p4d -jj' to rotate the current/active journal file on the master
# server, starting a fresh new P4JOURNAL file.
#
# In a distributed topology with edge servers, this function
# be called on the master/commit server.
#------------------------------------------------------------------------------
truncate_journal () {
   declare CheckpointFile="${CHECKPOINTS}/${P4SERVER}.ckp.${CHECKPOINTNUM}.gz"
   declare JournalFile="${CHECKPOINTS}/${P4SERVER}.jnl.${JOURNALNUM}"

   if [[ $EDGESERVER -eq 0 ]]; then
      [[ -f "$CheckpointFile" ]] && \
         die "Checkpoint $CheckpointFile already exists, check the backup process."
      [[ -f "$JournalFile" ]] && \
         die "Journal $JournalFile already exists, check the backup process."

      log "Truncating journal..."
      # 'p4d -jj' does a copy-then-delete, instead of a simple mv.
      # During 'p4d -jj' the perforce server will hang the responses to clients,
      # this should be for a very short period of time even for large data
      # sets, as the journal represents a single day of metadata.
      # Curly braces capture output of 'time'.
      $P4CBIN/p4login -p $P4MASTERPORT
      { time $P4BIN -p $P4MASTERPORT admin journal ${CHECKPOINTS}/${P4SERVER}; } >> "$LOGFILE" 2>&1 || { die "Journal rotation failed. Abort!"; }
      # The test below waits until the journal file exists in the checkpoints directory before proceeding.
      test=1
      while [ $test != 0 ]
      do
         sleep 5
         if [ -f "$JournalFile" ];then
            test=0
         fi
      done
      $P4CBIN/p4login
   else
      log "Warning: truncate_journal () function should not be called on an edge server. Ignoring."
   fi
}

#------------------------------------------------------------------------------
# Call 'p4d -jj' to rotate the current/active journal file on the master
# server from an edge server, starting a fresh new P4JOURNAL file.
#
# In a distributed topology with edge and standby servers, this function can be
# used to trigger a journal rotation on master/commit server. It's not meant to
# be used from the master server itself.
#------------------------------------------------------------------------------
truncate_journal_on_master () {
   # Increment Edge journal number since the journal will increment on the master after calling journal rotation
   local EdgeJournalNum=$((JOURNALNUM + 1))
   local StandbyJournalNum=$((JOURNALNUM + 2)) # If using journalcopy, have to add 2 since live journal is in checkpoints folder
   local JournalFile=

   if [[ $EDGESERVER -eq 1 ]]; then
      # Refer to ckp/jnl files starting like (example ServerID=p4d_edge_nyc):
      # /p4/N/checkpoints.edge_nyc/p4_N.edge_nyc
      JournalFile="${CHECKPOINTS}.${SERVERID#p4d_}/${P4SERVER}.${SERVERID#p4d_}.jnl.${EdgeJournalNum}"
   elif [[ $STANDBYSERVER -eq 1 ]]; then
      JournalFile="${CHECKPOINTS}/${P4SERVER}.jnl.${StandbyJournalNum}"
   fi

   if [[ $EDGESERVER -eq 1 || $STANDBYSERVER -eq 1 ]]; then
      [[ -f "$JournalFile" ]] && \
         die "Journal $JournalFile already exists, check the backup process."

      log "Truncating journal on ${P4COMMITSERVER}..."
      # 'p4d -jj' does a copy-then-delete, instead of a simple mv.
      # During 'p4d -jj' the perforce server will hang the responses to clients,
      # this should be for a very short period of time even for large data
      # sets, as the journal represents a single day of metadata.
      # Curly braces capture output of 'time'.
      $P4CBIN/p4login -p $P4COMMITSERVER
      { time $P4BIN -p $P4COMMITSERVER admin journal; } >> "$LOGFILE" 2>&1 || { die "Journal rotation failed. Abort!"; }
      # The test below waits until the journal file exists in the checkpoints directory before proceeding.
      test=1
      while [ $test != 0 ]
      do
         sleep 5
         if [ -f "$JournalFile" ];then
            test=0
         fi
      done
      $P4CBIN/p4login
   else
      log "Warning: truncate_journal_on_master () function should not be called on a master server. Ignoring."
   fi
}

#------------------------------------------------------------------------------
# Similar to truncate_journal() above, p4d_truncate_journal() is intended to be
# usable form the p4d_base init script, to allow journal rotation on p4d
# start.
#------------------------------------------------------------------------------
p4d_truncate_journal () {
   declare JournalFile="${CHECKPOINTS}/${P4SERVER}.jnl.${JOURNALNUM}"

   if [[ $EDGESERVER -eq 0 ]]; then
      [[ -f "$JournalFile" ]] && \
         die "Journal $JournalFile already exists, check the backup process."
      log "Rotating journal prior to starting p4d."
      $P4DBIN -r $P4ROOT -J $P4JOURNAL -jj ${CHECKPOINTS}/${P4SERVER} >> "$LOGFILE" 2>&1 ||\
         die "Failed to rotate journal. Aborting p4d server start."
   fi
}

#------------------------------------------------------------------------------
# Replay any and all numbered journal files into the offline databases.
#------------------------------------------------------------------------------
replay_journals_to_offline_db () {
   local CheckpointsDir=
   local FilePrefix=
   local NumberedJournal=

   log "Replay any unreplayed journals to the offline database."

   if [[ $EDGESERVER -eq 0 ]]; then
      # Refer to ckp/jnl files starting like: /p4/N/checkpoints/p4_N
      CheckpointsDir="${CHECKPOINTS}"
      FilePrefix="${P4SERVER}"
   else
      # Refer to ckp/jnl files starting like (example ServerID=p4d_edge_nyc):
      # /p4/N/checkpoints.edge_nyc/p4_N.edge_nyc
      CheckpointsDir="${CHECKPOINTS}.${SERVERID#p4d_}"
      FilePrefix="${P4SERVER}.${SERVERID#p4d_}"
   fi

   # Check to see if a new journal file with the current journal number exists
   # otherwise kill the checkpoint operation since replication may not be up to date
   if [[ $STANDBYSERVER -eq 1 ]]; then
      CurrentJournalNumber=$(($JOURNALNUM + 1))
      CurrentJournalFile="${CheckpointsDir}/${FilePrefix}.jnl.${CurrentJournalNumber}"
      if [[ ! -f $CurrentJournalFile ]]; then
         die "Current journal file $CurrentJournalFile doesn't exist. Check the replication process."
      fi
   fi

   for (( j=$OFFLINEJNLNUM; $j <= $JOURNALNUM; j++ )); do
      NumberedJournal="${CheckpointsDir}/${FilePrefix}.jnl.${j}"
      log "Replay journal $NumberedJournal to offline db."
      # Curly braces capture output of 'time'.
      { time $P4DBIN -r $OFFLINE_DB -jr -f $NumberedJournal; } >> "$LOGFILE" 2>&1 || { die "Offline journal replay failed. Abort!"; }
   done
}

#------------------------------------------------------------------------------
# Replay the live, active P4JOURNAL file into the offline databaes.
#------------------------------------------------------------------------------
replay_active_journal_to_offline_db () {
   log "Replay active journal to offline db."
   # Curly braces capture output of 'time'.
   { time $P4DBIN -r $OFFLINE_DB -jr -f ${P4JOURNAL}; } >> "$LOGFILE" 2>&1 || { die "Active Journal replay failed. Abort!"; }
}

#------------------------------------------------------------------------------
# Recreate offline databases from the latest checkpoint.
#------------------------------------------------------------------------------
recreate_offline_db_files () {
   local CheckpointsDir=
   local FilePrefix=
   local LastCheckpointMD5=
   local LastCheckpoint=

   if [[ $EDGESERVER -eq 0 ]]; then
      # Refer to ckp/jnl files starting like: /p4/N/checkpoints/p4_N
      CheckpointsDir="${CHECKPOINTS}"
      FilePrefix="${P4SERVER}"
   else
      # Refer to ckp/jnl files starting like (example ServerID=p4d_edge_nyc):
      # /p4/N/checkpoints.edge_nyc/p4_N.edge_nyc
      CheckpointsDir="${CHECKPOINTS}.${SERVERID#p4d_}"
      FilePrefix="${P4SERVER}.${SERVERID#p4d_}"
   fi

   [[ -f ${CheckpointsDir}/${FilePrefix}.ckp.*.gz ]] && ckp_complete && die "No checkpoints found - run live_checkpoint.sh"
   rm -f ${OFFLINE_DB}/offline_db_usable.txt >> "$LOGFILE" 2>&1
   rm -f ${OFFLINE_DB}/db.* >> "$LOGFILE" 2>&1
   rm -f ${OFFLINE_DB}/save/db.* >> "$LOGFILE" 2>&1
   LastCheckpointMD5=$(ls -t ${CheckpointsDir}/${FilePrefix}.ckp.*.md5 | head -1)
   [[ -n "$LastCheckpointMD5" ]] || \
      die "Could not find *.md5 file for latest checkpoint. Abort!"

   # Account for the idiosyncracy that MD5 files for checkpoints may look
   # like p4_N.ckp.gz.md5 or p4_N.ckp.md5.
   if [[ "$LastCheckpointMD5" == *".gz.md5" ]]; then
      LastCheckpoint="${LastCheckpointMD5%.md5}"
   else
      LastCheckpoint="${LastCheckpointMD5%.md5}.gz"
   fi

   [[ -r "$LastCheckpoint" ]] || \
      die "Missing last checkpoint file: $LastCheckpoint. Abort!"

   log "Recovering from last full checkpoint, $LastCheckpoint."
   # Curly braces capture output of 'time'.
   { time $P4DBIN -r $OFFLINE_DB -jr -z ${LastCheckpoint}; } >> "$LOGFILE" 2>&1 || { die "Restore of checkpoint to $OFFLINE_DB failed!"; }
   echo "Offline db file restored successfully." > ${OFFLINE_DB}/offline_db_usable.txt
}

#------------------------------------------------------------------------------
# Take a live checkpoint from db.* files in P4ROOT.
#------------------------------------------------------------------------------
checkpoint () {
   local CheckpointsDir=
   local FilePrefix=

   log "Create a new checkpoint from live db files in $P4ROOT."

   if [[ "$EDGESERVER" -eq 0 ]]; then
      # Refer to ckp/jnl files starting like: /p4/N/checkpoints/p4_N
      CheckpointsDir="${CHECKPOINTS}"
      FilePrefix="${P4SERVER}"
   else
      die "Checkpoints may not be run on an edge server."
   fi

   # Curly braces capture output of 'time'.
   { time $P4DBIN -r $P4ROOT -jc -Z ${CheckpointsDir}/${FilePrefix}; } >>"$LOGFILE" 2>&1 || { die "ERROR - New checkpoint failed!"; }
}

#------------------------------------------------------------------------------A
# Take a checkpoint from the ROOTDIR, typically either /p4/N/root or
# /p4/N/offline_db.
#------------------------------------------------------------------------------
dump_checkpoint () {
   declare CheckpointsDir=
   declare NewCheckpoint=
   declare NewCheckpointMD5=
   declare FilePrefix=

   log "Dump out new checkpoint from db files in $ROOTDIR."

   if [[ $EDGESERVER -eq 0 ]]; then
      # Refer to ckp/jnl files starting like: /p4/N/checkpoints/p4_N
      CheckpointsDir="${CHECKPOINTS}"
      FilePrefix="${P4SERVER}"
   else
      # Refer to ckp/jnl files starting like (example ServerID=p4d_edge_nyc):
      # /p4/N/checkpoints.edge_nyc/p4_N.edge_nyc
      CheckpointsDir="${CHECKPOINTS}.${SERVERID#p4d_}"
      FilePrefix="${P4SERVER}.${SERVERID#p4d_}"
   fi

   NewCheckpoint=${CheckpointsDir}/${FilePrefix}.ckp.${CHECKPOINTNUM}.gz
   NewCheckpointMD5=${NewCheckpoint}.md5

   if [[ -r "$NewCheckpoint" && -r "$NewCheckpointMD5" ]]; then
      log "\nWarning: Skipping generation of existing checkpoint $NewCheckpoint.\nVerified MD5 file exists: $NewCheckpointMD5."
      return
   fi

   # Curly braces capture output of 'time'.
   { time $P4DBIN -r $ROOTDIR -jd -z ${CheckpointsDir}/${FilePrefix}.ckp.${CHECKPOINTNUM}.gz; } >> "$LOGFILE" 2>&1 || { die "New checkpoint dump failed!"; }
}

#------------------------------------------------------------------------------
# Compare journal numbers between live and offline databases, to ensure
# they can be safely swapped out.
#------------------------------------------------------------------------------
compare_journal_numbers () {
   # Get the journal number of the offline database
   if [[ ! -f $OFFLINE_DB/offline_db_usable.txt ]]; then
      die "Offline database not in a usable state. Check the backup process."
   fi
   if [[ ! -f $OFFLINE_DB/db.counters ]]; then
      die "Offline database not found. Consider creating it with live_checkpoint.sh. Be aware that it locks the live system and can take a long time! Abort!"
   fi
   local _OFFLINEJNLNUM=$($P4DBIN -r $OFFLINE_DB -jd - db.counters 2>&1 | grep '@journal@' | cut -d "@" -f 8 2>> $LOGFILE) || die "Cannot get $OFFLINE_DB journal number. Abort!"
   check_journalnum $_OFFLINEJNLNUM

   # Get the journal number of the root database
   if [[ ! -f $P4ROOT/db.counters ]]; then
      die "$P4ROOT database not found. Something is seriously wrong since the server was just running a minute ago! Contact support@perforce.com"
   fi
   local _JNLNUM=$($P4DBIN -r $P4ROOT -jd - db.counters 2>&1 | grep '@journal@' | cut -d "@" -f 8 2>> $LOGFILE) || die "Cannot get $P4ROOT journal number. Abort!"
   check_journalnum $_JNLNUM

   if [[ $_JNLNUM -gt $_OFFLINEJNLNUM ]]; then
      log "$P4ROOT journal number is: $_JNLNUM"
      log "$OFFLINE_DB journal number is: $_OFFLINEJNLNUM"
      die "$OFFLINE_DB journal number is less than $P4ROOT, cannot switch."
   fi
}

#------------------------------------------------------------------------------
# Swap out live db.* database files in P4ROOT with those in offline_db.
#------------------------------------------------------------------------------
switch_db_files () {
   # Compare the Offline and Master journal numbers before switching to make sure they match.
   compare_journal_numbers
   log "Switching root and offline_db links..."
   [[ -d ${P4ROOT}/save ]] || mkdir -p ${P4ROOT}/save
   rm -f ${P4ROOT}/save/db.* >> $LOGFILE 2>&1
   mv ${P4ROOT}/db.* ${P4ROOT}/save >> $LOGFILE 2>&1
   mv ${P4ROOT}/license* ${OFFLINE_DB} >> $LOGFILE 2>&1
   mv ${P4ROOT}/rdb.lbr ${OFFLINE_DB} >> $LOGFILE 2>&1
   mv ${P4ROOT}/state* ${OFFLINE_DB} >> $LOGFILE 2>&1
   mv ${P4ROOT}/server.id ${OFFLINE_DB} >> $LOGFILE 2>&1
   rm -f ${OFFLINE_DB}/offline_db_usable.txt
   OLDBLNK=$(readlink $OFFLINE_DB)
   ROOTLNK=$(readlink $P4ROOT)
   unlink $OFFLINE_DB
   unlink $P4ROOT
   ln -s $OLDBLNK $P4ROOT >> $LOGFILE 2>&1 || die "Link of $OLDBLNK to $P4ROOT failed."
   ln -s $ROOTLNK $OFFLINE_DB >> $LOGFILE 2>&1 || die "Link of $ROOTLNK to $OFFLINE_DB failed."
}

#------------------------------------------------------------------------------
# Rotate specified log files, and compress with gzip.
#------------------------------------------------------------------------------
rotate_log_file () {
   cd "$LOGS"
   ROTATE_LOGNAME=$1
   GZ_EXT=${2:-}
   LOGID=$(date +'%Y-%m-%d_%H-%M-%S')
   if [[ -f ${ROTATE_LOGNAME} ]]; then
      mv -f ${ROTATE_LOGNAME} ${ROTATE_LOGNAME}.${LOGID} >> $LOGFILE 2>&1
      [[ ! -z "$GZ_EXT" ]] && gzip ${ROTATE_LOGNAME}.${LOGID} >> $LOGFILE 2>&1
   fi
   cd - > /dev/null
}

#------------------------------------------------------------------------------
# At the start of each run for live_checkpoint.sh, daily_checkpoint.sh, and
# recreate_db_checkpoint.sh, before *any* logging activity occurs, rotate the
# logs from the most recent prior run, always named "checkpoint.log" or "log".
#------------------------------------------------------------------------------
rotate_last_run_logs () {
   # Rotate prior log file for the current script.
   rotate_log_file $LOGFILE

   # Rotate prior server log.
   rotate_log_file "log" ".gz"

   # Rotate prior broker log.
   rotate_log_file "p4broker.log" ".gz"

   # Rotate prior audit log.
   rotate_log_file "audit.log" ".gz"
}

#------------------------------------------------------------------------------
# Remove log files matching a specified name prefix, preserving a specified
# number of the recent logs.
#------------------------------------------------------------------------------
remove_log_files () {
   REMOVE_LOGNAME=$1
   KEEPNUM=$2

   for I_LOGFILE in $(ls -t ${REMOVE_LOGNAME}* 2>/dev/null | $AWK "NR > $KEEPNUM"); do
      log "rm -f $I_LOGFILE"
      rm -f "$I_LOGFILE"
   done
}

#------------------------------------------------------------------------------
# Remove old logs.
#------------------------------------------------------------------------------
remove_old_logs () {
   # Remove old Checkpoint Logs
   # Use KEEPJNLS rather than KEEPLOGS, so we keep the same number
   # of checkpoint logs as we keep checkpoints.
   cd "$LOGS"

   if [[ $KEEPJNLS -eq 0 ]]; then
      log "Skipping cleanup of old checkpoint logs because KEEPJNLS is set to 0."
   else
      log "Deleting old checkpoint logs.  Keeping latest $KEEPJNLS, per KEEPJNLS setting in p4_vars."
      remove_log_files "checkpoint.log" $KEEPJNLS
   fi

   if [[ $KEEPLOGS -eq 0 ]]; then
      log "Skipping cleanup of old server logs because KEEPLOGS is set to 0."
   else
      log "Deleting old server logs.  Keeping latest $KEEPLOGS, per KEEPLOGS setting in p4_vars."
      remove_log_files "log" $KEEPLOGS
      remove_log_files "p4broker.log" $KEEPLOGS
      remove_log_files "audit.log" $KEEPLOGS
      remove_log_files "sync_replica.log" $KEEPLOGS
      remove_log_files "recreate_offline_db.log" $KEEPLOGS
      remove_log_files "upgrade.log" $KEEPLOGS
      remove_log_files "p4login" $KEEPLOGS
      remove_log_files "p4verify.log" $KEEPLOGS
      remove_log_files "journal_watch.log" $KEEPLOGS
      remove_log_files "purge_revisions.log" $KEEPLOGS
   fi
   cd - > /dev/null
}

#------------------------------------------------------------------------------
# Set the SDP Checkpoint counter to indicate last successful SDP checkpoint
# operation. For standby servers, set the SDP Checkpoint counter on the master.
#------------------------------------------------------------------------------
set_counter() {
   $P4CBIN/p4login

   if [[ "$EDGESERVER" -eq 1 || "$STANDBYSERVER" -eq 1 ]]; then
      $P4BIN -u $P4USER -p $P4MASTERPORT counter LastSDPCheckpoint.$SERVERID "$(date +'%s (%Y/%m/%d %H:%M:%S %z %Z)')" > /dev/null
   else
      $P4BIN -u $P4USER -p $P4PORT counter LastSDPCheckpoint.$SERVERID "$(date +'%s (%Y/%m/%d %H:%M:%S %z %Z)')" > /dev/null
   fi
}

#	Change	User	Description
#7	25113	Robert Cowham	Merge latest changes from dev
#6	23430	Robert Cowham	Merge in changes from dev
#5	23205	Robert Cowham	Merged all changes from dev to test
#4	22477	Robert Cowham	Bring latest dev changes into test
#3	22142	Robert Cowham	Merge in latest changes from Dev
#2	20726	Robert Cowham	Catch up from dev
#1	18586	Robert Cowham	Branching using cowhamr.sdp.dev
//guest/perforce_software/sdp/dev/Server/Unix/p4/common/bin/backup_functions.sh
#21	18533	Robert Cowham	Put a date/time suffix onto checkpoint.log.* files in case of any errors to avoid them being overwritten. Make remove_old_logs tidy up appropriately.
#20	18532	Robert Cowham	Correct log message regarding journals replays
#19	18484	Russell C. Jackson (Rusty)	Added comment on WAITCOUNT to explain the value.
#18	18450	Russell C. Jackson (Rusty)	Added a kill for the p4d_stop function in case p4d doesn't shut down. In the process of testing this, I discovered that using $P4DBIN in this case was a bug that didn't work when running in case insensitive mode because the executable doesn't match what is actually running since we end up calling p4d from /p4/common/bin. Corrected the grep so that it would match in either case. #review-18430
#17	16335	C. Thomas Tyler	Routine Merge Down to dev from main using: p4 merge -b perforce_software-sdp-dev
#16	16029	C. Thomas Tyler	Routine merge to dev from main using: p4 merge -b perforce_software-sdp-dev
#15	15797	C. Thomas Tyler	Routine Merge Down to dev from main for SDP.
#14	15778	C. Thomas Tyler	Routine Merge Down to dev from main.
#13	15376	adrian_waters	formatting only - fix spacing; there's inconsistent use of tabs/spaces throughout the file - needs cleanup at some point.
#12	15375	adrian_waters	Routine merge-down from main->dev
#11	15374	adrian_waters	- Ensure backup scripts are run as the OSUSER (to prevent accidental running as root); - in scripts where LOGFILE value is changed from the 'checkpoint.log' set by set_vars, ensure the new assignment is before check_dirs is called, otherwise errors could be written to the 'wrong' log - in 'die()' - detect if running from terminal & also send output to stderr
#10	13931	C. Thomas Tyler	Routine merge-down to dev from main.
#9	13906	C. Thomas Tyler	Normalized P4INSTANCE to SDP_INSTANCE to get Unix/Windows implementations in sync. Reasons: 1. Things that interact with SDP in both Unix and Windows environments shoudn't have to account for this obscure SDP difference between Unix and Windows. (I came across this doing CBD work). 2. The Windows and Unix scripts have different variable names for defining the same concept, the SDP instance. Unix uses P4INSTANCE, while Windows uses SDP_INSTANCE. 3. This instance tag, a data set identifier, is an SDP concept. I prefer the SDP_INSTANCE name over P4INSTANCE, so I prpose to normalize to SDP_INSTANCE. 4. The P4INSTANCE name makes it look like a setting that might be recognized by the p4d itself, which it is not. (There are other such things such as P4SERVER that could perhaps be renamed as a separate task; but I'm not sure we want to totally disallow the P4 prefix for variable names. It looks too right to be wrong in same cases, like P4BIN and P4DBIN. That's a discussion for another day, outside the scope of this task). Meanwhile: * Fixed a bug in the Windows 2013.3 upgrade script that was referencing undefined P4INSTANCE, as the Windows environment defined only SDP_INSTANCE. * Had P4INSTANCE been removed completely, this change would likely cause trouble for users doing updates for existing SDP installations. So, though it involves slight technical debt, I opted to keep a redundant definition of P4INSTANCE in p4_vars.template, with comments indicating SDP_INSTANCE should be used in favor of P4INSTANCE, with a warning that P4INSTANCE may go away in a future release. This should avoid unnecessary upgrade pain. * In mkdirs.sh, the varialbe name was INSTANCE rather than SDP_INSTANCE. I changed that as well. That required manual change rather than sub/replace to avoid corrupting other similar varialbe names (e.g. MASTERINSTANCE). This is a trivial change technically (a substitute/replace, plus tweaks in p4_vars.template), but impacts many files.
#8	12169	Russell C. Jackson (Rusty)	Updated copyright date to 2015 Updated shell scripts to require an instance parameter to eliminate the need for calling p4master_run. Python and Perl still need it since you have to set the environment for them to run in. Incorporated comments from reviewers. Left the . instead of source as that seems more common in the field and has the same functionality.
#7	12028	C. Thomas Tyler	Refreshed SDP dev branch, merging down from main.
#6	11541	Russell C. Jackson (Rusty)	Keeping dev up to date.
#5	11535	Russell C. Jackson (Rusty)	Updated dev from main.
#4	11509	Russell C. Jackson (Rusty)	Added sync_replica.log to backup function log rotations, and added rm on existing gzipped logs with the same name in order to keep the script from hanging waiting for a response to overwrite. Added sync_shared_replica.sh and weekly_sync_shared_replica.sh to support replicas with shared depotdata storage. No rsync is necessary. The logs volume must not be a shared volume with these scripts though.
#3	11483	Russell C. Jackson (Rusty)	Brought over changes from RCJ backup_functions.sh
#2	11463	Russell C. Jackson (Rusty)	Updated dev to prepare for Summit agreed changes.
#1	10638	C. Thomas Tyler	Populate perforce_software-sdp-dev.
//guest/perforce_software/sdp/main/Server/Unix/p4/common/bin/backup_functions.sh
#1	10148	C. Thomas Tyler	Promoted the Perforce Server Deployment Package to The Workshop.