# backup_functions.sh #1

#!/bin/bash
#==============================================================================
# Copyright and license info is available in the LICENSE file included with
# the Server Deployment Package (SDP), and also available online:
# https://swarm.workshop.perforce.com/projects/perforce-software-sdp/view/main/LICENSE
#------------------------------------------------------------------------------

set -u

# Common functions used in all backup scripts.

# Verify that every environment variable the backup scripts rely on is set
# and non-empty.
# Globals (read): SDP_INSTANCE P4HOME P4PORT P4ROOT P4JOURNAL P4BIN P4DBIN
#                 P4TICKETS KEEPCKPS KEEPLOGS CHECKPOINTS LOGS OSUSER
# Outputs: on failure, an explanation (including the missing names) on stdout.
# Exits:   with status 1 when any of the variables is unset or empty.
check_vars () {
   local name
   local missing=""
   for name in SDP_INSTANCE P4HOME P4PORT P4ROOT P4JOURNAL P4BIN P4DBIN \
               P4TICKETS KEEPCKPS KEEPLOGS CHECKPOINTS LOGS OSUSER; do
      # Indirect lookup with an empty default: an unset variable is reported
      # below instead of aborting the shell with "unbound variable" (set -u).
      if [[ -z "${!name:-}" ]]; then
         missing="${missing} ${name}"
      fi
   done
   if [[ -n "$missing" ]]; then
      echo "Use p4master_run when calling this script."
      echo "Required external variable not set. Abort!"
      echo "Missing:${missing}"
      exit 1
   fi
}

# Derive the settings shared by the backup scripts and detect an edge server.
# Globals (written): RC, OFFLINE_DB, SAVEDIR, EDGESERVER
# Globals (read):    SDP_INSTANCE, P4HOME, P4ROOT, P4BIN, P4USER, P4PORT,
#                    SERVERID
set_vars () {
   # Use the init script under /etc/init.d when it exists, otherwise the one
   # in the instance's own bin directory.
   RC=/etc/init.d/p4d_${SDP_INSTANCE}_init
   if [[ ! -f "$RC" ]]; then
      RC=/p4/$SDP_INSTANCE/bin/p4d_${SDP_INSTANCE}_init
   fi
   OFFLINE_DB=${P4HOME}/offline_db
   SAVEDIR=${P4ROOT}/save

   # Log in with the password stored in the adminpass file.
   $P4BIN -u $P4USER -p $P4PORT login < /p4/common/bin/adminpass > /dev/null

   # EDGESERVER is 1 when the Services line of this server's spec mentions
   # "edge-server", 0 otherwise.
   if $P4BIN -u $P4USER -p $P4PORT server -o $SERVERID | grep ^Services | grep "edge-server" > /dev/null; then
      EDGESERVER=1
   else
      EDGESERVER=0
   fi
}

# Check that the script is being run by the required OS user.
# Globals (read): OSUSER
# Exits: through die() when the current user name differs from OSUSER.
check_uid () {
   local user
   user=$(id -un)
   # Both sides are quoted: an unquoted right-hand side inside [[ ]] is
   # treated as a glob pattern, so an OSUSER such as "*" would match anyone.
   if [[ "${user}" != "${OSUSER}" ]]; then
      die "Must be run by user: ${OSUSER}. Abort!"
   fi
}

# Append one timestamped line to the log file.
# Arguments: $@ - words of the message; they are joined with single spaces
# Globals (read): LOGFILE
# Line format: "<output of date> <$0>: <message>"
log () {
   # The whole line is produced by a single printf instead of two separate
   # appends.  The redirections are ordered file-first so that "2>&1" really
   # points stderr at the log file (the reverse order leaves it on the
   # caller's stderr).
   printf '%s %s: %s\n' "$(date)" "$0" "$*" >> "$LOGFILE" 2>&1
}

# Decide, depending on our mail utility, how to specify the sender (if we
# need to).
# Globals (read): MAILFROM (may be unset or empty), SDPMAIL
# Outputs: the sender option on stdout; an empty line when MAILFROM is unset
#          or empty.
get_mail_sender_opt () {
   local mail_sender_opt=""
   local mail_ver=""
   # ${MAILFROM:-} keeps an unset MAILFROM from aborting under 'set -u'.
   if [[ -n "${MAILFROM:-}" ]]; then
      # Default for CentOS/RHEL, but allow GNU Mailutils alternative flag instead
      mail_sender_opt="-S from=$MAILFROM"
      # Declared separately above so the assignment is not hidden behind 'local'.
      mail_ver=$($SDPMAIL -V)
      if [[ "$mail_ver" == *"GNU Mailutils"* ]]; then
         mail_sender_opt="-aFrom:$MAILFROM"
      fi
   fi
   printf '%s\n' "$mail_sender_opt"
}

# Mail the contents of the log file.
# Arguments: $1 - subject line
# Globals (read): SDPMAIL, MAILTO, LOGFILE
mail_log_file () {
   local subject=$1
   local -a mail_args
   # The sender option and MAILTO are deliberately left unquoted so that a
   # multi-word value (e.g. "-S from=...") reaches the mail command as
   # separate arguments; only the subject is kept as a single word.
   mail_args=(-s "$subject" $(get_mail_sender_opt) $MAILTO)
   $SDPMAIL "${mail_args[@]}" < "$LOGFILE"
}

# Record an error in the log, mail the log file, and terminate the script.
# Arguments: $@ - words of the error message
# Globals (read): HOSTNAME, P4SERVER
# Exits: always, with status 1.
die () { # send mail and exit
   # "$*" joins every argument into ONE word.  With "$@" inside the string a
   # multi-argument call expanded to several words and mail_log_file, which
   # only reads $1, silently dropped the rest of the message.
   local message="ERROR!!! - $HOSTNAME $P4SERVER $0: $*"

   # mail the error (with more helpful subject line than cron)
   log "$message"
   mail_log_file "$message"

   # if running from terminal, also send to stderr
   if tty > /dev/null; then
      echo "$*" >&2
   fi
   exit 1
}

# Create the "checkpoint running" semaphore file, refusing to continue when
# one is already present.
# Globals (read): LOGS
# Exits: through die() when the semaphore already exists or cannot be written.
ckp_running() {
   local semaphore="${LOGS}/ckp_running.txt"
   if [[ -f "$semaphore" ]]; then
      die "Last checkpoint not complete. Check the backup process or contact support."
   fi
   # The path is quoted (an unquoted redirect target containing a space is an
   # "ambiguous redirect") and a failed write is no longer ignored.
   echo "Checkpoint running." > "$semaphore" || die "Cannot create $semaphore. Abort!"
}

# Remove the "checkpoint running" semaphore file; a missing file is not an
# error.
# Globals (read): LOGS
ckp_complete() {
   # Quoted so a LOGS path containing spaces still names the right file.
   rm -f -- "${LOGS}/ckp_running.txt"
}

# Abort unless the given path is writable.
# Arguments: $1 - path to test
# Exits: through die() when the path is not writable.
checkdir () {
   local target=$1
   if [[ ! -w $target ]]; then
      die "$target is not writable. Abort!"
   fi
}

# Check that the key directories are writable.
# Globals (read): OFFLINE_DB, CHECKPOINTS, LOGS
# Exits: through checkdir/die on the first directory that is not writable.
check_dirs () {
   # The loop variable is local so it does not leak into the caller, and each
   # path is quoted so that it is passed to checkdir as exactly one argument.
   local dir
   for dir in "$OFFLINE_DB" "$CHECKPOINTS" "$LOGS"; do
      checkdir "$dir"    # aborts on failure.
   done
}

# Record the output of the "diskspace" command of $P4BIN in the log file.
# Globals (read): P4BIN, LOGFILE
check_disk_space () {
   local announcement="Checking disk space..."
   log "$announcement"
   # Both stdout and stderr of the command are appended to the log.
   { $P4BIN diskspace; } >> "$LOGFILE" 2>&1
}

# Validate that a journal number consists only of decimal digits.
# Arguments: $1 - the value to validate (a missing argument counts as empty)
# Exits: through die() when the value is empty or not purely numeric.
check_journalnum () {
   # ${1:-} lets a missing argument reach the check below (and die) instead
   # of aborting with "unbound variable" under 'set -u'.
   local jnlnum=${1:-}
   # Local, so the pattern does not overwrite a caller's variable named "re".
   local re='^[0-9]+$'
   if ! [[ $jnlnum =~ $re ]]; then
      die "Journal number must be numeric."
   fi
}

# Get the current journal and checkpoint serial numbers.
# Globals (read):    P4BIN, P4USER, P4PORT, LOGFILE, EDGESERVER
# Globals (written): JOURNALNUM, CHECKPOINTNUM
# Exits: through die() when the counter cannot be read, or through
#        check_journalnum when the value is not numeric.
get_journalnum () {
   JOURNALNUM=$($P4BIN -u "$P4USER" -p "$P4PORT" counter journal 2>> "$LOGFILE") || die "Cannot get the checkpoint number. Abort!"
   # Quoted so that an empty result is still handed over as an (empty)
   # argument and rejected, instead of vanishing from the argument list.
   check_journalnum "$JOURNALNUM"

   # If we are on an edge server, the journal has already rotated, so we have to decrement the value
   # so that we replay the correct journal file and create the correct checkpoint number on the
   # edge server.
   if [[ $EDGESERVER -eq 1 ]]; then
      JOURNALNUM=$((JOURNALNUM - 1))
   fi
   CHECKPOINTNUM=$((JOURNALNUM + 1))
}

# Get the journal number of the offline database.
# Globals (read):    OFFLINE_DB, P4DBIN, LOGFILE
# Globals (written): OFFLINEJNLNUM
# Exits: through die() when the offline database is not marked usable, has no
#        db.counters file, or its journal counter cannot be read.
get_offline_journal_num () {
   if [[ ! -f "$OFFLINE_DB/offline_db_usable.txt" ]]; then
      die "Offline database not in a usable state. Check the backup process."
   fi
   if [[ ! -f "$OFFLINE_DB/db.counters" ]]; then
      die "Offline database not found. Consider creating it with live_checkpoint.sh. Be aware that it locks the live system and can take a long time! Abort!"
   fi
   # 'set -o pipefail' only affects the command substitution's own subshell.
   # With it, a failing dump (or a dump without a journal counter) makes the
   # whole pipeline fail and reaches the die below; without it only the exit
   # status of cut was ever looked at.  The error output of the dump command
   # itself (not just that of cut) is appended to the log file.
   OFFLINEJNLNUM=$(set -o pipefail; $P4DBIN -r "$OFFLINE_DB" -jd - db.counters 2>> "$LOGFILE" | grep '@journal@' | cut -d "@" -f 8) || die "Cannot get the offline journal number. Abort!"
   check_journalnum "$OFFLINEJNLNUM"
   log "Offline journal number is: $OFFLINEJNLNUM"
}

# Delete checkpoint and journal files beyond the newest KEEPCKPS of each.
# "Newest" is by modification time (ls -t).  KEEPCKPS=0 disables the cleanup.
# Globals (read): KEEPCKPS, CHECKPOINTS, P4SERVER
remove_old_checkpoints_and_journals () {
   local old_file

   if [[ $KEEPCKPS -eq 0 ]]; then
      log "Skipping cleanup of old checkpoints because KEEPCKPS is set to 0."
      return 0
   fi

   log "Deleting obsolete checkpoints and journals. Keeping latest $KEEPCKPS  per KEEPCKPS setting in p4_vars."

   # Remove selected checkpoint and journal files based on the KEEPCKPS setting regardless of whether compressed or not.
   # We multiply KEEPCKP by 2 for the ckp files because of the md5 files.
   #
   # The listing is read one line at a time, so a path containing spaces stays
   # in one piece, and the keep count is handed to awk with -v instead of
   # being pasted into the awk program text.
   while IFS= read -r old_file; do
      log "rm -f $old_file"
      rm -f -- "$old_file"
   done < <(ls -td "${CHECKPOINTS}/${P4SERVER}".ckp.* 2>/dev/null | awk -v keep="$KEEPCKPS" 'NR > keep * 2')

   while IFS= read -r old_file; do
      log "rm -f $old_file"
      rm -f -- "$old_file"
   done < <(ls -td "${CHECKPOINTS}/${P4SERVER}".jnl.* 2>/dev/null | awk -v keep="$KEEPCKPS" 'NR > keep')
}

# Stop the p4d server through its init script and wait until no matching
# process is left, sending a kill after roughly ten minutes of waiting.
# Globals (read): RC, LOGFILE, P4INSTANCE (optional), SDP_INSTANCE
stop_p4d () {
   # P4INSTANCE is honoured when set; otherwise SDP_INSTANCE (the name used
   # by the rest of this file) is used instead of aborting under 'set -u'.
   local instance="${P4INSTANCE:-$SDP_INSTANCE}"
   # The trailing group requires a non-alphanumeric character (or end of
   # line) after the instance name, so stopping instance "1" neither waits
   # for nor kills the processes of instances "10", "11", ...
   local pattern="p4d_${instance}([^[:alnum:]]|\$)"
   # Integer variables: the count is compared numerically, which also copes
   # with wc implementations that pad their output with spaces.
   local -i running=0
   local -i waitcount=0

   log "Shutting down the p4 server"
   $RC stop >> "$LOGFILE" 2>&1
   running=$(ps -ef | grep -E -i -- "$pattern" | grep -v grep | wc -l)
   while (( running > 0 )); do
      sleep 5
      running=$(ps -ef | grep -E -i -- "$pattern" | grep -v grep | wc -l)
      waitcount=$(( waitcount + 1 ))
      # The waitcount value below is 120 * 5 seconds = 10 minutes.
      # If p4d hasn't shut down by then, something is not shutting down on its own, so we kill it.
      if (( waitcount > 120 && running > 0 )); then
         ps -ef | grep -E -i -- "$pattern" | grep -v grep | awk '{print $2}' | xargs kill > /dev/null 2>&1
      fi
   done
   log "p4 stop finished -- p4 should be down now."
}

# Start the p4d server through its init script and log whether it answers.
# Globals (read): RC, LOGFILE, P4BIN, P4USER, P4PORT
start_p4d () {
   local outcome

   log "Starting the p4 server"
   $RC start >> "$LOGFILE" 2>&1
   sleep 3 # Give it a few seconds to start up

   # The server counts as started when the "info" command succeeds.
   if $P4BIN -u $P4USER -p $P4PORT info >/dev/null 2>&1 ; then
      outcome="Server restarted successfully - p4 should be back up now."
   else
      outcome="Error: Server does not appear to have started."
   fi
   log "$outcome"
}

# Rotate (truncate) the live journal unless this is an edge server.
# Globals (read): CHECKPOINTS, P4SERVER, CHECKPOINTNUM, JOURNALNUM, EDGESERVER,
#                 P4DBIN, P4ROOT, P4JOURNAL, LOGFILE
# Exits: through die() when the target checkpoint or journal file already
#        exists, or when the rotation fails (start_p4d is called first).
truncate_journal () {
   local ckp_file=${CHECKPOINTS}/${P4SERVER}.ckp.${CHECKPOINTNUM}.gz
   local jnl_file

   if [[ -f $ckp_file ]]; then
      die "Checkpoint $ckp_file already exists, check the backup process."
   fi

   # Nothing further is done when EDGESERVER is not 0.
   if [[ $EDGESERVER -ne 0 ]]; then
      return 0
   fi

   jnl_file=${CHECKPOINTS}/${P4SERVER}.jnl.${JOURNALNUM}
   if [[ -f $jnl_file ]]; then
      die "Journal $jnl_file already exists, check the backup process."
   fi

   log "Truncating journal..."
   # 'p4d -jj' does a copy-then-delete, instead of a simple mv.
   # during 'p4d -jj' the perforce server will hang the responses to clients.
   # The braces let the redirection capture what 'time' reports as well.
   if ! { time $P4DBIN -r $P4ROOT -J $P4JOURNAL -jj ${CHECKPOINTS}/${P4SERVER}; } >> "$LOGFILE" 2>&1; then
      start_p4d
      die "Journal rotation failed. Abort!"
   fi
}

# Replay the numbered journals OFFLINEJNLNUM..JOURNALNUM into the offline db.
# Globals (read):    OFFLINEJNLNUM, JOURNALNUM, P4DBIN, OFFLINE_DB, CHECKPOINTS,
#                    P4SERVER, LOGFILE
# Globals (written): j (loop counter; ends at the first number not replayed)
# Exits: through die() on the first replay that fails.
replay_journals_to_offline_db () {
   log "Replay any unreplayed journals to the offline database"
   j=$(( $OFFLINEJNLNUM ))
   while (( $j <= $JOURNALNUM )); do
      log "Replay journal ${P4SERVER}.jnl.${j} to offline db."
      # The braces let the redirection capture what 'time' reports as well.
      if ! { time $P4DBIN -r $OFFLINE_DB -jr -f ${CHECKPOINTS}/${P4SERVER}.jnl.${j}; } >> "$LOGFILE" 2>&1; then
         die "Offline journal replay failed. Abort!"
      fi
      j=$(( j + 1 ))
   done
}

# Replay the active journal file (P4JOURNAL) into the offline db.
# Globals (read): P4DBIN, OFFLINE_DB, P4JOURNAL, LOGFILE
# Exits: through die() when the replay fails.
replay_active_journal_to_offline_db () {
   log "Replay active journal to offline db."
   # The braces let the redirection capture what 'time' reports as well.
   if ! { time $P4DBIN -r $OFFLINE_DB -jr -f ${P4JOURNAL}; } >> "$LOGFILE" 2>&1; then
      die "Active Journal replay failed. Abort!"
   fi
}

# Rebuild the offline database from the most recently modified compressed
# checkpoint.
# Globals (read):    CHECKPOINTS, P4SERVER, OFFLINE_DB, P4DBIN, LOGFILE
# Globals (written): LASTCKP
# Exits: through die() when no checkpoint exists or the restore fails.
recreate_offline_db_files () {
   # Locate the checkpoint BEFORE deleting anything.  The previous guard,
   # '[[ -f <glob> ]] && ... die', could never fire: [[ ]] does not expand
   # globs, so it tested for a file literally named "*.gz" (and would have
   # died when that file existed rather than when it was missing).
   LASTCKP=$(ls -t "${CHECKPOINTS}/${P4SERVER}".ckp.*.gz 2>/dev/null | head -1)
   if [[ -z "$LASTCKP" ]]; then
      ckp_complete
      die "No checkpoints found - run live_checkpoint.sh"
   fi

   rm -f "${OFFLINE_DB}/offline_db_usable.txt"
   # ${OFFLINE_DB:?} aborts instead of expanding to "/db.*" if the variable
   # is ever empty.
   rm -f "${OFFLINE_DB:?}"/db.* >> "$LOGFILE"
   log "Recovering from $LASTCKP"
   # curly braces are necessary to capture the output of 'time'
   { time $P4DBIN -r "$OFFLINE_DB" -jr -z "${LASTCKP}"; } >> "$LOGFILE" 2>&1 || { die "Restore of checkpoint to $OFFLINE_DB failed!"; }
   # The semaphore is only written after a successful restore.
   echo "Offline db file restored successfully." > "${OFFLINE_DB}/offline_db_usable.txt"
}

# Create a new checkpoint from the live db files in P4ROOT.
# Globals (read): P4DBIN, P4ROOT, CHECKPOINTS, P4SERVER, LOGFILE
# Exits: through die() when the checkpoint command fails.
checkpoint () {
   log "Create a new checkpoint from the live db files."
   # The braces let the redirection capture what 'time' reports as well.
   if ! { time $P4DBIN -r $P4ROOT -jc -Z ${CHECKPOINTS}/${P4SERVER}; } >>"$LOGFILE" 2>&1; then
      die "ERROR - New checkpoint failed!"
   fi
}

# Dump a compressed checkpoint numbered CHECKPOINTNUM from the db files in
# ROOTDIR.
# Globals (read): ROOTDIR, P4DBIN, CHECKPOINTS, P4SERVER, CHECKPOINTNUM, LOGFILE
# Exits: through die() when the dump fails.
dump_checkpoint () {
   log "Dump out new checkpoint from db files in $ROOTDIR."
   # The braces let the redirection capture what 'time' reports as well.
   if ! { time $P4DBIN -r $ROOTDIR -jd -z ${CHECKPOINTS}/${P4SERVER}.ckp.${CHECKPOINTNUM}.gz; } >> "$LOGFILE" 2>&1; then
      die "New checkpoint dump failed!"
   fi
}

# Verify that the offline database and the database in P4ROOT are at the
# same journal number.
# Globals (read): OFFLINE_DB, P4ROOT, P4DBIN, LOGFILE
# Exits: through die() when either database is missing or unusable, when a
#        journal counter cannot be read, or when the two numbers differ.
compare_journal_numbers () {
   # Declared separately from the assignments below: 'local var=$(cmd) || die'
   # tests the exit status of 'local' (always 0), so die could never run.
   local offline_jnlnum
   local root_jnlnum

   # Get the journal number of the offline database
   if [[ ! -f "$OFFLINE_DB/offline_db_usable.txt" ]]; then
      die "Offline database not in a usable state. Check the backup process."
   fi
   if [[ ! -f "$OFFLINE_DB/db.counters" ]]; then
      die "Offline database not found. Consider creating it with live_checkpoint.sh. Be aware that it locks the live system and can take a long time! Abort!"
   fi
   # 'set -o pipefail' is confined to the command substitution's subshell; it
   # lets a failing dump (or a missing journal counter) reach the die.  The
   # dump command's own error output is appended to the log file.
   offline_jnlnum=$(set -o pipefail; $P4DBIN -r "$OFFLINE_DB" -jd - db.counters 2>> "$LOGFILE" | grep '@journal@' | cut -d "@" -f 8) || die "Cannot get $OFFLINE_DB journal number. Abort!"
   check_journalnum "$offline_jnlnum"

   # Get the journal number of the root database
   if [[ ! -f "$P4ROOT/db.counters" ]]; then
      die "$P4ROOT database not found. Something is seriously wrong since the server was just running a minute ago! Contact support@perforce.com"
   fi
   root_jnlnum=$(set -o pipefail; $P4DBIN -r "$P4ROOT" -jd - db.counters 2>> "$LOGFILE" | grep '@journal@' | cut -d "@" -f 8) || die "Cannot get $P4ROOT journal number. Abort!"
   check_journalnum "$root_jnlnum"

   if [[ $root_jnlnum -ne $offline_jnlnum ]]; then
      log "$P4ROOT journal number is: $root_jnlnum"
      log "$OFFLINE_DB journal number is: $offline_jnlnum"
      die "$P4ROOT and $OFFLINE_DB numbers do not match."
   fi
}

# Swap the offline db files into P4ROOT, keeping the previous live db files
# in SAVEDIR.
# Globals (read): SAVEDIR, P4ROOT, OFFLINE_DB, LOGFILE
# Exits: through compare_journal_numbers/die when the journal numbers do not
#        match, or through die() when the offline files cannot be moved.
switch_db_files () {
   # Compare the Offline and Master journal numbers before switching to make sure they match.
   compare_journal_numbers
   log "Switching out db files..."
   # All paths and redirect targets are quoted so directories containing
   # spaces work; the ${VAR:?} forms abort rather than let an empty variable
   # turn a pattern into "/db.*".
   [[ -d "$SAVEDIR" ]] || mkdir -p "$SAVEDIR"
   rm -f "${SAVEDIR:?}"/db.* >> "$LOGFILE" 2>&1
   mv "${P4ROOT:?}"/db.* "$SAVEDIR" >> "$LOGFILE" 2>&1
   # The offline db stops being usable as soon as its files start moving.
   rm -f "${OFFLINE_DB}/offline_db_usable.txt"
   mv "${OFFLINE_DB:?}"/db.* "$P4ROOT" >> "$LOGFILE" 2>&1 || die "Move of offline db file to $P4ROOT failed."
}

# Rename a log file to "<name>.<LOGID>" and optionally gzip the result.
# Arguments: $1 - name of the log file to rotate
#            $2 - optional; any non-empty value requests gzip compression
# Globals (read): LOGID, LOGFILE
# Returns: 0 when there was nothing to rotate or no compression was asked
#          for; otherwise the exit status of gzip.
rotate_log_file () {
   local logname=$1
   local gz_ext=${2:-}
   if [[ -f "$logname" ]]; then
      mv -f "$logname" "${logname}.${LOGID}" >> "$LOGFILE" 2>&1
      # An explicit 'if' rather than '[[ ... ]] && gzip': the short form made
      # the function return 1 whenever compression was not requested.
      if [[ -n "$gz_ext" ]]; then
         gzip "${logname}.${LOGID}" >> "$LOGFILE" 2>&1
      fi
   fi
}

# At the start of each run for live_checkpoint.sh, daily_backup.sh, and
# weekly_backup.sh, before *any* logging activity occurs, rotate the logs
# from the most recent prior run, always named "checkpoint.log" or "log".
# Globals (read):    LOGS, JOURNALNUM, LOGFILE
# Globals (written): LOGID (suffix given to every rotated file)
# Exits: through die() when the LOGS directory cannot be entered.
rotate_last_run_logs () {
   # Without this check a failed cd would rotate same-named files in
   # whatever directory the caller happened to be in.
   cd "$LOGS" || die "Cannot cd to $LOGS. Abort!"
   LOGID=${JOURNALNUM}.$(date +'%Y-%m-%d_%H-%M-%S')

   # Rotate prior checkpoint.log
   rotate_log_file "$LOGFILE"

   # Rotate prior server log.
   rotate_log_file "log" ".gz"

   # Rotate prior broker log.
   rotate_log_file "p4broker.log" ".gz"

   # Rotate prior audit log.
   rotate_log_file "audit.log" ".gz"

   # Rotate prior sync_replica log.
   rotate_log_file "sync_replica.log" ".gz"

   # Return to the directory we started in.
   cd - > /dev/null
}

# Delete all but the most recently modified files whose names start with a
# given prefix.
# Arguments: $1 - file name prefix (matched as "<prefix>*", so relative
#                 prefixes refer to the current directory)
#            $2 - number of files to keep; optional, defaults to $KEEPLOGS
remove_log_files () {
   local prefix=$1
   local keep=${2:-$KEEPLOGS}
   local old_file

   # The listing is read line by line so names containing spaces stay whole;
   # -d lists a matching directory itself rather than its contents, and the
   # keep count is handed to awk with -v instead of being pasted into the
   # awk program text.
   while IFS= read -r old_file; do
      log "rm -f $old_file"
      rm -f -- "$old_file"
   done < <(ls -td -- "${prefix}"* 2>/dev/null | awk -v keep="$keep" 'NR > keep')
}

# Remove old checkpoint logs and old server logs from the LOGS directory.
# Globals (read): LOGS, KEEPCKPS, KEEPLOGS
# Exits: through die() when the LOGS directory cannot be entered.
remove_old_logs () {
   # Never delete by relative name from the wrong directory.
   cd "$LOGS" || die "Cannot cd to $LOGS. Abort!"

   # Remove old Checkpoint Logs
   # Use KEEPCKPS rather than KEEPLOGS, so we keep the same number
   # of checkpoint logs as we keep checkpoints.
   if [[ $KEEPCKPS -eq 0 ]]; then
      log "Skipping cleanup of old checkpoint logs because KEEPCKPS is set to 0."
   else
      log "Deleting old checkpoint logs.  Keeping latest $KEEPCKPS, per KEEPCKPS setting in p4_vars."
      # KEEPCKPS is passed explicitly; previously KEEPLOGS was applied here,
      # contradicting both the comment above and the log message.
      remove_log_files "checkpoint.log" "$KEEPCKPS"
   fi

   if [[ $KEEPLOGS -eq 0 ]]; then
      log "Skipping cleanup of old server logs because KEEPLOGS is set to 0."
   else
      log "Deleting old server logs.  Keeping latest $KEEPLOGS, per KEEPLOGS setting in p4_vars."
      remove_log_files "log"
      remove_log_files "p4broker.log"
      remove_log_files "audit.log"
      remove_log_files "sync_replica.log"
      remove_log_files "recreate_offline_db.log"
      remove_log_files "upgrade.log"
   fi
   cd - > /dev/null
}

# Log in with the stored admin password and set the lastSDPCheckpoint
# counter to the current time (epoch seconds plus a readable date).
# Globals (read): P4BIN, P4USER, P4PORT
set_counter() {
   local -a p4cmd
   local stamp

   # Command prefix shared by both invocations below.
   p4cmd=($P4BIN -u $P4USER -p $P4PORT)

   "${p4cmd[@]}" login < /p4/common/bin/adminpass > /dev/null
   stamp=$(date +'%s (%Y/%m/%d %H:%M:%S %z %Z)')
   "${p4cmd[@]}" counter lastSDPCheckpoint "$stamp" > /dev/null
}

#	Change	User	Description
#1	19278	trina	"Forking branch Main of perforce-software-sdp to trina-sdp."
# //guest/perforce_software/sdp/main/Server/Unix/p4/common/bin/backup_functions.sh
#31	18961	C. Thomas Tyler	Released: SDP/MultiArch/2016.1/18958 (2016/04/08).
#30	18619	Russell C. Jackson (Rusty)	Updating main with current changes.
#29	18530	Russell C. Jackson (Rusty)	Update main from dev.
#28	16155	Russell C. Jackson (Rusty)	Removed check code that probably hasn't ever worked. Deleted mirror_ldap* since that functionality is built into the server now.
#27	15856	C. Thomas Tyler	Replaced the big license comment block with a shortened form referencing the LICENSE file included with the SDP package, and also by the URL for the license file in The Workshop.
#26	15784	Russell C. Jackson (Rusty)	Added missing bracket.
#25	15780	C. Thomas Tyler	Per Robert: Style police causing problems :) Fixed!
#24	15777	C. Thomas Tyler	No functional changes. Style Policing only on bash scripts only. Normalized indentation and line breaks, removed offending tabs, and general whitespace usage.
#23	15609	C. Thomas Tyler	Pushing SDP 2015.1.15607 (2015/09/02).
#22	15197	Russell C. Jackson (Rusty)	Corrected versions from testing.
#21	15193	Russell C. Jackson (Rusty)	Added semaphore file to indicate state of the offline database and added check into the backup process to fail if the state of the offline db is not good.
#20	15190	Russell C. Jackson (Rusty)	Added a semaphore file to prevent the checkpoint process from running if another one hasn't finished. Added a check to make sure the journal number is numeric.
#19	13928	dsp	Set lastSDPCounter after a successfull SDP checkpoint p4 admin checkpoint sets lastCheckpointAction, which is useful for monitoring, in particular when checkpoint age should be observed from the outside through p4. However the SDP is using p4d directly to create checkpoints and will not set checkpoints. In order to distinguish human actions from the SDP cronjobs, set a new counter lastSDPCounter in a similar format.
#18	13908	C. Thomas Tyler	Pushing SDP 2015.1.13906.
#17	12171	Russell C. Jackson (Rusty)	Merge in changes to remove the need for p4master_run.
#16	11950	Russell C. Jackson (Rusty)	Made die function record ERROR!!! $HOSTNAME and $P4SERVER in subject. Cleaned up message passed to die command and corrected a typo.
#15	11929	Russell C. Jackson (Rusty)	Updated die function to just pass parameter to mail_log_file instead of echo.
#14	11919	Russell C. Jackson (Rusty)	Added a SERVERID variable to p4_vars and updated backup_functions to use it. Changed the location and the names of the config files so that they could live in /p4/common/config (You're welcome Tom). The files names are: p4_$INSTANCE.vars p4_$INSTANCE.p4review.cfg p4_$INSTANCE.vars will now set P4REPLICA to FALSE if SERVERID matches MASTERNAME, otherwise it is TRUE. This change means that a user must change server.id now in order to change the role of the server rather than changing the instance vars file. This makes more sense to a user that is reading the admin guide about server.id rather than overwriting the file based on a setting that isn't in the admin guide. Change mkdirs to reflect all of the above changes.
#13	11908	adrian_waters	Use set -u to trap unbounded variables
#12	11886	Russell C. Jackson (Rusty)	Changed $prog to $0 so that we don't have to set prog in the calling functions.
#11	11766	Robert Cowham	Missed a reference to $MAIL in @11758 Tweaked run_tests.sh to output more error messages on failure. Though this still doesn't show output of individual failed commands.
#10	11758	Russell C. Jackson (Rusty)	Change MAIL variable to SDPMAIL to avoid conflicts with customer variables. Changed sdp_sync.sh to use get_mail_opts from backup_functions to avoid duplicate functions.
#9	11730	Russell C. Jackson (Rusty)	Moved P4SERVER variable to p4_vars so that all scripts can use it properly. replica_status.sh referenced it, but it wasn't working since it was only in backup_functions.sh
#8	11710	Russell C. Jackson (Rusty)	Changed die function to call new email function. Added su to OSUSER functionality to p4master_run to avoid problems with people running scripts manually as root by mistake.
#7	11707	Robert Cowham	Refactored sending of mail to a common function. Make the setting of "MAILFROM" work for Ubuntu (GNU Mailutils) as well as CentOS
#6	11570	Russell C. Jackson (Rusty)	Brought in changes from Mark Foundry to add -S $MAILFROM to mail commands. Changed sync_replica.sh and weekly_sync_replica.sh to use $LOGFILE for consistency. Added mail command to both files as well.
#5	11540	Russell C. Jackson (Rusty)	Converted to unix format.
#4	11534	Russell C. Jackson (Rusty)	Added -f to -jr to cover offline obliterates where the entries are already removed from the offline database.
#3	11524	Russell C. Jackson (Rusty)	Released updated version of the SDP from Dev.
#2	11130	Robert Cowham	Check for the existence of offline database and log error message if not found.
#1	10148	C. Thomas Tyler	Promoted the Perforce Server Deployment Package to The Workshop.