#!/bin/bash
#==============================================================================
# Copyright and license info is available in the LICENSE file included with
# the Server Deployment Package (SDP), and also available online:
# https://swarm.workshop.perforce.com/projects/perforce-software-sdp/view/main/LICENSE
#------------------------------------------------------------------------------

# Treat expansion of an unset variable as an error from here on. Because this
# is a shell option, it also applies to any shell that sources this file.
set -u

# Common functions used in all backup scripts.

# Verify that every environment variable the backup functions depend on is
# set and non-empty.
# Globals read: SDP_INSTANCE P4HOME P4PORT P4ROOT P4JOURNAL P4BIN P4DBIN
#               P4TICKETS KEEPCKPS KEEPJNLS KEEPLOGS CHECKPOINTS LOGS OSUSER
# Outputs:      a hint on stdout (including the offending names) on failure.
# Exits:        1 if any of the variables is unset or empty.
check_vars () {
   local name
   local missing=""
   for name in SDP_INSTANCE P4HOME P4PORT P4ROOT P4JOURNAL P4BIN P4DBIN \
      P4TICKETS KEEPCKPS KEEPJNLS KEEPLOGS CHECKPOINTS LOGS OSUSER; do
      # Indirect expansion with an empty default: with 'set -u' active, a plain
      # "$VAR" on an unset variable would abort the shell before the message
      # below could be printed.
      if [[ -z "${!name:-}" ]]; then
         missing="${missing} ${name}"
      fi
   done
   if [[ -n "$missing" ]]; then
      echo "Use p4master_run or source p4_vars when calling this script."
      echo "Required external variable not set. Abort!"
      echo "Missing or empty:${missing}"
      exit 1
   fi
}

# Derive the settings shared by the backup functions and detect whether this
# server is an edge server.
# Globals read:    P4HOME SDP_INSTANCE P4ROOT P4CBIN P4BIN P4USER P4PORT SERVERID
# Globals written: RC OFFLINE_DB SAVEDIR EDGESERVER
set_vars () {
   EDGESERVER=0
   RC="${P4HOME}/bin/p4d_${SDP_INSTANCE}_init"
   OFFLINE_DB="${P4HOME}/offline_db"
   SAVEDIR="${P4ROOT}/save"
   $P4CBIN/p4login
   # EDGESERVER becomes 1 only when the server spec has a "Services" line
   # that mentions edge-server.
   if $P4BIN -u $P4USER -p $P4PORT server -o $SERVERID | grep ^Services | grep "edge-server" > /dev/null; then
      EDGESERVER=1
   fi
}

# Abort (via die) unless the current OS user, as reported by 'id -un', is
# exactly $OSUSER.
# Globals read: OSUSER
check_uid () {
   local user
   user=$(id -un)
   # The right-hand side must be quoted: unquoted, [[ != ]] treats it as a
   # glob pattern instead of a literal string.
   if [[ "${user}" != "${OSUSER}" ]]; then
      die "Must be run by user: ${OSUSER}. Abort!"
   fi
}

# Append one line to $LOGFILE of the form "<date> <script>: <message>".
# Arguments: the message; several arguments are joined with single spaces.
# Globals read: LOGFILE
log () {
   # $(date) is deliberately left unquoted so the timestamp keeps the
   # single-space-separated form earlier log lines used. The whole line is
   # written in one append, and stdout is redirected before stderr is
   # duplicated onto it so both end up in the log file.
   echo $(date) "$0: $*" >> "$LOGFILE" 2>&1
}

# Print the option the mail utility needs in order to set the sender address,
# or an empty line when no sender is configured.
# Globals read: MAILFROM (optional), SDPMAIL
# Outputs:      "-S from=<addr>" by default, "-aFrom:<addr>" when the output
#               of "$SDPMAIL -V" mentions GNU Mailutils.
get_mail_sender_opt () {
   local mail_sender_opt=""
   local mail_ver=""
   # MAILFROM is optional, so read it with an empty default; otherwise
   # 'set -u' aborts here when it was never set.
   if [[ -n "${MAILFROM:-}" ]]; then
      # Default for CentOS/RHEL, but allow GNU Mailutils alternative flag instead
      mail_sender_opt="-S from=$MAILFROM"
      mail_ver=$($SDPMAIL -V)
      if [[ "$mail_ver" == *"GNU Mailutils"* ]]; then
         mail_sender_opt="-aFrom:$MAILFROM"
      fi
   fi
   echo "$mail_sender_opt"
}

# Mail the contents of $LOGFILE to $MAILTO.
# Arguments: $1 - subject line.
# Globals read: SDPMAIL MAILTO LOGFILE
mail_log_file () {
   local subj=$1
   local sender_args
   sender_args=$(get_mail_sender_opt)
   # sender_args and MAILTO are expanded unquoted so that each word in them
   # becomes a separate argument to the mail command.
   $SDPMAIL -s "$subj" $sender_args $MAILTO < "$LOGFILE"
}

# Log an error, mail the log file with a descriptive subject, and exit 1.
# Arguments: the error message; several arguments are joined with spaces.
# Globals read: HOSTNAME P4SERVER
die () { # send mail and exit
   # Build the text once with $* so it is always a single argument. With $@
   # inside the string, extra arguments became separate words and the mail
   # subject lost everything after the first one.
   local msg="ERROR!!! - $HOSTNAME $P4SERVER $0: $*"

   # mail the error (with more helpful subject line than cron)
   log "$msg"
   mail_log_file "$msg"

   # if running from terminal, also send to stderr
   if tty > /dev/null; then
      echo "$@" >&2
   fi
   exit 1
}

# Record that a checkpoint is in progress by creating
# ${LOGS}/ckp_running.txt; abort via die if that marker already exists.
# Globals read: LOGS
ckp_running() {
   local marker="${LOGS}/ckp_running.txt"
   if [[ -f "$marker" ]]; then
      die "Last checkpoint not complete. Check the backup process or contact support."
   fi
   # Quoted: an unquoted redirect target that expands to several words is
   # rejected by bash as an ambiguous redirect.
   echo "Checkpoint running." > "$marker"
}

# Remove the ${LOGS}/ckp_running.txt marker written by ckp_running.
# Globals read: LOGS
ckp_complete() {
   rm -f -- "${LOGS}/ckp_running.txt"
}

# Abort via die unless the given path is writable.
# Arguments: $1 - path to test.
checkdir () {
   local target=$1
   if [[ ! -w "$target" ]]; then
      die "$target is not writable. Abort!"
   fi
}

# Check that the offline db, checkpoints and logs directories are writable.
# Globals read: OFFLINE_DB CHECKPOINTS LOGS
check_dirs () {
   # Set but not read anywhere in the visible code; kept in case another
   # script looks at it.
   dirs_ok=true
   local dir
   # Each path is quoted so it reaches checkdir as exactly one argument.
   for dir in "$OFFLINE_DB" "$CHECKPOINTS" "$LOGS"; do
      checkdir "$dir"    # aborts on failure.
   done
}

# Append the output (stdout and stderr) of "$P4BIN diskspace" to the log file.
# Globals read: P4BIN LOGFILE
check_disk_space () {
   log "Checking disk space..."
   { $P4BIN diskspace; } >> "$LOGFILE" 2>&1
}

# Abort via die unless the argument consists only of decimal digits.
# Arguments: $1 - candidate journal number. A missing argument is treated as
#                 empty and therefore rejected.
check_journalnum () {
   # Default to empty: callers may end up passing no argument at all when
   # their value is empty, and a bare $1 would then trip 'set -u' instead of
   # reaching die.
   local JNLNUM=${1:-}
   local re='^[0-9]+$'
   if ! [[ $JNLNUM =~ $re ]] ; then
      die "Journal number must be numeric."
   fi
}

# Read the server's "journal" counter and derive the journal and checkpoint
# numbers for this run.
# Globals read:    P4BIN P4USER P4PORT LOGFILE EDGESERVER
# Globals written: JOURNALNUM CHECKPOINTNUM
get_journalnum () {
   # get the current journal and checkpoint serial numbers.
   JOURNALNUM=$($P4BIN -u "$P4USER" -p "$P4PORT" counter journal 2>> "$LOGFILE") || die "Cannot get the checkpoint number. Abort!"
   # Quoted so the whole value is validated, not just its first word.
   check_journalnum "$JOURNALNUM"

   # If we are on an edge server, the journal has already rotated, so we have to decrement the value
   # so that we replay the correct journal file and create the correct checkpoint number on the
   # edge server.
   if [[ $EDGESERVER -eq 1 ]]; then
      JOURNALNUM=$((JOURNALNUM - 1))
   fi
   CHECKPOINTNUM=$((JOURNALNUM + 1))
}

# Abort via die unless the offline database directory holds both the
# offline_db_usable.txt marker and a db.counters file.
# Globals read: OFFLINE_DB
check_offline_db_usable () {
   [[ -f "$OFFLINE_DB/offline_db_usable.txt" ]] \
      || die "Offline database not in a usable state. Check the backup process."
   [[ -f "$OFFLINE_DB/db.counters" ]] \
      || die "Offline database not found. Consider creating it with live_checkpoint.sh. Be aware that it locks the live system and can take a long time! Abort!"
}

# Determine the journal number recorded in the offline database by dumping
# its db.counters table and extracting the "journal" counter.
# Globals read:    P4DBIN OFFLINE_DB LOGFILE
# Globals written: OFFLINEJNLNUM
get_offline_journal_num () {
   # Get the journal number of the offline database
   check_offline_db_usable
   local counters_dump
   # Run the dump on its own so that its exit status decides whether we die
   # and its stderr lands in the log. Inside a pipeline only the status and
   # stderr of the last stage (cut) were visible.
   counters_dump=$($P4DBIN -r "$OFFLINE_DB" -jd - db.counters 2>> "$LOGFILE") || die "Cannot get the offline journal number. Abort!"
   OFFLINEJNLNUM=$(grep '@journal@' <<< "$counters_dump" | cut -d "@" -f 8)
   check_journalnum "$OFFLINEJNLNUM"
   log "Offline journal number is: $OFFLINEJNLNUM"
}

# Delete the oldest checkpoint and rotated-journal files from $CHECKPOINTS,
# ranked by modification time (newest first).
# Globals read: KEEPCKPS KEEPJNLS CHECKPOINTS P4SERVER
# KEEPCKPS=0 skips all cleanup. KEEPJNLS=0 skips journal cleanup only.
remove_old_checkpoints_and_journals () {
   local old_file
   if [[ $KEEPCKPS -eq 0 ]]; then
      log "Skipping cleanup of old checkpoints because KEEPCKPS is set to 0."
      return 0
   fi

   log "Deleting obsolete checkpoints and journals. Keeping latest $KEEPCKPS  per KEEPCKPS setting in p4_vars."

   # Remove selected checkpoint and journal files based on the KEEPCKPS setting regardless of whether compressed or not.
   # We multiply KEEPCKP by 2 for the ckp files because of the md5 files.
   # Names are read one per line so that a name is never split on whitespace;
   # -d keeps a matching directory from being expanded into its contents.
   while IFS= read -r old_file; do
      [[ -n "$old_file" ]] || continue
      log "rm -f $old_file"
      rm -f -- "$old_file"
   done < <(ls -td "${CHECKPOINTS}/${P4SERVER}".ckp.* 2>/dev/null | awk -v keep="$((KEEPCKPS * 2))" 'NR > keep')

   # Use KEEPJNLS to allow for separate journal rotation at a higher frequency.
   # A value of 0 must not reach the filter below: "NR > 0" is true for every
   # line, which would delete all rotated journals.
   if [[ $KEEPJNLS -eq 0 ]]; then
      log "Skipping cleanup of old journals because KEEPJNLS is set to 0."
   else
      while IFS= read -r old_file; do
         [[ -n "$old_file" ]] || continue
         log "rm -f $old_file"
         rm -f -- "$old_file"
      done < <(ls -td "${CHECKPOINTS}/${P4SERVER}".jnl.* 2>/dev/null | awk -v keep="$KEEPJNLS" 'NR > keep')
   fi
}

# Ask the init script to stop the server, then poll the process table every
# 5 seconds until no process line matching p4d_$P4INSTANCE remains.
# Globals read:    RC LOGFILE P4INSTANCE
# Globals written: COUNTER
# NOTE(review): this matches on P4INSTANCE while other functions in this file
# use SDP_INSTANCE - confirm both are exported by the calling environment.
stop_p4d () {
   log "Shutting down the p4 server"
   $RC stop >> "$LOGFILE" 2>&1
   declare -i WAITCOUNT=0
   COUNTER=$( ps -ef | grep -i p4d_$P4INSTANCE | grep -v grep | wc -l )
   until [[ $COUNTER == "0" ]]; do
      sleep 5
      COUNTER=$( ps -ef | grep -i p4d_$P4INSTANCE | grep -v grep | wc -l )
      WAITCOUNT+=1
      # More than 120 polls of 5 seconds each is 10 minutes. If p4d is still
      # there by then it is not shutting down on its own, so kill it.
      if [[ $WAITCOUNT -gt 120 ]]; then
         ps -ef | grep -i p4d_$P4INSTANCE | awk '{print $2}' | xargs kill > /dev/null 2>&1
      fi
   done
   log "p4 stop finished -- p4 should be down now."
}

# Start the server through the init script, wait briefly, and log whether a
# "p4 info" against it succeeds. A failed check is only logged.
# Globals read: RC LOGFILE P4BIN P4USER P4PORT
start_p4d () {
   local outcome="Error: Server does not appear to have started."
   log "Starting the p4 server"
   $RC start >> "$LOGFILE" 2>&1
   sleep 3 # Give it a few seconds to start up
   # A successful 'info' call is taken as confirmation that it started.
   if $P4BIN -u $P4USER -p $P4PORT info >/dev/null 2>&1; then
      outcome="Server restarted successfully - p4 should be back up now."
   fi
   log "$outcome"
}

# Rotate the journal through the running server ("p4 admin journal") unless
# this is an edge server, then wait for the rotated journal file to appear.
# Dies if the next checkpoint or the rotated journal file already exists.
# Globals read: CHECKPOINTS P4SERVER CHECKPOINTNUM JOURNALNUM EDGESERVER
#               P4CBIN P4BIN P4MASTERPORT LOGFILE
truncate_journal () {
   local ckp_file="${CHECKPOINTS}/${P4SERVER}.ckp.${CHECKPOINTNUM}.gz"
   if [[ -f "$ckp_file" ]]; then
      die "Checkpoint ${ckp_file} already exists, check the backup process."
   fi

   # Nothing further to do unless EDGESERVER is 0.
   if [[ $EDGESERVER -ne 0 ]]; then
      return 0
   fi

   local jnl_file="${CHECKPOINTS}/${P4SERVER}.jnl.${JOURNALNUM}"
   if [[ -f "$jnl_file" ]]; then
      die "Journal ${jnl_file} already exists, check the backup process."
   fi

   log "Truncating journal..."
   # Log in against the master port before asking it to rotate.
   $P4CBIN/p4login -p $P4MASTERPORT
   # The braces let the output of 'time' be captured in the log as well.
   if ! { time $P4BIN -p $P4MASTERPORT admin journal ${CHECKPOINTS}/${P4SERVER}; } >> "$LOGFILE" 2>&1; then
      die "Journal rotation failed. Abort!"
   fi

   # Poll every 5 seconds until the rotated journal exists in the
   # checkpoints directory. The first check happens after the first sleep.
   while true; do
      sleep 5
      if [[ -f "$jnl_file" ]]; then
         break
      fi
   done
   $P4CBIN/p4login
}

# Rotate the journal by running p4d directly with -jj unless this is an edge
# server. If the rotation fails, the server is started again before dying.
# Dies if the next checkpoint or the rotated journal file already exists.
# Globals read: CHECKPOINTS P4SERVER CHECKPOINTNUM JOURNALNUM EDGESERVER
#               P4DBIN P4ROOT P4JOURNAL LOGFILE
p4d_truncate_journal () {
   local ckp_file="${CHECKPOINTS}/${P4SERVER}.ckp.${CHECKPOINTNUM}.gz"
   if [[ -f "$ckp_file" ]]; then
      die "Checkpoint ${ckp_file} already exists, check the backup process."
   fi

   # Nothing further to do unless EDGESERVER is 0.
   if [[ $EDGESERVER -ne 0 ]]; then
      return 0
   fi

   local jnl_file="${CHECKPOINTS}/${P4SERVER}.jnl.${JOURNALNUM}"
   if [[ -f "$jnl_file" ]]; then
      die "Journal ${jnl_file} already exists, check the backup process."
   fi

   log "Truncating journal..."
   # The braces let the output of 'time' be captured in the log as well.
   if ! { time $P4DBIN -r $P4ROOT -J $P4JOURNAL -jj ${CHECKPOINTS}/${P4SERVER}; } >> "$LOGFILE" 2>&1; then
      start_p4d
      die "Journal rotation failed. Abort!"
   fi
}

# Replay each rotated journal numbered OFFLINEJNLNUM through JOURNALNUM
# (inclusive) into the offline database, dying on the first failure.
# Globals read:    OFFLINEJNLNUM JOURNALNUM P4DBIN OFFLINE_DB CHECKPOINTS
#                  P4SERVER LOGFILE
# Globals written: j (loop counter)
replay_journals_to_offline_db () {
   log "Replay any unreplayed journals to the offline database"
   j=$(( $OFFLINEJNLNUM ))
   while (( j <= $JOURNALNUM )); do
      log "Replay journal ${P4SERVER}.jnl.${j} to offline db."
      # The braces let the output of 'time' be captured in the log as well.
      if ! { time $P4DBIN -r $OFFLINE_DB -jr -f ${CHECKPOINTS}/${P4SERVER}.jnl.${j}; } >> "$LOGFILE" 2>&1; then
         die "Offline journal replay failed. Abort!"
      fi
      j=$(( j + 1 ))
   done
}

# Replay the live journal file ($P4JOURNAL) into the offline database,
# dying if p4d reports failure.
# Globals read: P4DBIN OFFLINE_DB P4JOURNAL LOGFILE
replay_active_journal_to_offline_db () {
   log "Replay active journal to offline db."
   # The braces let the output of 'time' be captured in the log as well.
   if ! { time $P4DBIN -r $OFFLINE_DB -jr -f ${P4JOURNAL}; } >> "$LOGFILE" 2>&1; then
      die "Active Journal replay failed. Abort!"
   fi
}

# Rebuild the offline database from the most recently modified compressed
# checkpoint in $CHECKPOINTS.
# Globals read:    CHECKPOINTS P4SERVER OFFLINE_DB P4DBIN LOGFILE
# Globals written: LASTCKP
# Dies (after calling ckp_complete) when no checkpoint exists, and dies when
# the restore itself fails.
recreate_offline_db_files () {
   # Locate the checkpoint BEFORE touching the offline db. A glob is not
   # expanded inside [[ -f ... ]], so the existence test has to go through a
   # real expansion, and it must trigger when nothing is found.
   LASTCKP=$(ls -t "${CHECKPOINTS}/${P4SERVER}".ckp.*.gz 2>/dev/null | head -1)
   if [[ -z "$LASTCKP" ]]; then
      ckp_complete
      die "No checkpoints found - run live_checkpoint.sh"
   fi

   rm -f "${OFFLINE_DB}/offline_db_usable.txt"
   # ${OFFLINE_DB:?} aborts rather than expanding to /db.* if the variable
   # is ever empty.
   rm -f "${OFFLINE_DB:?}"/db.* >> "$LOGFILE" 2>&1
   log "Recovering from $LASTCKP"
   # curly braces are necessary to capture the output of 'time'
   { time $P4DBIN -r "$OFFLINE_DB" -jr -z "${LASTCKP}"; } >> "$LOGFILE" 2>&1 || { die "Restore of checkpoint to $OFFLINE_DB failed!"; }
   echo "Offline db file restored successfully." > "${OFFLINE_DB}/offline_db_usable.txt"
}

# Take a checkpoint of the live database ($P4ROOT) with "p4d -jc -Z", using
# ${CHECKPOINTS}/${P4SERVER} as the prefix; dies if p4d reports failure.
# Globals read: P4DBIN P4ROOT CHECKPOINTS P4SERVER LOGFILE
checkpoint () {
   log "Create a new checkpoint from the live db files."
   # The braces let the output of 'time' be captured in the log as well.
   if ! { time $P4DBIN -r $P4ROOT -jc -Z ${CHECKPOINTS}/${P4SERVER}; } >>"$LOGFILE" 2>&1; then
      die "ERROR - New checkpoint failed!"
   fi
}

# Dump the database in $ROOTDIR to a compressed checkpoint file named
# ${CHECKPOINTS}/${P4SERVER}.ckp.${CHECKPOINTNUM}.gz; dies if p4d reports
# failure.
# Globals read: ROOTDIR P4DBIN CHECKPOINTS P4SERVER CHECKPOINTNUM LOGFILE
dump_checkpoint () {
   log "Dump out new checkpoint from db files in $ROOTDIR."
   # The braces let the output of 'time' be captured in the log as well.
   if ! { time $P4DBIN -r $ROOTDIR -jd -z ${CHECKPOINTS}/${P4SERVER}.ckp.${CHECKPOINTNUM}.gz; } >> "$LOGFILE" 2>&1; then
      die "New checkpoint dump failed!"
   fi
}

# Die unless the journal counter stored in the offline database equals the
# one stored in the live database under $P4ROOT.
# Globals read: OFFLINE_DB P4ROOT P4DBIN LOGFILE
compare_journal_numbers () {
   # Declared separately from the assignments below: 'local var=$(cmd)'
   # returns the status of 'local', so a trailing '|| die' never fires.
   local _OFFLINEJNLNUM
   local _JNLNUM
   local counters_dump

   # Get the journal number of the offline database
   check_offline_db_usable
   # The dump runs on its own so its exit status and stderr are not hidden
   # behind the later stages of a pipeline.
   counters_dump=$($P4DBIN -r "$OFFLINE_DB" -jd - db.counters 2>> "$LOGFILE") || die "Cannot get $OFFLINE_DB journal number. Abort!"
   _OFFLINEJNLNUM=$(grep '@journal@' <<< "$counters_dump" | cut -d "@" -f 8)
   check_journalnum "$_OFFLINEJNLNUM"

   # Get the journal number of the root database
   if [[ ! -f "$P4ROOT/db.counters" ]]; then
      die "$P4ROOT database not found. Something is seriously wrong since the server was just running a minute ago! Contact support@perforce.com"
   fi
   counters_dump=$($P4DBIN -r "$P4ROOT" -jd - db.counters 2>> "$LOGFILE") || die "Cannot get $P4ROOT journal number. Abort!"
   _JNLNUM=$(grep '@journal@' <<< "$counters_dump" | cut -d "@" -f 8)
   check_journalnum "$_JNLNUM"

   if [[ $_JNLNUM -ne $_OFFLINEJNLNUM ]]; then
      log "$P4ROOT journal number is: $_JNLNUM"
      log "$OFFLINE_DB journal number is: $_OFFLINEJNLNUM"
      die "$P4ROOT and $OFFLINE_DB numbers do not match."
   fi
}

# Swap the offline database into $P4ROOT: the current db.* files in $P4ROOT
# are moved to $SAVEDIR (replacing any db.* saved there earlier) and the
# offline db.* files are moved into $P4ROOT.
# Globals read: SAVEDIR P4ROOT OFFLINE_DB LOGFILE
# Dies if the journal numbers differ or the final move fails.
switch_db_files () {
   # Compare the Offline and Master journal numbers before switching to make sure they match.
   compare_journal_numbers
   log "Switching out db files..."
   [[ -d "$SAVEDIR" ]] || mkdir -p "$SAVEDIR"
   # ${SAVEDIR:?} aborts rather than expanding to /db.* if the variable is
   # ever empty. All paths are quoted so whitespace cannot split them.
   rm -f "${SAVEDIR:?}"/db.* >> "$LOGFILE" 2>&1
   mv "$P4ROOT"/db.* "$SAVEDIR" >> "$LOGFILE" 2>&1
   # The offline db stops being usable as soon as its files start moving.
   rm -f "${OFFLINE_DB}/offline_db_usable.txt"
   mv "$OFFLINE_DB"/db.* "$P4ROOT" >> "$LOGFILE" 2>&1 || die "Move of offline db file to $P4ROOT failed."
}

# Rename a log file to "<name>.<LOGID>" and optionally gzip the result.
# Arguments: $1 - log file to rotate; nothing happens if it does not exist.
#            $2 - optional; any non-empty value requests gzip compression.
# Globals read: LOGID LOGFILE
# Returns: 0 when there was nothing to do or the rotation succeeded,
#          non-zero when mv or gzip failed.
rotate_log_file () {
   local rotate_logname=$1
   local gz_ext=${2:-}
   local rotated

   [[ -f "$rotate_logname" ]] || return 0

   rotated="${rotate_logname}.${LOGID}"
   mv -f -- "$rotate_logname" "$rotated" >> "$LOGFILE" 2>&1 || return 1
   # An explicit 'if' keeps the status at 0 when no compression was
   # requested; a trailing '[[ ... ]] && gzip' left it at 1.
   if [[ -n "$gz_ext" ]]; then
      gzip -- "$rotated" >> "$LOGFILE" 2>&1
   fi
}

# Rotate the logs left behind by the previous run. Intended to be called at
# the start of live_checkpoint.sh, daily_checkpoint.sh and
# recreate_db_checkpoint.sh, before *any* logging activity occurs.
# Works inside $LOGS and returns to the previous directory afterwards.
# Globals read:    LOGS JOURNALNUM LOGFILE
# Globals written: LOGID ("<JOURNALNUM>.<YYYY-MM-DD_HH-MM-SS>")
rotate_last_run_logs () {
   local prior_log
   cd "$LOGS"
   LOGID=${JOURNALNUM}.$(date +'%Y-%m-%d_%H-%M-%S')

   # The current script's own prior log is rotated without compression.
   rotate_log_file $LOGFILE

   # The prior server, broker and audit logs are rotated and compressed.
   for prior_log in "log" "p4broker.log" "audit.log"; do
      rotate_log_file "$prior_log" ".gz"
   done

   cd - > /dev/null
}

# Delete all but the newest files whose names start with a given prefix,
# ranked by modification time.
# Arguments: $1 - name prefix (every path matching "<prefix>*" is a candidate)
#            $2 - number of newest matches to keep
remove_log_files () {
   local name_prefix=$1
   local keep_count=$2
   local old_log

   # Names are read one per line so that whitespace inside a name cannot split
   # it; -d keeps a matching directory from being expanded into its contents.
   while IFS= read -r old_log; do
      [[ -n "$old_log" ]] || continue
      log "rm -f $old_log"
      rm -f -- "$old_log"
   done < <(ls -td "${name_prefix}"* 2>/dev/null | awk -v keep="$keep_count" 'NR > keep')
}

# Prune old checkpoint logs and server-side logs in $LOGS.
# Globals read: LOGS KEEPJNLS KEEPLOGS
# A KEEP value of 0 disables the corresponding cleanup.
remove_old_logs () {
   local server_log

   # The deletions below use relative name prefixes, so they must never run
   # from any directory other than $LOGS.
   cd "$LOGS" || die "Cannot change directory to $LOGS to remove old logs. Abort!"

   # Remove old Checkpoint Logs
   # Use KEEPJNLS rather than KEEPLOGS, so we keep the same number
   # of checkpoint logs as we keep checkpoints.
   if [[ $KEEPJNLS -eq 0 ]]; then
      log "Skipping cleanup of old checkpoint logs because KEEPJNLS is set to 0."
   else
      log "Deleting old checkpoint logs.  Keeping latest $KEEPJNLS, per KEEPJNLS setting in p4_vars."
      remove_log_files "checkpoint.log" "$KEEPJNLS"
   fi

   if [[ $KEEPLOGS -eq 0 ]]; then
      log "Skipping cleanup of old server logs because KEEPLOGS is set to 0."
   else
      log "Deleting old server logs.  Keeping latest $KEEPLOGS, per KEEPLOGS setting in p4_vars."
      for server_log in "log" "p4broker.log" "audit.log" "sync_replica.log" \
         "recreate_offline_db.log" "upgrade.log"; do
         remove_log_files "$server_log" "$KEEPLOGS"
      done
   fi
   cd - > /dev/null
}

# Log in, then set the server counter "lastSDPCheckpoint" to the current time
# formatted as "<epoch seconds> (<YYYY/MM/DD HH:MM:SS zone-offset zone-name>)".
# Globals read: P4CBIN P4BIN P4USER P4PORT
set_counter() {
   local stamp
   $P4CBIN/p4login
   stamp=$(date +'%s (%Y/%m/%d %H:%M:%S %z %Z)')
   $P4BIN -u $P4USER -p $P4PORT counter lastSDPCheckpoint "$stamp" > /dev/null
}

