#!/bin/bash
#==============================================================================
# Copyright and license info is available in the LICENSE file included with
# the Server Deployment Package (SDP), and also available online:
# https://swarm.workshop.perforce.com/projects/perforce-software-sdp/view/main/LICENSE
#------------------------------------------------------------------------------
set -u

# Common functions used in all backup scripts.
#
# This file only defines functions; it is meant to be sourced by the backup
# scripts after the SDP environment has been loaded.
#
# Quoting convention used below: variables that hold a *command* ($P4BIN,
# $P4DBIN, $RC, $SDPMAIL) and variables that may hold several words
# ($MAILTO, $mail_sender_opt) are deliberately left unquoted so they keep
# splitting into words exactly as before. Everything else is quoted.

#------------------------------------------------------------------------------
# check_vars
# Verify that every externally supplied variable this library relies on is
# set and non-empty. Prints a hint and exits 1 otherwise.
# Indirect expansion with a default is used so that 'set -u' cannot abort the
# shell with "unbound variable" before the intended message is printed.
#------------------------------------------------------------------------------
check_vars () {
   local var
   for var in SDP_INSTANCE P4HOME P4PORT P4ROOT P4JOURNAL P4BIN P4DBIN \
              P4TICKETS KEEPCKPS KEEPLOGS CHECKPOINTS LOGS OSUSER; do
      if [[ -z "${!var:-}" ]]; then
         echo "Use p4master_run when calling this script."
         echo "Required external variable not set. Abort!"
         exit 1
      fi
   done
}

#------------------------------------------------------------------------------
# set_vars
# Globals written: RC, OFFLINE_DB, SAVEDIR, EDGESERVER.
# Logs in to the server and sets EDGESERVER=1 when the server spec for
# $SERVERID has a Services line containing "edge-server".
#------------------------------------------------------------------------------
set_vars () {
   RC=/etc/init.d/p4d_${SDP_INSTANCE}_init
   [[ -f "$RC" ]] || RC=/p4/$SDP_INSTANCE/bin/p4d_${SDP_INSTANCE}_init
   OFFLINE_DB=${P4HOME}/offline_db
   SAVEDIR=${P4ROOT}/save
   EDGESERVER=0
   $P4BIN -u "$P4USER" -p "$P4PORT" login < /p4/common/bin/adminpass > /dev/null
   # Test the pipeline status directly instead of inspecting $? afterwards.
   if $P4BIN -u "$P4USER" -p "$P4PORT" server -o "$SERVERID" | grep '^Services' | grep "edge-server" > /dev/null; then
      EDGESERVER=1
   fi
}

#------------------------------------------------------------------------------
# check_uid
# Die unless the current OS user is exactly $OSUSER.
#------------------------------------------------------------------------------
check_uid () {
   local user
   user=$(id -un)
   # The right-hand side is quoted so it is compared literally, not as a glob.
   if [[ "${user}" != "${OSUSER}" ]]; then
      die "Must be run by user: ${OSUSER}. Abort!"
   fi
}

#------------------------------------------------------------------------------
# log <message...>
# Append "<date> <script>: <message>" to $LOGFILE. All arguments are joined
# with single spaces into one message.
#------------------------------------------------------------------------------
log () {
   # Redirect stdout first, then stderr, so both really end up in the log.
   printf '%s %s: %s\n' "$(date)" "$0" "$*" >> "$LOGFILE" 2>&1
}

#------------------------------------------------------------------------------
# get_mail_sender_opt
# Decide, depending on our mail utility, how to specify the sender (if we
# need to). Prints the option string on stdout; prints an empty line when
# MAILFROM is unset or empty.
#------------------------------------------------------------------------------
get_mail_sender_opt () {
   local mail_sender_opt=""
   local mail_ver=""
   if [[ -n "${MAILFROM:-}" ]]; then
      # Default for CentOS/RHEL, but allow GNU Mailutils alternative flag instead
      mail_sender_opt="-S from=$MAILFROM"
      mail_ver=$($SDPMAIL -V)
      if [[ "$mail_ver" =~ "GNU Mailutils" ]]; then
         mail_sender_opt="-aFrom:$MAILFROM"
      fi
   fi
   echo "$mail_sender_opt"
}

#------------------------------------------------------------------------------
# mail_log_file <subject>
# Mail the contents of $LOGFILE to $MAILTO using $SDPMAIL.
#------------------------------------------------------------------------------
mail_log_file () {
   local subject=$1
   local mail_sender_opt
   mail_sender_opt=$(get_mail_sender_opt)
   $SDPMAIL -s "$subject" $mail_sender_opt $MAILTO < "$LOGFILE"
}

#------------------------------------------------------------------------------
# die <message...>
# Log the error, mail the log file with a more helpful subject line than
# cron would give, echo the message to stderr when attached to a terminal,
# then exit 1.
#------------------------------------------------------------------------------
die () {
   # "$*" keeps a multi-argument message in ONE word, so the whole text
   # reaches mail_log_file as its single subject argument.
   local msg="ERROR!!! - $HOSTNAME $P4SERVER $0: $*"
   log "$msg"
   mail_log_file "$msg"
   # if running from terminal, also send to stderr
   if tty > /dev/null; then
      printf '%s\n' "$*" >&2
   fi
   exit 1
}

#------------------------------------------------------------------------------
# ckp_running
# Create the semaphore file ${LOGS}/ckp_running.txt; die if it already
# exists (a previous checkpoint did not complete).
#------------------------------------------------------------------------------
ckp_running() {
   if [[ -f "${LOGS}/ckp_running.txt" ]]; then
      die "Last checkpoint not complete. Check the backup process or contact support."
   fi
   echo "Checkpoint running." > "${LOGS}/ckp_running.txt"
}

#------------------------------------------------------------------------------
# ckp_complete
# Remove the semaphore file created by ckp_running.
#------------------------------------------------------------------------------
ckp_complete() {
   rm -f "${LOGS}/ckp_running.txt"
}

#------------------------------------------------------------------------------
# checkdir <dir>
# Return 0 if <dir> is writable, otherwise die.
#------------------------------------------------------------------------------
checkdir () {
   local dir=$1
   [[ -w "$dir" ]] && return
   die "$dir is not writable. Abort!"
}

#------------------------------------------------------------------------------
# check_dirs
# Check that key dirs are writable.
#------------------------------------------------------------------------------
check_dirs () {
   local dir
   for dir in "$OFFLINE_DB" "$CHECKPOINTS" "$LOGS"; do
      checkdir "$dir"    # aborts on failure.
   done
}

#------------------------------------------------------------------------------
# check_disk_space
# Add the output of 'p4 diskspace' to the log file.
#------------------------------------------------------------------------------
check_disk_space () {
   log "Checking disk space..."
   $P4BIN diskspace >> "$LOGFILE" 2>&1
}

#------------------------------------------------------------------------------
# check_journalnum <number>
# Die unless <number> consists only of decimal digits. A missing or empty
# argument is treated as non-numeric rather than tripping 'set -u'.
#------------------------------------------------------------------------------
check_journalnum () {
   local JNLNUM=${1:-}
   local re='^[0-9]+$'
   if ! [[ $JNLNUM =~ $re ]]; then
      die "Journal number must be numeric."
   fi
}

#------------------------------------------------------------------------------
# get_journalnum
# Get the current journal and checkpoint serial numbers.
# Globals written: JOURNALNUM, CHECKPOINTNUM.
#------------------------------------------------------------------------------
get_journalnum () {
   JOURNALNUM=$($P4BIN -u "$P4USER" -p "$P4PORT" counter journal 2>> "$LOGFILE") || die "Cannot get the checkpoint number. Abort!"
   check_journalnum "$JOURNALNUM"
   # If we are on an edge server, the journal has already rotated, so we have
   # to decrement the value so that we replay the correct journal file and
   # create the correct checkpoint number on the edge server.
   if [[ $EDGESERVER -eq 1 ]]; then
      JOURNALNUM=$((JOURNALNUM - 1))
   fi
   CHECKPOINTNUM=$((JOURNALNUM + 1))
}

#------------------------------------------------------------------------------
# _read_db_journal_num <db_dir>
# Private helper: dump db.counters from <db_dir> with 'p4d -jd -' and print
# field 8 (split on '@') of the line containing '@journal@'.
# stderr of p4d is appended to $LOGFILE. The return status is that of the
# last pipeline stage, so callers must still validate the printed value.
#------------------------------------------------------------------------------
_read_db_journal_num () {
   local db_dir=$1
   $P4DBIN -r "$db_dir" -jd - db.counters 2>> "$LOGFILE" | grep '@journal@' | cut -d "@" -f 8
}

#------------------------------------------------------------------------------
# get_offline_journal_num
# Get the journal number of the offline database.
# Globals written: OFFLINEJNLNUM.
#------------------------------------------------------------------------------
get_offline_journal_num () {
   if [[ ! -f "$OFFLINE_DB/offline_db_usable.txt" ]]; then
      die "Offline database not in a usable state. Check the backup process."
   fi
   if [[ ! -f "$OFFLINE_DB/db.counters" ]]; then
      die "Offline database not found. Consider creating it with live_checkpoint.sh. Be aware that it locks the live system and can take a long time! Abort!"
   fi
   OFFLINEJNLNUM=$(_read_db_journal_num "$OFFLINE_DB") || die "Cannot get the offline journal number. Abort!"
   check_journalnum "$OFFLINEJNLNUM"
   log "Offline journal number is: $OFFLINEJNLNUM"
}

#------------------------------------------------------------------------------
# remove_old_checkpoints_and_journals
# Remove checkpoint and journal files beyond the newest $KEEPCKPS, regardless
# of whether they are compressed. Does nothing when KEEPCKPS is 0.
#------------------------------------------------------------------------------
remove_old_checkpoints_and_journals () {
   if [[ $KEEPCKPS -eq 0 ]]; then
      log "Skipping cleanup of old checkpoints because KEEPCKPS is set to 0."
   else
      log "Deleting obsolete checkpoints and journals. Keeping latest $KEEPCKPS per KEEPCKPS setting in p4_vars."
      # We multiply KEEPCKPS by 2 for the ckp files because of the md5 files.
      while IFS= read -r I_LOGFILE; do
         log "rm -f $I_LOGFILE"
         rm -f "$I_LOGFILE"
      done < <(ls -t "${CHECKPOINTS}/${P4SERVER}".ckp.* 2>/dev/null | awk -v keep="$KEEPCKPS" 'NR > keep * 2')
      while IFS= read -r I_LOGFILE; do
         log "rm -f $I_LOGFILE"
         rm -f "$I_LOGFILE"
      done < <(ls -t "${CHECKPOINTS}/${P4SERVER}".jnl.* 2>/dev/null | awk -v keep="$KEEPCKPS" 'NR > keep')
   fi
}

#------------------------------------------------------------------------------
# stop_p4d
# Stop the server through the init script and wait until no matching p4d
# process is left, polling every 5 seconds.
#------------------------------------------------------------------------------
stop_p4d () {
   # NOTE(review): every other function in this file keys off SDP_INSTANCE.
   # P4INSTANCE is honored when set; otherwise fall back to SDP_INSTANCE so
   # that 'set -u' does not abort here -- confirm which name is intended.
   local instance=${P4INSTANCE:-$SDP_INSTANCE}
   log "Shutting down the p4 server"
   $RC stop >> "$LOGFILE" 2>&1
   COUNTER=$(ps -ef | grep -i "p4d_$instance" | grep -v grep | wc -l)
   declare -i WAITCOUNT=0
   # Arithmetic comparison tolerates any whitespace padding in wc output.
   while (( COUNTER != 0 )); do
      sleep 5
      COUNTER=$(ps -ef | grep -i "p4d_$instance" | grep -v grep | wc -l)
      WAITCOUNT=$(( WAITCOUNT + 1 ))
      # The WAITCOUNT value below is 120 * 5 seconds = 10 minutes.
      # If p4d hasn't shut down by then, something is not shutting down on
      # its own, so we kill it.
      if (( WAITCOUNT > 120 )); then
         ps -ef | grep -i "p4d_$instance" | grep -v grep | awk '{print $2}' | xargs kill > /dev/null 2>&1
      fi
   done
   log "p4 stop finished -- p4 should be down now."
}

#------------------------------------------------------------------------------
# start_p4d
# Start the server through the init script and log whether 'p4 info'
# succeeds afterwards. Does not abort on failure.
#------------------------------------------------------------------------------
start_p4d () {
   log "Starting the p4 server"
   $RC start >> "$LOGFILE" 2>&1
   sleep 3 # Give it a few seconds to start up
   # Confirm that it started - success below means it did
   if $P4BIN -u "$P4USER" -p "$P4PORT" info > /dev/null 2>&1; then
      log "Server restarted successfully - p4 should be back up now."
   else
      log "Error: Server does not appear to have started."
   fi
}

#------------------------------------------------------------------------------
# truncate_journal
# Die if the target checkpoint already exists. On a non-edge server, also die
# if the target journal exists, then rotate the journal with 'p4d -jj'.
# On rotation failure the server is started again before dying.
#------------------------------------------------------------------------------
truncate_journal () {
   [[ -f "${CHECKPOINTS}/${P4SERVER}.ckp.${CHECKPOINTNUM}.gz" ]] && die "Checkpoint ${CHECKPOINTS}/${P4SERVER}.ckp.${CHECKPOINTNUM}.gz already exists, check the backup process."
   if [[ $EDGESERVER -eq 0 ]]; then
      [[ -f "${CHECKPOINTS}/${P4SERVER}.jnl.${JOURNALNUM}" ]] && die "Journal ${CHECKPOINTS}/${P4SERVER}.jnl.${JOURNALNUM} already exists, check the backup process."
      log "Truncating journal..."
      # 'p4d -jj' does a copy-then-delete, instead of a simple mv.
      # during 'p4d -jj' the perforce server will hang the responses to clients.
      # curly braces are necessary to capture the output of 'time'
      { time $P4DBIN -r "$P4ROOT" -J "$P4JOURNAL" -jj "${CHECKPOINTS}/${P4SERVER}"; } >> "$LOGFILE" 2>&1 || { start_p4d; die "Journal rotation failed. Abort!"; }
   fi
}

#------------------------------------------------------------------------------
# replay_journals_to_offline_db
# Replay journals numbered OFFLINEJNLNUM..JOURNALNUM into the offline db.
#------------------------------------------------------------------------------
replay_journals_to_offline_db () {
   local j
   log "Replay any unreplayed journals to the offline database"
   for (( j=OFFLINEJNLNUM; j <= JOURNALNUM; j++ )); do
      log "Replay journal ${P4SERVER}.jnl.${j} to offline db."
      # curly braces are necessary to capture the output of 'time'
      { time $P4DBIN -r "$OFFLINE_DB" -jr -f "${CHECKPOINTS}/${P4SERVER}.jnl.${j}"; } >> "$LOGFILE" 2>&1 || { die "Offline journal replay failed. Abort!"; }
   done
}

#------------------------------------------------------------------------------
# replay_active_journal_to_offline_db
# Replay the live journal ($P4JOURNAL) into the offline db.
#------------------------------------------------------------------------------
replay_active_journal_to_offline_db () {
   log "Replay active journal to offline db."
   # curly braces are necessary to capture the output of 'time'
   { time $P4DBIN -r "$OFFLINE_DB" -jr -f "${P4JOURNAL}"; } >> "$LOGFILE" 2>&1 || { die "Active Journal replay failed. Abort!"; }
}

#------------------------------------------------------------------------------
# recreate_offline_db_files
# Rebuild the offline db from the most recent compressed checkpoint.
# Globals written: LASTCKP.
# If no checkpoint exists, the checkpoint semaphore is cleared and we die.
#------------------------------------------------------------------------------
recreate_offline_db_files () {
   local ckp
   local found=0
   # [[ -f ]] does not expand globs, so look at the expanded matches one by
   # one; an unmatched pattern stays literal and fails the -f test.
   for ckp in "${CHECKPOINTS}/${P4SERVER}".ckp.*.gz; do
      if [[ -f "$ckp" ]]; then
         found=1
         break
      fi
   done
   if (( found == 0 )); then
      ckp_complete
      die "No checkpoints found - run live_checkpoint.sh"
   fi
   rm -f "${OFFLINE_DB}/offline_db_usable.txt"
   rm -f "${OFFLINE_DB}"/db.* >> "$LOGFILE"
   LASTCKP=$(ls -t "${CHECKPOINTS}/${P4SERVER}".ckp.*.gz | head -1)
   log "Recovering from $LASTCKP"
   # curly braces are necessary to capture the output of 'time'
   { time $P4DBIN -r "$OFFLINE_DB" -jr -z "${LASTCKP}"; } >> "$LOGFILE" 2>&1 || { die "Restore of checkpoint to $OFFLINE_DB failed!"; }
   echo "Offline db file restored successfully." > "${OFFLINE_DB}/offline_db_usable.txt"
}

#------------------------------------------------------------------------------
# checkpoint
# Create a new compressed checkpoint from the live db files in $P4ROOT.
#------------------------------------------------------------------------------
checkpoint () {
   log "Create a new checkpoint from the live db files."
   # curly braces are necessary to capture the output of 'time'
   { time $P4DBIN -r "$P4ROOT" -jc -Z "${CHECKPOINTS}/${P4SERVER}"; } >> "$LOGFILE" 2>&1 || { die "ERROR - New checkpoint failed!"; }
}

#------------------------------------------------------------------------------
# dump_checkpoint
# Dump a compressed checkpoint numbered $CHECKPOINTNUM from the db files in
# $ROOTDIR (set by the caller).
#------------------------------------------------------------------------------
dump_checkpoint () {
   log "Dump out new checkpoint from db files in $ROOTDIR."
   # curly braces are necessary to capture the output of 'time'
   { time $P4DBIN -r "$ROOTDIR" -jd -z "${CHECKPOINTS}/${P4SERVER}.ckp.${CHECKPOINTNUM}.gz"; } >> "$LOGFILE" 2>&1 || { die "New checkpoint dump failed!"; }
}

#------------------------------------------------------------------------------
# compare_journal_numbers
# Die unless the journal numbers recorded in $OFFLINE_DB and $P4ROOT match.
#------------------------------------------------------------------------------
compare_journal_numbers () {
   # Declared separately from the assignments so that 'local' does not mask
   # the status of the command substitution.
   local _OFFLINEJNLNUM
   local _JNLNUM
   # Get the journal number of the offline database
   if [[ ! -f "$OFFLINE_DB/offline_db_usable.txt" ]]; then
      die "Offline database not in a usable state. Check the backup process."
   fi
   if [[ ! -f "$OFFLINE_DB/db.counters" ]]; then
      die "Offline database not found. Consider creating it with live_checkpoint.sh. Be aware that it locks the live system and can take a long time! Abort!"
   fi
   _OFFLINEJNLNUM=$(_read_db_journal_num "$OFFLINE_DB") || die "Cannot get $OFFLINE_DB journal number. Abort!"
   check_journalnum "$_OFFLINEJNLNUM"
   # Get the journal number of the root database
   if [[ ! -f "$P4ROOT/db.counters" ]]; then
      die "$P4ROOT database not found. Something is seriously wrong since the server was just running a minute ago! Contact support@perforce.com"
   fi
   _JNLNUM=$(_read_db_journal_num "$P4ROOT") || die "Cannot get $P4ROOT journal number. Abort!"
   check_journalnum "$_JNLNUM"
   if [[ $_JNLNUM -ne $_OFFLINEJNLNUM ]]; then
      log "$P4ROOT journal number is: $_JNLNUM"
      log "$OFFLINE_DB journal number is: $_OFFLINEJNLNUM"
      die "$P4ROOT and $OFFLINE_DB numbers do not match."
   fi
}

#------------------------------------------------------------------------------
# switch_db_files
# Move the live db files to $SAVEDIR and move the offline db files into
# $P4ROOT, after verifying that both carry the same journal number.
#------------------------------------------------------------------------------
switch_db_files () {
   # Compare the Offline and Master journal numbers before switching to make
   # sure they match.
   compare_journal_numbers
   log "Switching out db files..."
   [[ -d "$SAVEDIR" ]] || mkdir -p "$SAVEDIR"
   rm -f "$SAVEDIR"/db.* >> "$LOGFILE" 2>&1
   mv "$P4ROOT"/db.* "$SAVEDIR" >> "$LOGFILE" 2>&1
   # The offline db is about to be emptied, so it is no longer usable.
   rm -f "${OFFLINE_DB}/offline_db_usable.txt"
   mv "$OFFLINE_DB"/db.* "$P4ROOT" >> "$LOGFILE" 2>&1 || die "Move of offline db file to $P4ROOT failed."
}

#------------------------------------------------------------------------------
# rotate_log_file <logname> [gz_ext]
# If <logname> exists, rename it to <logname>.$LOGID; gzip the renamed file
# when a non-empty second argument is given.
#------------------------------------------------------------------------------
rotate_log_file () {
   ROTATE_LOGNAME=$1
   GZ_EXT=${2:-}
   if [[ -f "${ROTATE_LOGNAME}" ]]; then
      mv -f "${ROTATE_LOGNAME}" "${ROTATE_LOGNAME}.${LOGID}" >> "$LOGFILE" 2>&1
      if [[ -n "$GZ_EXT" ]]; then
         gzip "${ROTATE_LOGNAME}.${LOGID}" >> "$LOGFILE" 2>&1
      fi
   fi
}

#------------------------------------------------------------------------------
# rotate_last_run_logs
# At the start of each run for live_checkpoint.sh, daily_backup.sh, and
# weekly_backup.sh, before *any* logging activity occurs, rotate the logs
# from the most recent prior run, always named "checkpoint.log" or "log".
# Globals written: LOGID.
#------------------------------------------------------------------------------
rotate_last_run_logs () {
   # Relative log names below are resolved inside $LOGS; refuse to rotate
   # files in whatever directory we happen to be in if the cd fails.
   cd "$LOGS" || die "Cannot cd to $LOGS. Abort!"
   LOGID=${JOURNALNUM}.$(date +'%Y-%m-%d_%H-%M-%S')
   # Rotate prior checkpoint.log
   rotate_log_file "$LOGFILE"
   # Rotate prior server log.
   rotate_log_file "log" ".gz"
   # Rotate prior broker log.
   rotate_log_file "p4broker.log" ".gz"
   # Rotate prior audit log.
   rotate_log_file "audit.log" ".gz"
   # Rotate prior sync_replica log.
   rotate_log_file "sync_replica.log" ".gz"
   cd - > /dev/null
}

#------------------------------------------------------------------------------
# remove_log_files <name_prefix> [keep_count]
# In the current directory, delete all but the newest <keep_count> files
# whose names start with <name_prefix>. <keep_count> defaults to $KEEPLOGS.
#------------------------------------------------------------------------------
remove_log_files () {
   REMOVE_LOGNAME=$1
   local keep_count=${2:-$KEEPLOGS}
   while IFS= read -r I_LOGFILE; do
      log "rm -f $I_LOGFILE"
      rm -f "$I_LOGFILE"
   done < <(ls -t "${REMOVE_LOGNAME}"* 2>/dev/null | awk -v keep="$keep_count" 'NR > keep')
}

#------------------------------------------------------------------------------
# remove_old_logs
# Prune old checkpoint logs (by KEEPCKPS) and old server logs (by KEEPLOGS)
# in $LOGS. A setting of 0 disables the respective cleanup.
#------------------------------------------------------------------------------
remove_old_logs () {
   cd "$LOGS" || die "Cannot cd to $LOGS. Abort!"
   # Remove old Checkpoint Logs
   # Use KEEPCKPS rather than KEEPLOGS, so we keep the same number
   # of checkpoint logs as we keep checkpoints.
   if [[ $KEEPCKPS -eq 0 ]]; then
      log "Skipping cleanup of old checkpoint logs because KEEPCKPS is set to 0."
   else
      log "Deleting old checkpoint logs. Keeping latest $KEEPCKPS, per KEEPCKPS setting in p4_vars."
      remove_log_files "checkpoint.log" "$KEEPCKPS"
   fi
   if [[ $KEEPLOGS -eq 0 ]]; then
      log "Skipping cleanup of old server logs because KEEPLOGS is set to 0."
   else
      log "Deleting old server logs. Keeping latest $KEEPLOGS, per KEEPLOGS setting in p4_vars."
      remove_log_files "log"
      remove_log_files "p4broker.log"
      remove_log_files "audit.log"
      remove_log_files "sync_replica.log"
      remove_log_files "recreate_offline_db.log"
      remove_log_files "upgrade.log"
   fi
   cd - > /dev/null
}

#------------------------------------------------------------------------------
# set_counter
# Log in and set the counter lastSDPCheckpoint to the current time as
# "<epoch seconds> (<YYYY/MM/DD HH:MM:SS zone>)".
#------------------------------------------------------------------------------
set_counter() {
   $P4BIN -u "$P4USER" -p "$P4PORT" login < /p4/common/bin/adminpass > /dev/null
   $P4BIN -u "$P4USER" -p "$P4PORT" counter lastSDPCheckpoint "$(date +'%s (%Y/%m/%d %H:%M:%S %z %Z)')" > /dev/null
}
# | Change | User | Description | Committed | |
---|---|---|---|---|---|
#1 | 19278 | trina | "Forking branch Main of perforce-software-sdp to trina-sdp." | ||
//guest/perforce_software/sdp/main/Server/Unix/p4/common/bin/backup_functions.sh | |||||
#31 | 18961 | C. Thomas Tyler | Released: SDP/MultiArch/2016.1/18958 (2016/04/08). | ||
#30 | 18619 | Russell C. Jackson (Rusty) | Updating main with current changes. | ||
#29 | 18530 | Russell C. Jackson (Rusty) | Update main from dev. | ||
#28 | 16155 | Russell C. Jackson (Rusty) |
Removed check code that probably hasn't ever worked. Deleted mirror_ldap* since that functionality is built into the server now. |
||
#27 | 15856 | C. Thomas Tyler |
Replaced the big license comment block with a shortened form referencing the LICENSE file included with the SDP package, and also by the URL for the license file in The Workshop. |
||
#26 | 15784 | Russell C. Jackson (Rusty) | Added missing bracket. | ||
#25 | 15780 | C. Thomas Tyler |
Per Robert: Style police causing problems :) Fixed! |
||
#24 | 15777 | C. Thomas Tyler |
No functional changes. Style policing on bash scripts only. Normalized indentation and line breaks, removed offending tabs, and tidied general whitespace usage. |
||
#23 | 15609 | C. Thomas Tyler | Pushing SDP 2015.1.15607 (2015/09/02). | ||
#22 | 15197 | Russell C. Jackson (Rusty) | Corrected versions from testing. | ||
#21 | 15193 | Russell C. Jackson (Rusty) |
Added semaphore file to indicate state of the offline database and added check into the backup process to fail if the state of the offline db is not good. |
||
#20 | 15190 | Russell C. Jackson (Rusty) |
Added a semaphore file to prevent the checkpoint process from running if another one hasn't finished. Added a check to make sure the journal number is numeric. |
||
#19 | 13928 | dsp |
Set lastSDPCounter after a successful SDP checkpoint. p4 admin checkpoint sets lastCheckpointAction, which is useful for monitoring, in particular when checkpoint age should be observed from the outside through p4. However, the SDP uses p4d directly to create checkpoints and will not set that counter. In order to distinguish human actions from the SDP cronjobs, set a new counter lastSDPCounter in a similar format. |
||
#18 | 13908 | C. Thomas Tyler | Pushing SDP 2015.1.13906. | ||
#17 | 12171 | Russell C. Jackson (Rusty) | Merge in changes to remove the need for p4master_run. | ||
#16 | 11950 | Russell C. Jackson (Rusty) |
Made die function record ERROR!!! $HOSTNAME and $P4SERVER in subject. Cleaned up message passed to die command and corrected a typo. |
||
#15 | 11929 | Russell C. Jackson (Rusty) | Updated die function to just pass parameter to mail_log_file instead of echo. | ||
#14 | 11919 | Russell C. Jackson (Rusty) |
Added a SERVERID variable to p4_vars and updated backup_functions to use it. Changed the location and the names of the config files so that they could live in /p4/common/config (You're welcome Tom). The file names are: p4_$INSTANCE.vars p4_$INSTANCE.p4review.cfg p4_$INSTANCE.vars will now set P4REPLICA to FALSE if SERVERID matches MASTERNAME, otherwise it is TRUE. This change means that a user must now change server.id in order to change the role of the server rather than changing the instance vars file. This makes more sense to a user who is reading the admin guide about server.id rather than overwriting the file based on a setting that isn't in the admin guide. Changed mkdirs to reflect all of the above changes. |
||
#13 | 11908 | adrian_waters | Use set -u to trap unbounded variables | ||
#12 | 11886 | Russell C. Jackson (Rusty) | Changed $prog to $0 so that we don't have to set prog in the calling functions. | ||
#11 | 11766 | Robert Cowham |
Missed a reference to $MAIL in @11758 Tweaked run_tests.sh to output more error messages on failure. Though this still doesn't show output of individual failed commands. |
||
#10 | 11758 | Russell C. Jackson (Rusty) |
Change MAIL variable to SDPMAIL to avoid conflicts with customer variables. Changed sdp_sync.sh to use get_mail_opts from backup_functions to avoid duplicate functions. |
||
#9 | 11730 | Russell C. Jackson (Rusty) |
Moved P4SERVER variable to p4_vars so that all scripts can use it properly. replica_status.sh referenced it, but it wasn't working since it was only in backup_functions.sh |
||
#8 | 11710 | Russell C. Jackson (Rusty) |
Changed die function to call new email function. Added su to OSUSER functionality to p4master_run to avoid problems with people running scripts manually as root by mistake. |
||
#7 | 11707 | Robert Cowham |
Refactored sending of mail to a common function. Make the setting of "MAILFROM" work for Ubuntu (GNU Mailutils) as well as CentOS |
||
#6 | 11570 | Russell C. Jackson (Rusty) |
Brought in changes from Mark Foundry to add -S $MAILFROM to mail commands. Changed sync_replica.sh and weekly_sync_replica.sh to use $LOGFILE for consistency. Added mail command to both files as well. |
||
#5 | 11540 | Russell C. Jackson (Rusty) | Converted to unix format. | ||
#4 | 11534 | Russell C. Jackson (Rusty) | Added -f to -jr to cover offline obliterates where the entries are already removed from the offline database. | ||
#3 | 11524 | Russell C. Jackson (Rusty) | Released updated version of the SDP from Dev. | ||
#2 | 11130 | Robert Cowham | Check for the existence of offline database and log error message if not found. | ||
#1 | 10148 | C. Thomas Tyler | Promoted the Perforce Server Deployment Package to The Workshop. |