#!/bin/bash
#==============================================================================
# Copyright and license info is available in the LICENSE file included with
# the Server Deployment Package (SDP), and also available online:
# https://swarm.workshop.perforce.com/projects/perforce-software-sdp/view/main/LICENSE
#------------------------------------------------------------------------------
set -u

# Common functions used in all backup scripts.

# check_vars: verify that every external variable this library relies on is
# set and non-empty. Prints a two-line hint to stdout and exits 1 otherwise.
# The ${!name:-} form is required because 'set -u' is active: a plain "$VAR"
# on an unset variable would abort the shell with "unbound variable" before
# the hint could be printed.
check_vars () {
   local name
   for name in SDP_INSTANCE P4HOME P4PORT P4ROOT P4JOURNAL P4BIN P4DBIN \
               P4TICKETS KEEPCKPS KEEPJNLS KEEPLOGS CHECKPOINTS LOGS OSUSER; do
      if [[ -z "${!name:-}" ]]; then
         echo "Use p4master_run or source p4_vars when calling this script."
         echo "Required external variable not set. Abort!"
         exit 1
      fi
   done
}

# set_vars: derive the globals used by the other functions.
#   RC          - path of the instance init script ($P4HOME/bin/p4d_<instance>_init)
#   OFFLINE_DB  - path of the offline database directory
#   EDGESERVER  - 1 when the 'Services' field of this server's spec contains
#                 "edge-server", otherwise 0
# Reads P4HOME, SDP_INSTANCE, P4CBIN, P4BIN, P4USER, P4PORT and SERVERID.
set_vars () {
   RC=$P4HOME/bin/p4d_${SDP_INSTANCE}_init
   OFFLINE_DB=${P4HOME}/offline_db
   EDGESERVER=0
   # p4login runs before the server query (presumably to refresh the login
   # ticket; its implementation is not part of this file).
   "$P4CBIN"/p4login
   if "$P4BIN" -u "$P4USER" -p "$P4PORT" server -o "$SERVERID" \
         | grep '^Services' | grep -q "edge-server"; then
      EDGESERVER=1
   fi
}

# check_uid: check that the script is being run as the required OS user
# ($OSUSER); calls die otherwise.
check_uid () {
   local user
   user=$(id -un)
   # The right-hand side is quoted so it is compared literally rather than
   # being interpreted as a glob pattern.
   if [[ "${user}" != "${OSUSER}" ]]; then
      die "Must be run by user: ${OSUSER}. Abort!"
   fi
}

# log: append one line to $LOGFILE in the form "<date> <script>: <message>".
# All arguments are joined with spaces to form the message.
log () {
   # The redirections are ordered so that stderr follows stdout into the log
   # file ('2>&1 >> file' would leave stderr pointing at the old stdout).
   printf '%s %s: %s\n' "$(date)" "$0" "$*" >> "$LOGFILE" 2>&1
}

# Decide depending on our mail utility, how to specify sender (if we need to).
# Mail on some platforms sets sender by default.
# If the mail utility returns what looks like a version identifier
# when given the '-V' flag, use a '-S' flag. If it does not return a
# version identifier, don't set a mail sender option.
# Allow GNU Mailutils alternative flag instead.
# get_mail_sender_opt: print the option string that makes $SDPMAIL send with
# $MAILFROM as sender, or an empty line when no option is needed/possible.
#   "-aFrom:<addr>"    when '$SDPMAIL -V' mentions "GNU Mailutils"
#   "-S from=<addr>"   when '$SDPMAIL -V' output starts with a version number
# MAILFROM may be unset or empty (no option is printed in that case).
get_mail_sender_opt () {
   local mail_sender_opt=
   local mail_ver=
   # ${MAILFROM:-} keeps this safe when the caller runs with 'set -u'.
   if [[ -n "${MAILFROM:-}" ]]; then
      mail_ver=$($SDPMAIL -V 2>&1)
      if [[ "$mail_ver" =~ "GNU Mailutils" ]]; then
         mail_sender_opt="-aFrom:$MAILFROM"
      elif [[ "$mail_ver" =~ ^[0-9]+\.[0-9] ]]; then
         mail_sender_opt="-S from=$MAILFROM"
      fi
   fi
   echo "$mail_sender_opt"
}

# mail_log_file: mail the contents of $LOGFILE to $MAILTO.
#   $1 - subject line
mail_log_file () {
   local subject=$1
   local mail_sender_opt
   mail_sender_opt=$(get_mail_sender_opt)
   # $mail_sender_opt and $MAILTO are intentionally unquoted: the sender
   # option may be two words ("-S from=...") or empty, and must be split.
   $SDPMAIL -s "$subject" $mail_sender_opt $MAILTO < "$LOGFILE"
}

# die: log the error, mail the log file with a more helpful subject line than
# cron would give, echo the message to stderr when attached to a terminal,
# then exit 1. All arguments form the message.
die () {
   log "ERROR!!! - $HOSTNAME $P4SERVER $0: $*"
   mail_log_file "ERROR!!! - $HOSTNAME $P4SERVER $0: $*"
   # if running from terminal, also send to stderr
   if tty > /dev/null; then
      echo "$@" >&2
   fi
   exit 1
}

# ckp_running: refuse to start (die) if the marker file of a previous
# checkpoint run still exists, otherwise create the marker file.
ckp_running() {
   if [[ -f "${LOGS}/ckp_running.txt" ]]; then
      die "Last checkpoint not complete. Check the backup process or contact support."
   fi
   echo "Checkpoint running." > "${LOGS}/ckp_running.txt"
}

# ckp_complete: remove the marker file created by ckp_running.
ckp_complete() {
   rm -f "${LOGS}/ckp_running.txt"
}

# checkdir: die unless directory $1 is writable.
checkdir () {
   local dir=$1
   [[ -w "$dir" ]] && return
   die "$dir is not writable. Abort!"
}

# check_dirs: check that the key directories (offline db, checkpoints, logs)
# are writable; aborts via checkdir/die on the first failure.
check_dirs () {
   local dir
   # dirs_ok is not read anywhere in the visible part of this file; it is a
   # global, so it is retained in case an including script looks at it.
   dirs_ok=true
   for dir in "$OFFLINE_DB" "$CHECKPOINTS" "$LOGS"; do
      checkdir "$dir" # aborts on failure.
   done
}

# check_disk_space: append the output of 'p4 diskspace' to the log file.
check_disk_space () {
   log "Checking disk space..."
   "$P4BIN" diskspace >> "$LOGFILE" 2>&1
}

# check_journalnum: die unless $1 consists only of decimal digits.
# A missing or empty argument is reported the same way as a non-numeric one.
check_journalnum () {
   local JNLNUM=${1:-}
   local re='^[0-9]+$'
   if ! [[ $JNLNUM =~ $re ]]; then
      die "Journal number must be numeric."
   fi
}

# get_journalnum: read the 'journal' counter from the live server and set the
# globals JOURNALNUM and CHECKPOINTNUM (= JOURNALNUM + 1).
get_journalnum () {
   JOURNALNUM=$("$P4BIN" -u "$P4USER" -p "$P4PORT" counter journal 2>> "$LOGFILE") || die "Cannot get the checkpoint number. Abort!"
   check_journalnum "$JOURNALNUM"
   # If we are on an edge server, the journal has already rotated, so we have to decrement the value
   # so that we replay the correct journal file and create the correct checkpoint number on the
   # edge server.
   if [[ $EDGESERVER -eq 1 ]]; then
      JOURNALNUM=$((JOURNALNUM - 1))
   fi
   CHECKPOINTNUM=$((JOURNALNUM + 1))
}

# check_offline_db_usable: die unless the offline database carries its
# "usable" marker file and contains a db.counters table.
check_offline_db_usable () {
   if [[ ! -f "$OFFLINE_DB/offline_db_usable.txt" ]]; then
      die "Offline database not in a usable state. Check the backup process."
   fi
   if [[ ! -f "$OFFLINE_DB/db.counters" ]]; then
      die "Offline database not found. Consider creating it with live_checkpoint.sh. Be aware that it locks the live system and can take a long time! Abort!"
   fi
}

# _get_db_journal_num: private helper. Print the value of the 'journal'
# counter stored in the db.counters table of database directory $1.
# Returns non-zero when p4d fails or when no journal counter is found, so
# that callers can use '|| die'. p4d's stderr is appended to $LOGFILE.
_get_db_journal_num () {
   local db_dir=$1
   local counters
   local num
   # Capture the dump first: the exit status of a 'p4d | grep | cut'
   # pipeline would be that of cut and would hide a p4d failure.
   counters=$("$P4DBIN" -r "$db_dir" -jd - db.counters 2>> "$LOGFILE") || return 1
   num=$(printf '%s\n' "$counters" | grep '@journal@' | cut -d "@" -f 8)
   [[ -n "$num" ]] || return 1
   printf '%s\n' "$num"
}

# get_offline_journal_num: set the global OFFLINEJNLNUM to the journal
# counter of the offline database; dies if it cannot be determined.
get_offline_journal_num () {
   check_offline_db_usable
   OFFLINEJNLNUM=$(_get_db_journal_num "$OFFLINE_DB") || die "Cannot get the offline journal number. Abort!"
   check_journalnum "$OFFLINEJNLNUM"
   log "Offline journal number is: $OFFLINEJNLNUM"
}

# remove_old_checkpoints_and_journals: delete all but the newest checkpoint
# and rotated-journal files in $CHECKPOINTS.
#   KEEPCKPS = 0  -> nothing is removed at all
#   KEEPJNLS = 0  -> journals are kept (consistent with remove_old_logs,
#                    where 0 means "skip cleanup")
remove_old_checkpoints_and_journals () {
   local I_LOGFILE
   if [[ $KEEPCKPS -eq 0 ]]; then
      log "Skipping cleanup of old checkpoints because KEEPCKPS is set to 0."
   else
      log "Deleting obsolete checkpoints and journals. Keeping latest $KEEPCKPS per KEEPCKPS setting in p4_vars."
      # Remove selected checkpoint and journal files based on the KEEPCKPS setting regardless of whether compressed or not.
      # We multiply KEEPCKP by 2 for the ckp files because of the md5 files.
      # The count is interpolated into the awk program (not passed with -v)
      # so that an empty/garbled value is an awk syntax error that deletes
      # nothing, rather than a comparison that matches every line.
      while IFS= read -r I_LOGFILE; do
         log "rm -f $I_LOGFILE"
         rm -f "$I_LOGFILE"
      done < <(ls -t "${CHECKPOINTS}/${P4SERVER}".ckp.* 2>/dev/null | awk "NR > ($KEEPCKPS * 2)")

      # Use KEEPJNLS to allow for separate journal rotation at a higher frequency.
      if [[ $KEEPJNLS -eq 0 ]]; then
         # Without this guard "NR > 0" would select, and delete, every journal.
         log "Skipping cleanup of old journals because KEEPJNLS is set to 0."
      else
         while IFS= read -r I_LOGFILE; do
            log "rm -f $I_LOGFILE"
            rm -f "$I_LOGFILE"
         done < <(ls -t "${CHECKPOINTS}/${P4SERVER}".jnl.* 2>/dev/null | awk "NR > $KEEPJNLS")
      fi
   fi
}

# stop_p4d: stop the server through the init script ($RC) and wait until no
# process matching "p4d_<instance>" (case-insensitive) is left.
# NOTE(review): the match is a substring match, so instance "1" also matches
# processes of instance "10"; confirm instance naming before relying on it.
stop_p4d () {
   local proc_pattern="p4d_${SDP_INSTANCE}"
   local -i COUNTER=0
   local -i WAITCOUNT=0
   log "Shutting down the p4 server"
   "$RC" stop >> "$LOGFILE" 2>&1
   # grep -c yields a bare number; 'wc -l' pads its output on some platforms,
   # which would make a string comparison against "0" never succeed.
   COUNTER=$(ps -ef | grep -i "$proc_pattern" | grep -vc grep)
   while (( COUNTER != 0 )); do
      sleep 5
      COUNTER=$(ps -ef | grep -i "$proc_pattern" | grep -vc grep)
      WAITCOUNT=$(( WAITCOUNT + 1 ))
      # The WAITCOUNT value below is 120 * 5 seconds = 10 minutes.
      # If p4d hasn't shut down by then, something is not shutting down on its own, so we kill it.
      if (( WAITCOUNT > 120 )); then
         ps -ef | grep -i "$proc_pattern" | grep -v grep | awk '{print $2}' | xargs kill > /dev/null 2>&1
      fi
   done
   log "p4 stop finished -- p4 should be down now."
}

# start_p4d: start the server through the init script ($RC) and log whether
# 'p4 info' succeeds afterwards. Does not abort when the check fails.
start_p4d () {
   log "Starting the p4 server"
   "$RC" start >> "$LOGFILE" 2>&1
   sleep 3 # Give it a few seconds to start up
   # Confirm that it started - success below means it did
   if "$P4BIN" -u "$P4USER" -p "$P4PORT" info > /dev/null 2>&1; then
      log "Server restarted successfully - p4 should be back up now."
   else
      log "Error: Server does not appear to have started."
   fi
}

# truncate_journal: rotate the journal of the running server with
# 'p4 admin journal' (skipped on edge servers). Dies if the target checkpoint
# or journal file already exists or if the rotation command fails.
truncate_journal () {
   if [[ -f "${CHECKPOINTS}/${P4SERVER}.ckp.${CHECKPOINTNUM}.gz" ]]; then
      die "Checkpoint ${CHECKPOINTS}/${P4SERVER}.ckp.${CHECKPOINTNUM}.gz already exists, check the backup process."
   fi
   if [[ $EDGESERVER -eq 0 ]]; then
      if [[ -f "${CHECKPOINTS}/${P4SERVER}.jnl.${JOURNALNUM}" ]]; then
         die "Journal ${CHECKPOINTS}/${P4SERVER}.jnl.${JOURNALNUM} already exists, check the backup process."
      fi
      log "Truncating journal..."
      # The rotation is requested from the server at $P4MASTERPORT, after
      # running p4login against that port.
      # curly braces are necessary to capture the output of 'time'
      "$P4CBIN"/p4login -p "$P4MASTERPORT"
      { time "$P4BIN" -p "$P4MASTERPORT" admin journal "${CHECKPOINTS}/${P4SERVER}"; } >> "$LOGFILE" 2>&1 || { die "Journal rotation failed. Abort!"; }
      # Wait until the journal file exists in the checkpoints directory
      # before proceeding. The first check happens after one 5 second sleep.
      # NOTE(review): there is no timeout; this blocks until the file appears.
      while :; do
         sleep 5
         [[ -f "${CHECKPOINTS}/${P4SERVER}.jnl.${JOURNALNUM}" ]] && break
      done
      "$P4CBIN"/p4login
   fi
}

# p4d_truncate_journal: rotate the journal directly with 'p4d -jj' (skipped
# on edge servers). On failure the server is started again before dying.
p4d_truncate_journal () {
   if [[ -f "${CHECKPOINTS}/${P4SERVER}.ckp.${CHECKPOINTNUM}.gz" ]]; then
      die "Checkpoint ${CHECKPOINTS}/${P4SERVER}.ckp.${CHECKPOINTNUM}.gz already exists, check the backup process."
   fi
   if [[ $EDGESERVER -eq 0 ]]; then
      if [[ -f "${CHECKPOINTS}/${P4SERVER}.jnl.${JOURNALNUM}" ]]; then
         die "Journal ${CHECKPOINTS}/${P4SERVER}.jnl.${JOURNALNUM} already exists, check the backup process."
      fi
      log "Truncating journal..."
      # 'p4d -jj' does a copy-then-delete, instead of a simple mv.
      # during 'p4d -jj' the perforce server will hang the responses to clients.
      # curly braces are necessary to capture the output of 'time'
      { time "$P4DBIN" -r "$P4ROOT" -J "$P4JOURNAL" -jj "${CHECKPOINTS}/${P4SERVER}"; } >> "$LOGFILE" 2>&1 || { start_p4d; die "Journal rotation failed. Abort!"; }
   fi
}

# replay_journals_to_offline_db: replay rotated journals numbered
# OFFLINEJNLNUM through JOURNALNUM (inclusive) into the offline database.
replay_journals_to_offline_db () {
   local -i j
   log "Replay any unreplayed journals to the offline database"
   for (( j = OFFLINEJNLNUM; j <= JOURNALNUM; j++ )); do
      log "Replay journal ${P4SERVER}.jnl.${j} to offline db."
      # curly braces are necessary to capture the output of 'time'
      { time "$P4DBIN" -r "$OFFLINE_DB" -jr -f "${CHECKPOINTS}/${P4SERVER}.jnl.${j}"; } >> "$LOGFILE" 2>&1 || { die "Offline journal replay failed. Abort!"; }
   done
}

# replay_active_journal_to_offline_db: replay the live journal ($P4JOURNAL)
# into the offline database.
replay_active_journal_to_offline_db () {
   log "Replay active journal to offline db."
   # curly braces are necessary to capture the output of 'time'
   { time "$P4DBIN" -r "$OFFLINE_DB" -jr -f "${P4JOURNAL}"; } >> "$LOGFILE" 2>&1 || { die "Active Journal replay failed. Abort!"; }
}

# recreate_offline_db_files: wipe the offline database and rebuild it from
# the most recently modified compressed checkpoint in $CHECKPOINTS.
# Sets the global LASTCKP. Dies (after clearing the ckp_running marker) when
# no checkpoint exists, and dies when the restore fails.
recreate_offline_db_files () {
   # Locate the newest checkpoint BEFORE removing anything, so that a missing
   # checkpoint leaves the existing offline database untouched.
   LASTCKP=$(ls -t "${CHECKPOINTS}/${P4SERVER}".ckp.*.gz 2>/dev/null | head -1)
   if [[ -z "$LASTCKP" ]]; then
      ckp_complete
      die "No checkpoints found - run live_checkpoint.sh"
   fi
   rm -f "${OFFLINE_DB}/offline_db_usable.txt"
   rm -f "${OFFLINE_DB}"/db.* >> "$LOGFILE"
   rm -f "${OFFLINE_DB}"/save/db.* >> "$LOGFILE"
   log "Recovering from $LASTCKP"
   # curly braces are necessary to capture the output of 'time'
   { time "$P4DBIN" -r "$OFFLINE_DB" -jr -z "${LASTCKP}"; } >> "$LOGFILE" 2>&1 || { die "Restore of checkpoint to $OFFLINE_DB failed!"; }
   echo "Offline db file restored successfully." > "${OFFLINE_DB}/offline_db_usable.txt"
}

# checkpoint: create a new compressed checkpoint from the live db files
# ('p4d -jc -Z').
checkpoint () {
   log "Create a new checkpoint from the live db files."
   # curly braces are necessary to capture the output of 'time'
   { time "$P4DBIN" -r "$P4ROOT" -jc -Z "${CHECKPOINTS}/${P4SERVER}"; } >> "$LOGFILE" 2>&1 || { die "ERROR - New checkpoint failed!"; }
}

# dump_checkpoint: dump a compressed checkpoint numbered $CHECKPOINTNUM from
# the db files in $ROOTDIR (ROOTDIR is expected to be set by the caller).
dump_checkpoint () {
   log "Dump out new checkpoint from db files in $ROOTDIR."
   # curly braces are necessary to capture the output of 'time'
   { time "$P4DBIN" -r "$ROOTDIR" -jd -z "${CHECKPOINTS}/${P4SERVER}.ckp.${CHECKPOINTNUM}.gz"; } >> "$LOGFILE" 2>&1 || { die "New checkpoint dump failed!"; }
}

# compare_journal_numbers: die unless the journal counters stored in the
# offline database and in $P4ROOT are both readable, numeric and equal.
compare_journal_numbers () {
   local _OFFLINEJNLNUM
   local _JNLNUM
   # Get the journal number of the offline database
   check_offline_db_usable
   _OFFLINEJNLNUM=$(_get_db_journal_num "$OFFLINE_DB") || die "Cannot get $OFFLINE_DB journal number. Abort!"
   check_journalnum "$_OFFLINEJNLNUM"
   # Get the journal number of the root database
   if [[ ! -f "$P4ROOT/db.counters" ]]; then
      die "$P4ROOT database not found. Something is seriously wrong since the server was just running a minute ago! Contact support@perforce.com"
   fi
   _JNLNUM=$(_get_db_journal_num "$P4ROOT") || die "Cannot get $P4ROOT journal number. Abort!"
   check_journalnum "$_JNLNUM"
   if [[ $_JNLNUM -ne $_OFFLINEJNLNUM ]]; then
      log "$P4ROOT journal number is: $_JNLNUM"
      log "$OFFLINE_DB journal number is: $_OFFLINEJNLNUM"
      die "$P4ROOT and $OFFLINE_DB numbers do not match."
   fi
}

# switch_db_files: swap the symbolic links $P4ROOT and $OFFLINE_DB so that
# the freshly built offline database becomes the live one. The current live
# db.* files are moved to $P4ROOT/save and the license, rdb.lbr, state* and
# server.id files are carried over. Sets the globals OLDBLNK and ROOTLNK.
switch_db_files () {
   # Compare the Offline and Master journal numbers before switching to make sure they match.
   compare_journal_numbers
   # Resolve both links before touching any file: if either path is not a
   # symbolic link the swap below cannot work, and failing here avoids
   # leaving the live db files moved aside with no replacement.
   OLDBLNK=$(readlink "$OFFLINE_DB")
   ROOTLNK=$(readlink "$P4ROOT")
   if [[ -z "$OLDBLNK" || -z "$ROOTLNK" ]]; then
      die "$P4ROOT and $OFFLINE_DB must both be symbolic links. Abort!"
   fi
   log "Switching root and offline_db links..."
   [[ -d "${P4ROOT}/save" ]] || mkdir -p "${P4ROOT}/save"
   rm -f "${P4ROOT}"/save/db.* >> "$LOGFILE" 2>&1
   mv "${P4ROOT}"/db.* "${P4ROOT}/save" >> "$LOGFILE" 2>&1
   # Some of the files below may not exist; mv errors go to the log only.
   mv "${P4ROOT}"/license* "${OFFLINE_DB}" >> "$LOGFILE" 2>&1
   mv "${P4ROOT}/rdb.lbr" "${OFFLINE_DB}" >> "$LOGFILE" 2>&1
   mv "${P4ROOT}"/state* "${OFFLINE_DB}" >> "$LOGFILE" 2>&1
   mv "${P4ROOT}/server.id" "${OFFLINE_DB}" >> "$LOGFILE" 2>&1
   rm -f "${OFFLINE_DB}/offline_db_usable.txt"
   unlink "$OFFLINE_DB"
   unlink "$P4ROOT"
   ln -s "$OLDBLNK" "$P4ROOT" >> "$LOGFILE" 2>&1 || die "Link of $OLDBLNK to $P4ROOT failed."
   ln -s "$ROOTLNK" "$OFFLINE_DB" >> "$LOGFILE" 2>&1 || die "Link of $ROOTLNK to $OFFLINE_DB failed."
}

# rotate_log_file: inside $LOGS, rename log file $1 to "$1.<timestamp>" if it
# exists, and gzip the renamed file when a second, non-empty argument is given.
#   $1 - log file name (relative to $LOGS, or an absolute path)
#   $2 - optional; any non-empty value requests compression
# Sets the globals ROTATE_LOGNAME, GZ_EXT and LOGID.
rotate_log_file () {
   # Renaming relative names in the wrong directory would be harmful, so a
   # failed cd is fatal.
   cd "$LOGS" || die "Cannot cd to $LOGS. Abort!"
   ROTATE_LOGNAME=$1
   GZ_EXT=${2:-}
   LOGID=$(date +'%Y-%m-%d_%H-%M-%S')
   if [[ -f "${ROTATE_LOGNAME}" ]]; then
      mv -f "${ROTATE_LOGNAME}" "${ROTATE_LOGNAME}.${LOGID}" >> "$LOGFILE" 2>&1
      if [[ -n "$GZ_EXT" ]]; then
         gzip "${ROTATE_LOGNAME}.${LOGID}" >> "$LOGFILE" 2>&1
      fi
   fi
   cd - > /dev/null
}

# At the start of each run for live_checkpoint.sh, daily_checkpoint.sh, and
# recreate_db_checkpoint.sh, before *any* logging activity occurs, rotate the logs
# from the most recent prior run, always named "checkpoint.log" or "log".
rotate_last_run_logs () {
   # Rotate prior log file for the current script.
   rotate_log_file "$LOGFILE"
   # Rotate prior server log.
   rotate_log_file "log" ".gz"
   # Rotate prior broker log.
   rotate_log_file "p4broker.log" ".gz"
   # Rotate prior audit log.
   rotate_log_file "audit.log" ".gz"
}

# remove_log_files: in the current directory, delete all but the $2 most
# recently modified files whose names start with $1.
# Sets the globals REMOVE_LOGNAME and KEEPNUM.
remove_log_files () {
   local I_LOGFILE
   REMOVE_LOGNAME=$1
   KEEPNUM=$2
   # See remove_old_checkpoints_and_journals for why the count is
   # interpolated into the awk program instead of being passed with -v.
   while IFS= read -r I_LOGFILE; do
      log "rm -f $I_LOGFILE"
      rm -f "$I_LOGFILE"
   done < <(ls -t "${REMOVE_LOGNAME}"* 2>/dev/null | awk "NR > $KEEPNUM")
}

# remove_old_logs: prune rotated logs in $LOGS. Checkpoint logs follow
# KEEPJNLS, all other logs follow KEEPLOGS; a value of 0 disables the
# respective cleanup.
remove_old_logs () {
   # Remove old Checkpoint Logs
   # Use KEEPJNLS rather than KEEPLOGS, so we keep the same number
   # of checkpoint logs as we keep checkpoints.
   # remove_log_files works on relative names, so a failed cd is fatal.
   cd "$LOGS" || die "Cannot cd to $LOGS. Abort!"
   if [[ $KEEPJNLS -eq 0 ]]; then
      log "Skipping cleanup of old checkpoint logs because KEEPJNLS is set to 0."
   else
      log "Deleting old checkpoint logs. Keeping latest $KEEPJNLS, per KEEPJNLS setting in p4_vars."
      remove_log_files "checkpoint.log" "$KEEPJNLS"
   fi
   if [[ $KEEPLOGS -eq 0 ]]; then
      log "Skipping cleanup of old server logs because KEEPLOGS is set to 0."
   else
      log "Deleting old server logs. Keeping latest $KEEPLOGS, per KEEPLOGS setting in p4_vars."
      remove_log_files "log" "$KEEPLOGS"
      remove_log_files "p4broker.log" "$KEEPLOGS"
      remove_log_files "audit.log" "$KEEPLOGS"
      remove_log_files "sync_replica.log" "$KEEPLOGS"
      remove_log_files "recreate_offline_db.log" "$KEEPLOGS"
      remove_log_files "upgrade.log" "$KEEPLOGS"
      remove_log_files "p4login" "$KEEPLOGS"
      remove_log_files "p4verify.log" "$KEEPLOGS"
   fi
   cd - > /dev/null
}

# set_counter: record the current time in the server counter
# 'lastSDPCheckpoint' (epoch seconds followed by a human-readable date).
set_counter() {
   "$P4CBIN"/p4login
   "$P4BIN" -u "$P4USER" -p "$P4PORT" counter lastSDPCheckpoint "$(date +'%s (%Y/%m/%d %H:%M:%S %z %Z)')" > /dev/null
}
# | Change | User | Description | Committed | |
---|---|---|---|---|---|
#7 | 25113 | Robert Cowham | Merge latest changes from dev | ||
#6 | 23430 | Robert Cowham | Merge in changes from dev | ||
#5 | 23205 | Robert Cowham | Merged all changes from dev to test | ||
#4 | 22477 | Robert Cowham | Bring latest dev changes into test | ||
#3 | 22142 | Robert Cowham | Merge in latest changes from Dev | ||
#2 | 20726 | Robert Cowham | Catch up from dev | ||
#1 | 18586 | Robert Cowham | Branching using cowhamr.sdp.dev | ||
//guest/perforce_software/sdp/dev/Server/Unix/p4/common/bin/backup_functions.sh | |||||
#21 | 18533 | Robert Cowham |
Put a date/time suffix onto checkpoint.log.* files in case of any errors to avoid them being overwritten. Make remove_old_logs tidy up appropriately. |
||
#20 | 18532 | Robert Cowham | Correct log message regarding journals replays | ||
#19 | 18484 | Russell C. Jackson (Rusty) | Added comment on WAITCOUNT to explain the value. | ||
#18 | 18450 | Russell C. Jackson (Rusty) |
Added a kill for the p4d_stop function in case p4d doesn't shut down. In the process of testing this, I discovered that using $P4DBIN in this case was a bug that didn't work when running in case insensitive mode because the executable doesn't match what is actually running since we end up calling p4d from /p4/common/bin. Corrected the grep so that it would match in either case. #review-18430 |
||
#17 | 16335 | C. Thomas Tyler |
Routine Merge Down to dev from main using: p4 merge -b perforce_software-sdp-dev |
||
#16 | 16029 | C. Thomas Tyler |
Routine merge to dev from main using: p4 merge -b perforce_software-sdp-dev |
||
#15 | 15797 | C. Thomas Tyler | Routine Merge Down to dev from main for SDP. | ||
#14 | 15778 | C. Thomas Tyler | Routine Merge Down to dev from main. | ||
#13 | 15376 | adrian_waters | formatting only - fix spacing; there's inconsistent use of tabs/spaces throughout the file - needs cleanup at some point. | ||
#12 | 15375 | adrian_waters | Routine merge-down from main->dev | ||
#11 | 15374 | adrian_waters |
- Ensure backup scripts are run as the OSUSER (to prevent accidental running as root); - in scripts where LOGFILE value is changed from the 'checkpoint.log' set by set_vars, ensure the new assignment is before check_dirs is called, otherwise errors could be written to the 'wrong' log - in 'die()' - detect if running from terminal & also send output to stderr |
||
#10 | 13931 | C. Thomas Tyler | Routine merge-down to dev from main. | ||
#9 | 13906 | C. Thomas Tyler |
Normalized P4INSTANCE to SDP_INSTANCE to get Unix/Windows implementations in sync. Reasons: 1. Things that interact with SDP in both Unix and Windows environments shouldn't have to account for this obscure SDP difference between Unix and Windows. (I came across this doing CBD work). 2. The Windows and Unix scripts have different variable names for defining the same concept, the SDP instance. Unix uses P4INSTANCE, while Windows uses SDP_INSTANCE. 3. This instance tag, a data set identifier, is an SDP concept. I prefer the SDP_INSTANCE name over P4INSTANCE, so I propose to normalize to SDP_INSTANCE. 4. The P4INSTANCE name makes it look like a setting that might be recognized by the p4d itself, which it is not. (There are other such things such as P4SERVER that could perhaps be renamed as a separate task; but I'm not sure we want to totally disallow the P4 prefix for variable names. It looks too right to be wrong in some cases, like P4BIN and P4DBIN. That's a discussion for another day, outside the scope of this task). Meanwhile: * Fixed a bug in the Windows 2013.3 upgrade script that was referencing undefined P4INSTANCE, as the Windows environment defined only SDP_INSTANCE. * Had P4INSTANCE been removed completely, this change would likely cause trouble for users doing updates for existing SDP installations. So, though it involves slight technical debt, I opted to keep a redundant definition of P4INSTANCE in p4_vars.template, with comments indicating SDP_INSTANCE should be used in favor of P4INSTANCE, with a warning that P4INSTANCE may go away in a future release. This should avoid unnecessary upgrade pain. * In mkdirs.sh, the variable name was INSTANCE rather than SDP_INSTANCE. I changed that as well. That required manual change rather than sub/replace to avoid corrupting other similar variable names (e.g. MASTERINSTANCE). This is a trivial change technically (a substitute/replace, plus tweaks in p4_vars.template), but impacts many files. |
||
#8 | 12169 | Russell C. Jackson (Rusty) |
Updated copyright date to 2015 Updated shell scripts to require an instance parameter to eliminate the need for calling p4master_run. Python and Perl still need it since you have to set the environment for them to run in. Incorporated comments from reviewers. Left the . instead of source as that seems more common in the field and has the same functionality. |
||
#7 | 12028 | C. Thomas Tyler | Refreshed SDP dev branch, merging down from main. | ||
#6 | 11541 | Russell C. Jackson (Rusty) | Keeping dev up to date. | ||
#5 | 11535 | Russell C. Jackson (Rusty) | Updated dev from main. | ||
#4 | 11509 | Russell C. Jackson (Rusty) |
Added sync_replica.log to backup function log rotations, and added rm on existing gzipped logs with the same name in order to keep the script from hanging waiting for a response to overwrite. Added sync_shared_replica.sh and weekly_sync_shared_replica.sh to support replicas with shared depotdata storage. No rsync is necessary. The logs volume must not be a shared volume with these scripts though. |
||
#3 | 11483 | Russell C. Jackson (Rusty) | Brought over changes from RCJ backup_functions.sh | ||
#2 | 11463 | Russell C. Jackson (Rusty) | Updated dev to prepare for Summit agreed changes. | ||
#1 | 10638 | C. Thomas Tyler | Populate perforce_software-sdp-dev. | ||
//guest/perforce_software/sdp/main/Server/Unix/p4/common/bin/backup_functions.sh | |||||
#1 | 10148 | C. Thomas Tyler | Promoted the Perforce Server Deployment Package to The Workshop. |