#!/bin/bash
#==============================================================================
# Copyright and license info is available in the LICENSE file included with
# the Server Deployment Package (SDP), and also available online:
# https://swarm.workshop.perforce.com/projects/perforce-software-sdp/view/main/LICENSE
#------------------------------------------------------------------------------
# This script rebuilds an Edge server from a checkpoint of the master WHILE
# KEEPING THE EXISTING EDGE-SPECIFIC DATA.
#
# You must first copy the checkpoint from the master to the edge server before
# running this script. Then run this script on the Edge server, passing the
# instance number and the full path and filename of the master checkpoint as
# parameters.
#
# Run example:
# ./recover_edge.sh 1 /depotdata/p4_1.ckp.9188.gz

export SDP_INSTANCE=${SDP_INSTANCE:-Undefined}
export SDP_INSTANCE=${1:-$SDP_INSTANCE}
if [[ $SDP_INSTANCE == Undefined ]]; then
   echo "Instance parameter not supplied."
   echo "You must supply the Perforce instance as a parameter to this script."
   exit 1
fi

. /p4/common/bin/p4_vars $SDP_INSTANCE
. /p4/common/bin/backup_functions.sh

LOGFILE=$LOGS/recover_edge.log

######### Start of Script ##########

check_vars
set_vars

if [[ "$2" == "" ]]; then
   echo "You must pass in the full path and filename of the checkpoint you copied over from the master server."
   exit 2
fi
MASTERCKP=$2

log "Remove offline db"
rm -f /p4/${SDP_INSTANCE}/offline_db/db.* > $LOGFILE

# With -K we filter out the various Edge-specific tables, which will be
# replaced with the current live versions.
EXCLUDED_TABLES=db.have,db.working,db.resolve,db.locks,db.revsh,db.workingx,db.resolvex

log "Recover checkpoint from master into offline_db, skipping tables not used on the edge."
/p4/${SDP_INSTANCE}/bin/p4d_${SDP_INSTANCE} -r /p4/${SDP_INSTANCE}/offline_db/ -K $EXCLUDED_TABLES -jr -z $MASTERCKP >> $LOGFILE 2>&1

log "Stopping the edge server."
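# For illustration only (using the values from the run example in the header,
# i.e. instance 1 and checkpoint /depotdata/p4_1.ckp.9188.gz), the replay
# command above expands to:
#
#   /p4/1/bin/p4d_1 -r /p4/1/offline_db/ \
#      -K db.have,db.working,db.resolve,db.locks,db.revsh,db.workingx,db.resolvex \
#      -jr -z /depotdata/p4_1.ckp.9188.gz
#
# Each table named with -K is skipped during the replay; those tables are
# restored from the live edge database in the steps that follow.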
/p4/${SDP_INSTANCE}/bin/p4d_${SDP_INSTANCE}_init stop >> $LOGFILE 2>&1

# With -k we filter and checkpoint only the specified tables from the current
# live Edge DB.
CKP_TABLES=$EXCLUDED_TABLES,db.view,db.label,db.revsx,db.revux

log "Creating a dump of the edge specific data from P4ROOT"
/p4/${SDP_INSTANCE}/bin/p4d_${SDP_INSTANCE} -r /p4/${SDP_INSTANCE}/root/ -k $CKP_TABLES -jd /p4/${SDP_INSTANCE}/checkpoints/edgedump >> $LOGFILE 2>&1

log "Recover the edge dump into offline_db"
/p4/${SDP_INSTANCE}/bin/p4d_${SDP_INSTANCE} -r /p4/${SDP_INSTANCE}/offline_db -jr /p4/${SDP_INSTANCE}/checkpoints/edgedump >> $LOGFILE 2>&1

log "Reset the replication state and clear the P4ROOT folder db files."
rm -f /p4/${SDP_INSTANCE}/root/db.* >> $LOGFILE 2>&1
rm -f /p4/${SDP_INSTANCE}/root/state >> $LOGFILE 2>&1
rm -f /p4/${SDP_INSTANCE}/root/rdb.lbr >> $LOGFILE 2>&1
rm -f /p4/${SDP_INSTANCE}/logs/journal >> $LOGFILE 2>&1

log "Move the rebuilt database to P4ROOT"
mv /p4/${SDP_INSTANCE}/offline_db/db.* /p4/${SDP_INSTANCE}/root/ >> $LOGFILE 2>&1

log "Start the edge server back up."
/p4/${SDP_INSTANCE}/bin/p4d_${SDP_INSTANCE}_init start >> $LOGFILE 2>&1

log "Recreate the offline_db"
/p4/${SDP_INSTANCE}/bin/p4d_${SDP_INSTANCE} -r /p4/${SDP_INSTANCE}/offline_db/ -K $EXCLUDED_TABLES -jr -z $MASTERCKP >> $LOGFILE 2>&1
/p4/${SDP_INSTANCE}/bin/p4d_${SDP_INSTANCE} -r /p4/${SDP_INSTANCE}/offline_db -jr /p4/${SDP_INSTANCE}/checkpoints/edgedump >> $LOGFILE 2>&1

log "Create a new edge checkpoint from offline_db"
/p4/${SDP_INSTANCE}/bin/p4d_${SDP_INSTANCE} -r /p4/${SDP_INSTANCE}/offline_db -jd -z /p4/${SDP_INSTANCE}/checkpoints/rebuilt_edge_dump.gz >> $LOGFILE 2>&1

# Inform the user about follow-up tasks.
log "Rebuilt checkpoint is: /p4/${SDP_INSTANCE}/checkpoints/rebuilt_edge_dump.gz"
log "If you run this script the night before a recreate_db_checkpoint.sh is going to run,"
log "you need to delete the highest numbered checkpoint in /p4/${SDP_INSTANCE}/checkpoints"
log "and rename /p4/${SDP_INSTANCE}/checkpoints/rebuilt_edge_dump.gz to replace that file."

cat $LOGFILE
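# Suggested follow-up checks (hypothetical examples, not performed by this
# script): after the edge server restarts, an operator can confirm it is
# serving requests and pulling journal records from the master again, e.g.:
#
#   p4 -p <edge_port> info
#   p4 -p <edge_port> pull -lj
#
# Here <edge_port> is a placeholder for this instance's P4PORT; this script
# does not define it.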