recover_edge.sh #20

  • //
  • guest/
  • perforce_software/
  • sdp/
  • dev/
  • Server/
  • Unix/
  • p4/
  • common/
  • bin/
  • recover_edge.sh
  • View
  • Commits
  • Open Download .zip Download (10 KB)
#!/bin/bash
#==============================================================================
# Copyright and license info is available in the LICENSE file included with
# the Server Deployment Package (SDP), and also available online:
# https://swarm.workshop.perforce.com/projects/perforce-software-sdp/view/main/LICENSE
#------------------------------------------------------------------------------

# This script is designed to rebuild an Edge server from a seed checkpoint from
# the commit server WHILE KEEPING THE EXISTING EDGE SPECIFIC DATA.
#
# You have to first copy the seed checkpoint from the commit server, created with
# edge_dump.sh, to the edge server before running this script.  (Alternately,
# a full checkpoint from the commit server can be used so long as the edge server
# spec does not specify any filtering, e.g. does not make use of the
# ArchiveDataFilter or RevisionDataFilter fields of the server spec.)

# Then run this script on the edge server with the SDP instance name and full
# path of the commit seed checkpoint as parameters. The checkpoint can be
# a file or directory (for parallel checkpoints).
#
# Example 1: Recover for SDP instance 1, with a checkpoint file copied
# from the commit server in the usual place:
#  ./recover_edge.sh 1 /p4/1/checkpoints/p4_1.edge_syd.seed.ckp.9188.gz
#
#
# Example 2: Recover for SDP instance abc, with a checkpoint directory
# (from a parallel checkpoint) stored in a non-SDP location:
#  ./recover_edge.sh abc /home/perforce/xfer_ckp/p4_abc.edge_syd.seed.ckp.9188

# Fail fast on any reference to an unset variable.
set -u

# Table lists populated by /p4/common/bin/edge_vars (validated after sourcing).
declare ExcludedTables=
declare CheckpointTables=

# Scratch state: p4d command line under construction, and checkpoint locations.
declare Cmd=
declare EdgeCheckpointsDir=
declare EdgeDumpPrefix=
declare NewEdgeDumpPrefix=

# Set to 1 when the supplied seed checkpoint is a parallel checkpoint directory.
declare -i DoEdgeSeedReplayParallel=0

# These two Create*Checkpoint values are set in the set_vars() function in
# backup_functions.sh:
# declare -i CreateParallelCheckpoint=0
# declare -i CreateMultifileParallelCheckpoint=0

# Thread count for parallel replay/dump; the integer attribute makes the empty
# initializer evaluate to 0 until a real value is assigned.
declare -i Threads=

# Timestamp and file name used when moving P4LOG/P4JOURNAL aside in Phase 5.
declare Timestamp=
declare MovedFile=

# Untimestamped symlink that always points at the most recent run's log.
declare LogLink=

# Print command-line usage and terminate the script with exit status 1.
usage () {
   printf 'Usage:\n\t%s <SDP_Instance> <EdgeSeedCheckpoint>\n\n' "${0##*/}"
   exit 1
}

# Show usage on wrong argument count or an explicit -h.
[[ $# -ne 2 || ${1:-Unset} == -h ]] && usage

# The SDP instance may come from the environment, but the first positional
# parameter takes precedence when supplied.
export SDP_INSTANCE=${SDP_INSTANCE:-Undefined}
export SDP_INSTANCE=${1:-$SDP_INSTANCE}
if [[ $SDP_INSTANCE == Undefined ]]; then
   echo -e "Usage Error: Instance parameter not supplied."
   usage
fi

# The second parameter is the seed checkpoint (file or directory) copied from
# the commit server.
declare EdgeSeedCheckpoint=${2:-Unset}
if [[ "$EdgeSeedCheckpoint" == Unset ]]; then
   echo -e "Usage Error: EdgeSeedCheckpoint parameter not supplied.  Usage:\n\t${0##*/} <SDP_Instance> <EdgeSeedCheckpoint>\n"
   # Fixed: this message previously said the *Perforce instance* was the second
   # parameter; the second parameter is the edge seed checkpoint.
   echo "You must supply the edge seed checkpoint as the second parameter to this script."
   exit 1
fi

# Load the SDP environment for this instance; defines the standard SDP
# variables referenced below (LOGS, CHECKPOINTS, SERVERID, P4SERVER, P4DBIN,
# OFFLINE_DB, P4ROOT, P4LOG, P4JOURNAL, P4CBIN, P4D_VERSION, etc.).
# shellcheck disable=SC1091
source /p4/common/bin/p4_vars "$SDP_INSTANCE"
# Shared SDP functions used below — log(), die(), check_vars(), set_vars(),
# ckp_running(), ckp_complete(), start_p4d(), stop_p4d(),
# get_offline_journal_num(), mail_log_file() — come from these libraries.
# shellcheck disable=SC1091
source /p4/common/bin/backup_functions.sh
# edge_vars sets $ExcludedTables and $CheckpointTables for the current P4D
# version; both are validated just below.
# shellcheck disable=SC1091
source /p4/common/bin/edge_vars
# log_functions.sh presumably supplies get_old_log_timestamp() used for log
# rotation below — TODO(review): confirm.
# shellcheck disable=SC1091
source /p4/common/bin/log_functions.sh

# Timestamped log file for this run, e.g. recover_edge.20240101-120000.log.
export LOGFILE=
LOGFILE="$LOGS/recover_edge.$(date +'%Y%m%d-%H%M%S').log"

# The LogLink symlink has no timestamp. It points to the most recent log file.
LogLink="$LOGS/recover_edge.log"


# Abort early unless edge_vars defined both table lists; the -k/-K table
# filtering in the phases below depends on them.
[[ -n "$ExcludedTables" && -n "$CheckpointTables" ]] ||\
   die "Values for \$ExcludedTables and/or \$CheckpointTables not defined in $P4CBIN/edge_vars."

######### Start of Script ##########

# Retire any previous "latest log" entry before linking the new one.
# Note: test '-L' before '-e' — a dangling symlink fails the '-e' test, so the
# original order left stale dangling links in place and the 'ln -s' below then
# failed, leaving $LogLink pointing at a nonexistent old log.
if [[ -L "$LogLink" ]]; then
   rm -f "$LogLink"
elif [[ -e "$LogLink" ]]; then
   # If the name that should be a symlink is not a symlink, move it aside before
   # creating the symlink.
   OldLogTimestamp=$(get_old_log_timestamp "$LogLink")
   mv -f "$LogLink" "${LogLink%.log}.${OldLogTimestamp}.log"
fi

# Point LogLink symlink to current log. Use a subshell so the 'cd' doesn't persist.
( cd "$LOGS" && ln -s "${LOGFILE##*/}" "${LogLink##*/}"; )


# Edge checkpoints live in a ServerID-specific directory, e.g.
# /p4/N/checkpoints.edge_syd when SERVERID is p4d_edge_syd.
EdgeCheckpointsDir="${CHECKPOINTS}.${SERVERID#p4d_}"
# Prefix for the Phase 3 dump of edge-local db tables.
EdgeDumpPrefix="$EdgeCheckpointsDir/${P4SERVER}.$(date +'%Y%m%d-%H%M%S').edge_dump"
if [[ -r "$EdgeSeedCheckpoint" ]]; then
   # If the specified edge checkpoint is a directory, use parallel replay options.
   if [[ -d "$EdgeSeedCheckpoint" ]]; then
      DoEdgeSeedReplayParallel=1

      # Intentional lexical string comparison: any P4D_VERSION sorting after
      # "2022.2" (i.e. 2023.1+) is accepted for parallel checkpoint replay.
      # shellcheck disable=SC2072
      [[ "$P4D_VERSION" > "2022.2" ]] ||\
         die "The specifed edge seed checkpoint [$EdgeSeedCheckpoint] is a directory, but P4D version [$P4D_VERSION] is not new enough to handle parallel checkpoint directories. It must be 2023.1+. Aborting."

      # shellcheck disable=SC2072
      # If we're doing parallel checkpoints (because we detected that the specified edge checkpoint is a directory),
      # determine the preferred number of threads from SDP configuration. Default to 4 otherwise.
      # DO_PARALLEL_CHECKPOINTS must be a positive integer to be used as the thread count.
      if [[ -n "${DO_PARALLEL_CHECKPOINTS:-}" && "$DO_PARALLEL_CHECKPOINTS" != "0" && "$DO_PARALLEL_CHECKPOINTS" =~ ^[1-9]{1}[0-9]*$ ]]; then
         Threads="$DO_PARALLEL_CHECKPOINTS"
      else
         Threads=4
      fi
   else
      # A plain seed checkpoint file is replayed serially.
      DoEdgeSeedReplayParallel=0
   fi
else
   die "The specifed edge seed checkpoint [$EdgeSeedCheckpoint] does not exist. Aborting."
fi

echo "Processing. This may take a while depending on checkpoint duration."
echo "Log file is: $LOGFILE"

# Standard SDP pre-flight: validate the environment, derive settings, and mark
# a checkpoint operation as in progress so a scheduled checkpoint does not
# stomp on offline_db while we rebuild it.
check_vars
set_vars
ckp_running

log "Remove offline db"
# Append ('>>') rather than truncate ('>'): log() has already written entries
# to $LOGFILE above, and every other command in this script appends.
rm -f "$OFFLINE_DB"/db.* >> "$LOGFILE" 2>&1

# With -K filter out the various Edge-specific tables to be replaced with 
# current live versions.

log "Phase 1: Recover edge seed from commit server into offline_db."
if [[ "$DoEdgeSeedReplayParallel" -eq 1 ]]; then
   log "Recover checkpoint directory from commit server into offline_db, skipping tables not used on the edge."
   # -jrp replays a parallel checkpoint directory; -N sets the thread count.
   Cmd="$P4DBIN -r $OFFLINE_DB -K $ExcludedTables -N ${Threads:-4} -z -jrp $EdgeSeedCheckpoint"
else
   log "Recover checkpoint file from commit server into offline_db, skipping tables not used on the edge."
   # -z replays the compressed (.gz) seed checkpoint file serially.
   Cmd="$P4DBIN -r $OFFLINE_DB -K $ExcludedTables -z -jr $EdgeSeedCheckpoint"
fi

log "Running: $Cmd"
# Intentionally unquoted: $Cmd relies on word splitting to form argv.
$Cmd >> "$LOGFILE" 2>&1 || die "Failed to recover from $EdgeSeedCheckpoint."

log "Phase 2: Shutdown the edge server."
stop_p4d

log "Phase 3: Create dump of local edge tables."
# With -k (lowercase) we filter to include only edge-specific tables from the edge's live P4ROOT.
Cmd="$P4DBIN -r $P4ROOT -k $CheckpointTables -z -jd $EdgeDumpPrefix"
log "Creating a dump file of the edge specific data from P4ROOT."

log "Running: $Cmd"
$Cmd >> "$LOGFILE" 2>&1 ||\
   die "Failed to dump with this command: $Cmd"

log "Phase 4: Blend edge dump into offline_db (where seed was replayed earlier)."
log "Phase 4: Recover the edge dump into offline_db."
# Phase 3 wrote a compressed dump, hence the .gz suffix here.
Cmd="$P4DBIN -r $OFFLINE_DB -jr ${EdgeDumpPrefix}.gz"
log "Running: $Cmd"

$Cmd >> "$LOGFILE" 2>&1 ||\
   die "Failed to recover from edge dump with this command: $Cmd"

log "Phase 5: Swap Tables and Restart Edge Replication"

# Preserve the old server log, if present, before restarting with fresh state.
if [[ -r "$P4LOG" ]]; then
   Timestamp="$(date +'%Y-%m-%d-%H%M%S')"
   MovedFile="${P4LOG}.moved.${Timestamp}"
   log "Moving P4LOG [$P4LOG] aside to [$MovedFile]."
   mv -f "$P4LOG" "$MovedFile"
else
   log "No P4LOG [$P4LOG] found. Skipping move of P4LOG."
fi

# Likewise move the old journal aside; the server writes a new one on restart.
if [[ -r "$P4JOURNAL" ]]; then
   Timestamp="$(date +'%Y-%m-%d-%H%M%S')"
   MovedFile="${P4JOURNAL}.moved.${Timestamp}"
   log "Moving P4JOURNAL [$P4JOURNAL] aside to [$MovedFile]."
   mv -f "$P4JOURNAL" "$MovedFile"
else
   log "No P4JOURNAL [$P4JOURNAL] found. Skipping move of P4JOURNAL."
fi

log "Reset replication state and clear the P4ROOT folder db files."
# NOTE(review): 'state' and 'rdb.lbr' appear to be p4d replication-state files;
# removing them forces the edge to re-establish replication — confirm.
# shellcheck disable=SC2129
rm -f "$P4ROOT"/db.* >> "$LOGFILE" 2>&1
rm -f "$P4ROOT"/state >> "$LOGFILE" 2>&1
rm -f "$P4ROOT"/rdb.lbr >> "$LOGFILE" 2>&1
rm -f "$P4JOURNAL" >> "$LOGFILE" 2>&1

log "Move the rebuilt database to P4ROOT"
mv "$OFFLINE_DB"/db.* "$P4ROOT"/. >> "$LOGFILE" 2>&1

log "Start the edge server back up."
start_p4d

log "Phase 6: Recreate the offline_db."
# With -K (uppercase), we filter to exclude edge-specific data from the commit.
log "Phase 6A: Load seed from commit server into offline_db."
if [[ "$DoEdgeSeedReplayParallel" -eq 1 ]]; then
   log "Recover checkpoint directory from commit server into offline_db, skipping tables not used on the edge."
   Cmd="$P4DBIN -r $OFFLINE_DB -K $ExcludedTables -N ${Threads:-4} -z -jrp $EdgeSeedCheckpoint"
else
   log "Recover checkpoint file from commit server into offline_db, skipping tables not used on the edge."
   # Fixed: added '-z', matching the identical Phase 1 serial replay of the
   # same compressed (.gz) seed checkpoint; it was missing here.
   Cmd="$P4DBIN -r $OFFLINE_DB -K $ExcludedTables -z -jr $EdgeSeedCheckpoint"
fi
log "Running: $Cmd"
$Cmd >> "$LOGFILE" 2>&1 ||\
   die "Edge recovered OK, but could not replay edge seed into offline_db."

log "Phase 6B: Load local edge dump into offline_db."
# Replay the Phase 3 dump of edge-local tables on top of the filtered seed.
Cmd="$P4DBIN -r $OFFLINE_DB -jr ${EdgeDumpPrefix}.gz"

log "Running: $Cmd"
$Cmd >> "$LOGFILE" 2>&1 ||\
   die "Edge recovered OK, but could not replay edge tables into offline_db."

# Mark the rebuilt offline_db as usable for subsequent SDP scripts.
echo "Offline db file restored successfully." > "${OFFLINE_DB}/offline_db_usable.txt"

log "Phase 7: Create a new edge checkpoint from offline_db."
# Sets $OFFLINEJNLNUM from the offline_db journal counter.
get_offline_journal_num

# The new edge checkpoint is numbered one past the current offline journal.
NewEdgeDumpPrefix="$EdgeCheckpointsDir/${P4SERVER}.${SERVERID#p4d_}.ckp.$((OFFLINEJNLNUM+1))"

# CreateParallelCheckpoint is defined in set_vars in backup_functions.sh.
# shellcheck disable=SC2154
if [[ "$CreateParallelCheckpoint" -eq 1 ]]; then
   # CreateMultifileParallelCheckpoint is defined in set_vars in backup_functions.sh.
   # shellcheck disable=SC2154
   if [[ "$CreateMultifileParallelCheckpoint" -eq 1 ]]; then
      # -jdpm: parallel checkpoint, presumably multiple files per db table
      # (per the variable name) — TODO(review): confirm against p4d docs.
      Cmd="$P4DBIN -r $OFFLINE_DB -z -N ${Threads:-4} -jdpm $NewEdgeDumpPrefix"
      log "Creating a dump directory of the edge specific data from offline_db."
   else
      # -jdp: parallel checkpoint directory.
      Cmd="$P4DBIN -r $OFFLINE_DB -z -N ${Threads:-4} -jdp $NewEdgeDumpPrefix"
      log "Creating a dump directory of the edge specific data from offline_db."
   fi
else
   # Serial compressed checkpoint file.
   Cmd="$P4DBIN -r $OFFLINE_DB -z -jd $NewEdgeDumpPrefix"
   log "Creating a dump file of the edge specific data from offline_db."
fi

log "Running: $Cmd"
$Cmd >> "$LOGFILE" 2>&1 ||\
   die "Edge recovered OK, but could not create new edge seed checkpoint from offline_db."

# Clear the "checkpoint in progress" marker and wrap up.
ckp_complete
log "End $P4SERVER Recover Edge"
mail_log_file "$HOSTNAME $P4SERVER Recover Edge log."
# Change User Description Committed
#20 30848 C. Thomas Tyler Semantically separated DoParallelCheckpoints into two variables,
CreateParallelCheckpoint and LoadParallelCheckpoint, which are
logically independent.
#19 30654 C. Thomas Tyler Revised logic replaying checkpoints to select the most recent checkpoint
regardless of whether it is parallel or not. This logic now applies
consistently across scripts (even some not in this changelist that   
call functions in backup_functions.sh), such as sync_replica.sh,
sync_replica.sh, load_checkpoint.sh, recover_edge.sh, etc.

The edge_dump.sh script now creates parallel checkpoints if parallel
checkpoints are configured.

The load_checkpoint.sh now reliably detects the most recent checkpoint
when '-latest' is used reliably, serial or parallel.  This script now
also finds checkpoints for a standby of an edge.

Also fixed bug cleaning up old *.OK files from earlier parallel
checkpoints.

The recover_edge.sh script similarly detects the latest checkpoint
correctly, serial or parallel.

This change was tested with a new regression test suite that operates
in a Battle School Lab environment, allowing for more sophisticated
testing of sequences of operations.

#review-30655
#18 30638 C. Thomas Tyler Tweaked log name.
#17 30636 C. Thomas Tyler Incrementally improved log handling in edge_dump.sh, recover_edge.sh, and
recreate_offline_db.sh, to use the mechanism pioneered in ccheck.sh.

This is a precursor to a coming standardization of log handling in SDP.

As before, each script creates a log in $LOGS named <script_log>.<timestamp>.log.
With this change, symlinks in the LOGS directory are created so that
<script_log>.log (i.e. sans the timestamp) points to the timestamped log
file.

So $LOGS/recover_edge.log points to $LOGS/recover_edge.<most_recent_timestamp>.log.

Each run of the script creates a new timestamped log, and updates the symlink
very early in processing.

This change is better for users and also simplifies automated
testing.

#review-30637
#16 28175 C. Thomas Tyler The recover_edge.sh script now generates the offline_db_usable.txt file.
Added more error handling.

#review-28176
#15 27178 ashaikh The recover_edge.sh SDP script errors out because a variable is accessed before it is declared.

In this case, the following line throws an error:

declare EdgeCheckpointsDir="${CHECKPOINTS}.${SERVERID#p4d_}"

Error:
/p4/common/bin/recover_edge.sh: line 27: CHECKPOINTS: unbound variable

Opened a job regarding this error: https://swarm.workshop.perforce.com/jobs/SDP-579
#14 26492 C. Thomas Tyler Updated recover_edge.sh to use start_p4d() and stop_p4d() functions.
#13 25949 C. Thomas Tyler Fixed typo in variable name.
#12 25938 C. Thomas Tyler Fixed bug in safety check.
#11 25920 C. Thomas Tyler Added new 'edge_vars' file to dynamically set list of edge-specific
db tables based on current P4D version.

Updated edge_dump.sh, recover_edge.sh, and load_checkpoint.sh
to use the new edge_vars file.

Made edge_dump.sh and recover_edge.sh shellcheck v0.6.0 compliant,
along with load_checkpoint.sh.
#10 23297 C. Thomas Tyler Added safety checks to avoid running commands that will certainly fail
in upgrade.sh.

Generally, /p4/common/bin will be the same on all hosts in a Helix topolgy.
However, on any given machine, the /p4/<N>/bin/<EXE>_<N>_init scripts
should exist only for executables that run on that machine.

This change to upgrade.sh should work on machines even where only a
proxy or broker runs.  Also, it will not generate errors in cases
where there is, say, a p4p_N_bin symlink in /p4/common/bin but no
/p4/N/bin/p4p_N_init script, which will a common situation since
/p4/common/bin will contain all executables used anywhere, while
/p4/N/bin is host-specific.

Also made cosmetic fixes and style convergence change.

In dump_edge.sh and recover_edge_dump.sh, just fixed cosmetic typos.
#9 23266 C. Thomas Tyler Fixes and Enhancements:
* Enabled daily_checkpoint.sh operate on edge servers, to
keep /p4/N/offline_db current on those hosts for site-local
recovery w/o requiring a site-local replica (though having
a site-local replica can still be useful).
* Disabled live_checkpoint.sh for edge servers.
* More fully support topologies using edge severs, in both
geographically distributed and horizaontal scaling "wokspace
server" solutions.
* Fix broken EDGESERVER value definition.
* Modified name of SDP counter that gets set when a checkpoint is taken
to incorporate ServerID, so now the counter name will look like
lastSDPCheckpoint.master.1, or lastSDPCheckpoint.p4d_edge_sfo, rather
than just lastSDPCheckpoint.

There will be multiple such counters in a topology that uses edge
servers, and/or which takes checkpoints on replicas.

* Added comments for all functions.

For the master server, journalPrefix remains:
/p4/N/checkpoints/p4_N

The /p4/N/checkpoints is reserved for writing by the
master/commit server only.

For non-standby (possibly filtered) replicas and edge serves,
journalPrefix is:
/p4/N/checkpoints.<ShortServerID>/p4_N.<ShortServerID>

Here, ShortServerID is just the ServerID with the 'p4d_' prefix
trimmed, since it is redundant in this context.  See mkrep.sh,
which enshines a ServerID (server spec) naming standard, with
values like 'p4d_fr_bos' (forwarding replica in Boston) and
p4d_edge_blr (Edge server in Bangalore).  So the journalPrefix
for the p4d_edge_bos replica would be:
/p4/N/checkpoints.edge_bos/p4_N.edge_bos

For "standby" (aka journalcopy) replicas, journalPrefix is set
to /p4/N/journals.rep. which is written to the $LOGS volume, due
to the nature of standby replicas using journalPrefix to write
active server logs to pre-rotated journals.

Some take-away to be updated in docs:
* The /p4/N/checkpoints folder must be reserved for checkpoints that
originate on the master. It should be safe to rsync this folder
(with --delete if desired) to any replica or edge server.  This is
consistent with the current SDP.
* I want to change 'journals.rep' to 'checkpoints.<ShortServerID>'
for non-standby replicas, to ensure that checkpoints and journals
taken on those hosts are written to a volume where they are backed
up.
* In sites with multiple edge serves, some sharing achive files
('workspace servers'), multiple edge servers will share the same
SAN. So we one checkpoints dir per ServerID, and we want that
dir to be on the /hxdepots volume.

Note that the journalPrefix for replicas was a fixed /p4/N/journals.rep.
This was on the /hxlogs volume - a presumably fast-for-writes volume,
but typically NOT backed up and not very large. This change puts it
under /p4/N/checkpoints.* for edge servers and non-standby replicas,
but ensures other replica types and edge servers can generate
checkpoints to a location that is backed up and has plenty of storage
capacity.  For standby replicas only (which cannot be filtered),
the journalPrefix remains /p4/N/journals.rep on the /hxlogs volume.
#8 22889 Russell C. Jackson (Rusty) Enhanced to mark when it is running so that a checkpoint doesn't stomp on the
offline_db, and also made it just go ahead and create the correct checkpoint
name.
#7 21280 Russell C. Jackson (Rusty) Added standard logging and use of SDP_INSTANCE.
#6 19113 Russell C. Jackson (Rusty) Changed name of daily_backup.sh to daily_checkpoint.sh
Changed name of weekly_backup.sh to recreate_db_checkpoint.sh

Updated crontabs with new names, and changed to run recreate_db_checkpoint
on the 1st Sat. of Jan. and July. For most companies, this is a better
practice than recreating weekly per discussion with Anton.

Remove solaris crontab since Solaris is pretty much dead, and we don't test on it.

Updated docs to reflect name changes, and did a little clean other other sections
while I was in there.
#5 17293 Robert Cowham Clarifications in comments - no functional change.
#4 17219 C. Thomas Tyler Routine Merge Down to dev from main.
#3 16029 C. Thomas Tyler Routine merge to dev from main using:
p4 merge -b perforce_software-sdp-dev
#2 15778 C. Thomas Tyler Routine Merge Down to dev from main.
#1 15753 C. Thomas Tyler Routine Merge Down to dev from main.
//guest/perforce_software/sdp/main/Server/Unix/p4/common/bin/recover_edge.sh
#1 15716 Russell C. Jackson (Rusty) Script for rebuilding an Edge server.