#!/bin/bash
#------------------------------------------------------------------------------
### WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
### THIS SCRIPT SHOULD BE USED WITH GREAT CAUTION.
#------------------------------------------------------------------------------
# Copyright and license info is available in the LICENSE file included with
# the Server Deployment Package (SDP), and also available online:
# https://swarm.workshop.perforce.com/projects/perforce-software-sdp/view/main/LICENSE
#------------------------------------------------------------------------------
set -u

# To see documentation for this script, run:
#   load_checkpoint.sh -man

#==============================================================================
# Declarations and Environment

declare ThisScript=${0##*/}
declare CmdLine="$0 $*"
declare ThisUser=
declare Version=3.2.5
declare -i SilentMode=0
declare -i GenSSLCertsIfNeeded=1
declare -i LicenseFileNeeded=1
declare -i StartWithoutLicenseFile=0
declare -i GetLatestCheckpoint=0
declare -i GetLatestJournals=0
declare -i DoVerify=0
declare -i DoSchemaUpgrade=1
declare -i ReplayJournalsOnly=0
declare -i LoadLiveJournal=0
declare -i ErrorCount=0
declare -i WarningCount=0
declare -i Debug=${SDP_DEBUG:-0}
declare DefaultVerifyOptions="-o MISSING"
declare DefaultVerifyDelay="600"
declare VerifyOptions=
declare VerifyDelay=
declare -i i=0
declare -i j=0
declare Checkpoint=
declare ParallelCheckpointDir=
declare ShortServerID=
declare TargetServerID=
declare CheckpointMD5=
declare -i DoParallelCheckpoints=
declare Threads=4
declare JournalCounter=
declare SDPInstance=
declare SDPInstanceVarsFile=
declare OfflineDB=
declare OfflineDBUsableFile=
declare CaseMode=
declare CaseFlag=
declare CompressFlag=
declare P4BrokerCfg=
declare SetServerID=
declare ReplicaTypeTag=
declare InferredReplicaTypeTag=
declare ThisHost=${HOSTNAME%%.*}
declare P4DInitScript=
declare P4DServiceName=
declare -i UseSystemdForP4D=0
declare P4BrokerInitScript=
declare P4BrokerServiceName=
declare -i UseSystemdForP4Broker=0
declare Datestamp=
declare INode1=
declare INode2=
declare -i JournalCount=0
declare -a Journals
declare MaxStopDelay=
declare -i StopVerified=
declare H1="=============================================================================="
declare H2="------------------------------------------------------------------------------"
declare Log=
declare DBSizes=
declare DiskSpaceAvail=
declare CheckpointsDir=
declare JournalPrefix=

# The Safety Factor is the factor by which the size of the current db.* files
# is multiplied when comparing against available disk space. Using '-F 0'
# bypasses the disk space check.
declare SafetyFactor=1.2

#==============================================================================
# Local Functions

# Output helpers. All of them write to stdout via 'echo -e', so backslash
# escapes such as '\n' in the message text are interpreted.
#   msg     - display a message followed by a newline.
#   msgn    - display a message with no trailing newline (used for prompts).
#   dbg     - display a message only when Debug is non-zero.
#   errmsg  - display an error and increment the global ErrorCount.
#   warnmsg - display a warning and increment the global WarningCount.
#   bail    - display an error, then exit with ErrorCount as the exit status.
function msg () { echo -e "$*"; }
function msgn () { echo -e -n "$*"; }
function dbg () { [[ "$Debug" -eq 0 ]] || msg "DEBUG: $*"; }
function errmsg () { msg "\\nError: ${1:-Unknown Error}\\n"; ErrorCount+=1; }
function warnmsg () { msg "\\nWarning: ${1:-Unknown Warning}\\n"; WarningCount+=1; }
function bail () { errmsg "${1:-Unknown Error}"; exit "$ErrorCount"; }

#------------------------------------------------------------------------------
# Infer the replica type from the ServerID if possible (if the SDP Server Spec
# Naming Standard was followed).
#
# Function: infer_replica_type_tag ($serverID)
#
# Input:
# $1 - serverID. If empty or not provided, nothing is printed.
#
# Globals read: P4MASTER_ID and SERVERID (not defined in this script; under
# 'set -u' both must already be defined when this function is called).
#
# Output: The inferred replica type tag is written to stdout.
function infer_replica_type_tag () {
   local serverID=${1:-}
   local inferredReplicaTypeTag=

   [[ -n "$serverID" ]] || return

   # Short circuit if we're on the commit server. Note that this compares the
   # SERVERID variable from the environment, not the serverID parameter.
   # shellcheck disable=SC2153
   if [[ "$P4MASTER_ID" == "$SERVERID" ]]; then
      echo commit
      return
   fi

   # To get the type tag from the ServerID, trim the 'p4d_' prefix ...
   inferredReplicaTypeTag=${serverID#p4d_}

   # Next, remove numbers, as the naming standard allows for things like
   # p4d_ha2_nyc (2nd HA in NYC) or p4d_edge2_syd, and we don't
   # want the numbers in the type tag. The pattern is a glob, so this removes
   # everything from the first digit to the end of the string.
   inferredReplicaTypeTag=${inferredReplicaTypeTag/[0-9]*/}

   # Then figure out from what's left of the string. Order matters: the more
   # specific prefixes must be tested before the shorter ones.
   case "$inferredReplicaTypeTag" in
      (fs_edge*) inferredReplicaTypeTag=fs_edge;;
      (fsm_edge*) inferredReplicaTypeTag=fsm_edge;;
      (ha_edge*) inferredReplicaTypeTag=ha_edge;;
      (ham_edge*) inferredReplicaTypeTag=ham_edge;;
      (edge*) inferredReplicaTypeTag=edge;;
      (ffr*) inferredReplicaTypeTag=ffr;;
      (fr*) inferredReplicaTypeTag=fr;;

      # If we haven't matched from the above list, trim the
      # _<SiteTag> suffix to get the type tag.
      (*) inferredReplicaTypeTag=${inferredReplicaTypeTag%%_*};;
   esac

   echo "$inferredReplicaTypeTag"
}

#------------------------------------------------------------------------------
# Function: user_confirmation_and_warning ($interactive)
#
# Display a warning banner about the destructive nature of this script, and
# then require interactive confirmation unless it has been bypassed.
#
# Input:
# $1 - interactive. Pass 0 to bypass the prompt (i.e. '-y' was given). The
#      default is 1, meaning a prompt is required.
#
# Returns: 0 if processing may proceed, 1 if confirmation was declined, could
# not be obtained because stdin is not a terminal, or stdin reached EOF.
function user_confirmation_and_warning () {
   local interactive=${1:-1}
   local input=""

   msg "\\n
==============================================================================
WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING\\n
This script will remove files in P4ROOT and the offline_db folder, including
db.* files, state* files, and others. It is STRONGLY recommended that these
files be preserved first unless it is absolutely certain they will never
potentially be useful in any recovery scenario. See the '-R' option.

This is being run on host $HOSTNAME at $(date).

WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
==============================================================================
\\n\\n"

   if [[ $interactive -eq 0 ]]; then
      msg "Bypassing interactive confirmation due to -y on command line."
      return 0
   fi

   # Without a terminal on stdin there is nobody to answer the prompt.
   if [[ ! -t 0 ]]; then
      errmsg "Interactive confirmation not bypassed with '-y' and /dev/null provided as input. Aborting."
      return 1
   fi

   # Keep prompting until a recognized answer is given.
   while [[ -z "$input" ]]; do
      msgn "\\nEnter Y to proceed, N to stop [y/Y/n/N]: "

      # If read fails (EOF on stdin), no answer can ever arrive. Treat that as
      # a declined confirmation rather than prompting forever.
      if ! read -r -e input; then
         msg "Confirmation to proceed not received."
         return 1
      fi

      if [[ "${input^^}" == "Y" ]]; then
         # input is now non-empty, so the loop condition ends the loop.
         continue
      elif [[ "${input^^}" == "N" ]]; then
         msg "Confirmation to proceed not received."
         return 1
      else
         # Unrecognized answer: clear it so the prompt is repeated.
         input=
      fi
   done

   return 0
}

#------------------------------------------------------------------------------
# Function: terminate
#
# Trap handler for EXIT, SIGINT and SIGTERM. Displays the log file location
# and exits with ErrorCount as the exit status.
# shellcheck disable=SC2317
function terminate
{
   # Disable signal trapping.
   trap - EXIT SIGINT SIGTERM

   dbg "$ThisScript: EXITCODE: $ErrorCount"
   msg "\\nLog is: $Log\\n${H1}"

   # With the trap removed, exit.
   exit "$ErrorCount"
}

#------------------------------------------------------------------------------
# Function: usage (required function)
#
# Input:
# $1 - style, either -h (for short form) or -man (for man-page like format).
# The default is -h.
#
# $2 - error message (optional). Specify this if usage() is called due to
# user error, in which case the given message displayed first, followed by the
# standard usage message (short or long depending on $1). If displaying an
# error, usually $1 should be -h so that the longer usage message doesn't
# obscure the error message.
#
# This function always exits with status 2; it does not return.
#
# Sample Usage:
# usage
# usage -h
# usage -man
# usage -h "Incorrect command line usage."
#------------------------------------------------------------------------------
function usage
{
   declare style=${1:--h}
   declare errorMessage=${2:-Unset}

   if [[ "$errorMessage" != Unset ]]; then
      msg "\\n\\nUsage Error:\\n\\n$errorMessage\\n\\n"
   fi

   msg "USAGE for $ThisScript v$Version:

$ThisScript {<checkpoint> [<jnl.1> <jnl.2> ...] | -latest | -latest_jnls | -jo <jnl.1> [<jnl.2> ...] | -jo_latest } [-R|-F <SafetyFactor>] [-i <instance>] [-s <ServerID>] [-t <Type>] [-no_start | [-no_xu] [-verify {default|\"Verify Options\"} [-delay <delay>]]] [-c] [-l] [-r] [-b] [-y] [-L <log>] [-si] [-d|-D]

or

$ThisScript [-h|-man]
"
   if [[ $style == -man ]]; then
      msg "
DESCRIPTION:
	This script can load a specified checkpoint and/or numbered journals
	into P4ROOT (/p4/N/root) and/or /p4/N/offline_db (where 'N' is the
	SDP instance name).

	It supports a variety of use cases for replaying checkpoints and
	journals, including:
	* Seeding or Reseeding a replica or edge server.
	* Loading a checkpoint on the commit, e.g. in a recovery scenario.

	Checkpoints and/or journals can be specified in one of two ways: they
	can be specified as parameters to this script, or they can be
	determined by this script if they appear in the SDP standard location
	according to the journalPrefix standard.

	The key methods are:

	* Specify the path to the checkpoint to replay. The checkpoint can be
	in the form of a compressed .gz file, an uncompressed checkpoint file,
	or a directory (for parallel checkpoints).

	* Use '-latest' to have this script find the latest checkpoint
	available. For a commit server, /p4/N/checkpoints/p4_N is searched.
	For other servers, their journalPrefix is used. The timestamp on the
	latest available *.md5 file is used to determine what checkpoint is the
	latest available, regardless of checkpoint form (compressed or
	uncompressed file, or a directory for parallel checkpoints).

	* Use '-latest_jnls' to find the latest checkpoint as with '-latest',
	and then also find and replay any available subsequent numbered
	journals.

	* Use '-jo' (\"journal only\") to specify path(s) to one or more
	numbered journals to be supplied as parameters to this script. Journal
	files provided may be compressed or uncompressed.

	* Use '-jo_latest' to find any numbered journals available to be
	replayed based on the journal counter of the data set.

	At the start of processing, preflight checks are done. Preflight
	checks include:

	* The specified checkpoint and corresponding *.md5 file must exist.
	* The specified checkpoint can be a compressed or uncompressed file or
	a directory (for parallel checkpoints).
	* All journal files to replay (if any are specified) must exist.
	* The \$P4ROOT/server.id file must exist, unless '-s' is specified.
	* If the \$P4ROOT/server.id file exists and '-s' is specified, the
	values must match.
	* The \$P4ROOT/license file must exist, unless '-l' is specified or if
	the replica type does not require a license (such as an edge server).
	* The SDP structure and key files must exist.
	* Disk space checks are done to attempt to determine if sufficient
	space is available to replay the checkpoint.

	If the preflight passes, the p4d_N service is shutdown. The p4broker_N
	service is shutdown if it is configured.

	If a P4LOG file exists, it is moved aside so there is a fresh p4d
	server log corresponding to operation after the checkpoint load.

	If a P4JOURNAL file exists, it is moved aside as the old journal data
	is no longer relevant after a checkpoint replay. (Exception: If the
	P4JOURNAL is specified in a list of journals to replay, then it is not
	moved aside).

	Next, any existing state* files in P4ROOT are removed.

	Next, any existing database files in P4ROOT are preserved and moved
	aside, unless '-R' is specified to remove them.

	Next, the specified checkpoint is loaded. Upon successful completion,
	'p4d -xu' is executed (by default) to help ensure the service can be
	started with the p4d binary used to replay the checkpoint. Then the
	Helix Core service is started with the current p4d binary.

	If the server to be started is a replica, the serviceUser configured
	for the replica is logged into the P4TARGET server. Any needed
	'p4 trust' and 'p4 login' commands are done to enable replication.

	Note that this part of the processing will fail if the correct super
	user password is not stored in the standard SDP password file,
	/p4/common/config/.p4passwd.p4_N.admin

	After starting the server, a local 'p4 trust' is done if needed, and
	then a 'p4login -service -v' and 'p4login -v'.

	By default, the p4d_N service is started, but the p4broker_N service
	is not. Specify '-b' to restart both services.

	Finally, the offline_db is rebuilt using the same specified checkpoint
	and journals.

ARGUMENTS AND OPTIONS:
 <checkpoint>
	Specify the path to the checkpoint file or directory to load.

	Exactly one checkpoint must be specified.

	If a checkpoint file is specified, a serial checkpoint replay will be
	done. If a checkpoint directory is specified, a parallel replay will
	be done using the individual files in the directory.

	For checkpoint files: The file may be a compressed or uncompressed
	checkpoint, and it may be a case sensitive or case-insensitive
	checkpoint. The checkpoint file must have a corresponding *.md5
	checksum file in the same directory, with one of two name variations:
	If the checkpoint file is /somewhere/foo.gz, the checksum file may be
	named /somewhere/foo.gz.md5 or /somewhere/foo.md5.

	For checkpoint directories:

	This option is required unless the '-latest' option is used.

 <jnl.1> [<jnl.2> ...]
	Specify the path to the one or more journal files to replay after the
	checkpoint, in the correct sequence order.

 -latest
	Specify this as an alternative to providing a specific checkpoint file
	or directory. The script will then search for the latest *.md5 file in
	the standard checkpoints directory and use that to replay. The
	standard checkpoints directory search is one of the following:

	Commit servers: /p4/${SDPInstance:-N}/checkpoints
	Standby servers: /p4/${SDPInstance:-N}/checkpoints
	Edge servers: /p4/${SDPInstance:-N}/checkpoints.<ShortServerID>

	For standby servers that target an edge server, where the ServerID
	starts with p4d_ha_edge, p4d_ham_edge, p4d_fs_edge, or p4d_fsm_edge,
	the directory for the target edge server is searched. (If NFS sharing,
	this directory will naturally exist. Otherwise, the directory should
	be created and populated as needed on the standby of the edge for
	seeding with checkpoints from the edge.)

	The most recent *.md5 file found in the standard checkpoints directory
	determines which checkpoint to load. The actual checkpoint can be a
	file (gzipped or not) or directory (for parallel checkpoints).

	This option is mutually exclusive with '-latest_jnls'.

 -latest_jnls
	This option is similar to '-latest'. However, with '-latest_jnls', in
	addition to replaying the latest checkpoint, any subsequent numbered
	journals available in the standard checkpoints directory are also
	replayed.

	This option will only replay numbered journals, not the live P4JOURNAL
	file. However, if the \$P4JOURNAL is provided, then it will be
	replayed after all available numbered journals are replayed.

	This option is mutually exclusive with '-latest'.

	If used with '-jo', where the checkpoint and possibly some numbered
	journals will already have been replayed into P4ROOT, then the meaning
	of this option changes. It will replay needed numbered journals up to
	the latest available, so long as those journals appear in the standard
	checkpoints directory with the usual naming convention. With this
	option, the journals needed are calculated based on the journal
	counter stored in database in the P4ROOT dir.

 -R	Specify '-R' to remove db.* files in P4ROOT rather than moving them
	aside.

	By default, databases are preserved for possible future investigation.
	A folder named 'MovedDBs.<datestamp>' is created under the P4ROOT
	directory, and databases are moved there.

	Keeping an extra set of databases requires sufficient disk space to
	hold the extra set of db.* files.

	If '-R' is specified, old databases in P4ROOT are removed, along with
	state* and other files, and the server.locks directory.

 -F <SafetyFactor>
	When replacing an existing set of db.* files, a safety factor is used.
	This is simply the factor by which the size of pre-existing databases
	is multiplied when comparing against available disk space.

	Specify '-F 0' to disable the safety factor check.

	The disk space safety check is only meaningful if P4ROOT was
	previously populated with a full set of data.

	Specifying a number greater than 1, say 1.2 (the default) gives more
	breathing room. Specifying a value lower than 1, say 0.95, may be OK
	if you are certain the expanded-from-a-checkpoint db.* files are
	significantly smaller than the size of the prior set of db.* files.

	This option is mutually exclusive with '-R'. If '-R' is used,
	databases are removed, and there is no need to calculate disk space.

 -i <instance>
	Specify the SDP instance. This can be omitted if SDP_INSTANCE is
	already defined.

 -s <ServerID>
	Specify the ServerID. This value is written into \$P4ROOT/server.id
	file.

	If no \$P4ROOT/server.id file exists, this flag is required.

	If the \$P4ROOT/server.id file exists, this argument is not needed. If
	this '-s <ServerID>' is given and a \$P4ROOT/server.id file exists,
	the value in the file must match the value specified with this
	argument.

 -t <Type>
	Specify the replica type tag if the checkpoint to be loaded is for an
	edge server or replica. The set of valid values for the replica type
	tag are defined in the documentation for mkrep.sh. See: mkrep.sh -man

	If the type is specified, the '-s <ServerID>' is required.

	If the SDP Server Spec Naming Standard is followed, the ServerID
	specified with '-s' will start with 'p4d_'. In that case, the value
	for '-t' is inferred, and '-t' is not required.

	If the type is specified or inferred, certain behaviors change based
	on the type:
	* If the type is edge, only the correct edge-specific subset of
	database tables are loaded.
	* The P4ROOT/license file check is suppressed unless the type is ha,
	ham, fs, or fsm (standby replicas usable with 'p4 failover').

	Do not use this '-t <Type>' option if the checkpoint being loaded is
	for a commit server.

	For an edge server, an edge seed checkpoint created with edge_dump.sh
	must be used if the edge is filtered, e.g. if any of the *DataFilter
	fields in the server spec are used. If the edge server is not filtered
	by means other than being an edge server (for which certain tables are
	filtered by nature), a standard full checkpoint from the commit can be
	used.

	For a filtered forwarding replica, a proper seed checkpoint must be
	loaded. This can be created on the commit using key options to p4d,
	including '-P <ServerID> -jd <SeedCkp>' on the commit (possibly using
	the 'offline_db' to avoid downtime, similar to how edge_dump.sh works
	for edge servers).

	WARNING: While this script is useful for seeding a new edge server,
	this script is NOT to be used for recovering or reseeding an existing
	edge server, because all edge-local database tables (mostly workspace
	data) would be lost. To recover an existing edge server, see the
	recover_edge.sh script.

	Warning: If this option is specified with the incorrect type for the
	checkpoint specified, results will be unpredictable.

 -verify default [-delay <delay>]
 -verify \"Verify Options\" [-delay <delay>]
	Specify '-verify' to initiate a call to 'p4verify.sh' after the server
	is online. On a replica, this can be useful to cause the server to
	pull missing archive files from its P4TARGET server. If this
	load_checkpoint.sh script is used in a recovery situation for a commit
	server, this '-verify' option can be used to discover if archive files
	are missing after the metadata is recovered.

	The 'p4verify.sh' script has a rich set of options. See
	'p4verify.sh -man' for more info. The options to pass to p4verify.sh
	can be passed in a quoted list, or '-verify default' can be used to
	indicate these default options:

	$DefaultVerifyOptions

	By default, a fast verify is used if the p4d version is new enough
	(2021.1+). See 'p4verify.sh -man' for more information, specifically
	the description of the '-o MISSING' option.

	In all cases, p4verify.sh is invoked as a background process; this
	load_checkpoint.sh script does not wait for it to complete. The
	p4verify.sh script will email as per normal when it completes.

	The optional delay option specifies how long to wait until kicking off
	the p4verify.sh command, in seconds. The default is
	$DefaultVerifyDelay seconds. This is intended to give the replica time
	to get caught up with metadata before the archive pulls are scheduled.
	The delay is a workaround for job079842.

	This option cannot be used with '-no_start'.

 -c	Specify that SSL certificates are required, and not to be generated
	with 'p4d_N -Gc'.

	By default, if '-c' is not supplied and SSL certs are not available,
	certs are generated automatically with 'p4d_N -Gc'.

 -l	Specify that the server is to start without a license file. By
	default, if there is no \$P4ROOT/license file, this script will abort.
	Note that if '-l' is specified and a license file is actually needed,
	the attempt this script makes to start the server after loading the
	checkpoint will fail.

	If '-t <type>' is specified, the license check is skipped unless the
	type is 'ha', 'ham', 'fs', or 'fsm'. Replicas that are potential
	targets for a 'p4 failover' need a license file for a failover to
	work.

 -r	Specify '-r' to replay only to P4ROOT. By default, this script replays
	both to P4ROOT and the offline_db.

 -no_start
	Specify '-no_start' to avoid starting the p4d service after loading
	the checkpoint.

	This option cannot be used with '-verify'.

 -no_xu
	Specify '-no_xu' to skip the 'p4d -xu' step that upgrades the database
	schema.

	By default, a 'p4d -xu' is done to help ensure the service can be
	started with the current p4d binary after the checkpoint is replayed.

	If the p4d binary used to replay the checkpoint is a newer major
	version than the one used to create the checkpoint, the service will
	not start after the replay until the 'p4d -xu' step is done.

	If this '-no_xu' option is used and the p4d binary is a newer major
	version, have a plan to get the 'p4d -xu' done before the service is
	started.

	In EXAMPLES below, see the example titled \"Multi Pass Replay of
	Checkpoints and Journals\" for an example of using this option as part
	of a migration procedure.

 -jo <jnl.1> [<jnl.2> ...]
	Specify '-jo' to replay only one or more numbered journals without
	first replaying a full checkpoint. With this option, the cleanup that
	normally occurs before the replay is disabled. The db.* and state*
	files in P4ROOT, as well as P4LOG and P4JOURNAL files, etc. are left
	in place.

	With '-jo', the paths to journal files must be specified. This option
	is mutually exclusive with the similar option '-jo_latest'.

	This option implies '-r'.

 -jo_latest
	Specify '-jo_latest' to replay only one or more numbered journals
	without first replaying a full checkpoint. With this option, the
	cleanup that normally occurs before the replay is disabled. The db.*
	and state* files in P4ROOT, as well as P4LOG and P4JOURNAL files, etc.
	are left in place.

	With '-jo_latest', numbered journals to replay are calculated and
	determined, not specified as parameters. This option is mutually
	exclusive with the similar option '-jo'.

	This option implies '-r'.

 -b	Specify '-b' to start a p4broker process (if configured). By default
	the p4d process is started after loading the checkpoint, but the
	p4broker process is not. This can be useful to ensure the human
	administrator has an opportunity to do sanity checks before enabling
	the broker to allow access by end users (if the broker is deployed for
	this usage).

 -y	Use the '-y' flag to bypass an interactive warning and confirmation
	prompt.

 -L <log>
	Specify the path to a log file. By default, all output (stdout and
	stderr) goes to:

	/p4/<instance>/logs/${ThisScript%.sh}.<timestamp>.log

	NOTE: This script is self-logging. That is, output displayed on the
	screen is simultaneously captured in the log file. Do not run this
	script with redirection operators like '> log' or '2>&1', and do not
	use 'tee'.

 -si	Operate silently. All output (stdout and stderr) is redirected to the
	log only; no output appears on the terminal.

 -d	Set debugging verbosity.

 -D	Extreme debugging verbosity using bash 'set -x' mode.

HELP OPTIONS:
 -h	Display short help message
 -man	Display man-style help message

USAGE TIP:
	All the non-interactive examples below illustrate the practice of
	using redirects to create an extra log file named 'load.log' in the
	\$LOGS directory for the instance. This load.log file is identical to,
	and in addition to, the standard timestamped log generated by this
	script. The intent of this practice is to make it easier to find the
	log for the last checkpoint loaded on any given server machine. This
	convention is only useful if used consistently.

	Several examples below illustrate the instance option, '-i' option to
	specify the SDP instance. This is optional and can safely be omitted
	in an environment where the standard SDP shell environment is sourced
	on login, and where there is only a single instance on the server
	machine.

EXAMPLES:
	EXAMPLE 1: Non-interactive Usage

	Non-interactive usage (bash syntax) to load a checkpoint:

	nohup ${P4CBIN:-/p4/common/bin}/load_checkpoint.sh /p4/1/checkpoints/p4_1.ckp.4025.gz -i 1 -y < /dev/null > /p4/1/logs/load.log 2>&1 &

	Then, monitor with:
	tail -f \$(ls -t \$LOGS/load_checkpoint.*.log|head -1)\\n

	EXAMPLE 2: Checkpoint Load then Verify, for the SDP Instance alpha.

	Non-interactive usage (bash syntax) to load a checkpoint followed by a
	full verify of recent archives files only with other options passed to
	p4verify.sh:

	nohup ${P4CBIN:-/p4/common/bin}/load_checkpoint.sh /p4/alpha/checkpoints/p4_alpha.ckp.95442.gz -i alpha -verify \"-recent -nu -ns\" -y < /dev/null > /p4/alpha/logs/load.log 2>&1 &

	EXAMPLE 3: Load Checkpoint and Journals

	Non-interactive usage (bash syntax) to load a checkpoint and
	subsequent journals:

	nohup ${P4CBIN:-/p4/common/bin}/load_checkpoint.sh /p4/1/checkpoints/p4_1.ckp.4025.gz /p4/1/checkpoints/p4_1.jnl.4025 /p4/1/checkpoints/p4_1.jnl.4026 -i 1 -y < /dev/null > /p4/1/logs/load.log 2>&1 &

	Then, monitor with:
	tail -f \$(ls -t \$LOGS/load_checkpoint.*.log|head -1)\\n

	EXAMPLE 4: Interactive usage.

	Interactive usage to load a checkpoint with no license file.

	${P4CBIN:-/p4/common/bin}/load_checkpoint.sh /p4/1/checkpoints/p4_1.ckp.4025.gz -i 1 -l

	With interactive usage, logging still occurs; all output to the screen
	is captured.

	Note that non-interactive usage with nohup is recommended for
	checkpoints with a long replay duration, to make operation more
	reliable in event of a shell session disconnect. Alternately, running
	interactively in a 'screen' session (if 'screen' is available)
	provides similar protection against shell session disconnects.

	EXAMPLE 5: Seed New Edge

	Seeding a new edge server.

	nohup ${P4CBIN:-/p4/common/bin}/load_checkpoint.sh /p4/1/checkpoints/p4_1.ckp.4025.gz -i 1 -s p4d_edge_syd < /dev/null > /p4/1/logs/load.log 2>&1 &

	WARNING: While this script is useful for seeding a new edge server,
	this script is NOT to be used for recovering or reseeding an existing
	edge server, because all edge-local database tables (mostly workspace
	data) would be lost. To recover an existing edge server, see the
	recover_edge.sh script.

	EXAMPLE 6: Seed New Edge and Verify

	Seeding a new edge server and then do a verify with default options.

	nohup ${P4CBIN:-/p4/common/bin}/load_checkpoint.sh /p4/1/checkpoints/p4_1.ckp.4025.gz -i 1 -s p4d_edge_syd -verify default < /dev/null > /p4/1/logs/load.log 2>&1 &

	EXAMPLE 7: Load a Parallel Checkpoint on an Edge and Verify Recent

	This non-interactive example loads a parallel checkpoint directory.
	The usage difference is that the checkpoint path provided is a
	parallel checkpoint directory rather than a single checkpoint file.
	This example loads the checkpoint for a new edge server, and verifies
	only the most recent 3 changes in each depot. The delay before calling
	p4verify.sh, 10 minutes (600) by default, is shortened to 5 seconds in
	this example.

	nohup ${P4CBIN:-/p4/common/bin}/load_checkpoint.sh /p4/1/checkpoints/p4_1.ckp.4025 -i 1 -s p4d_edge_syd -verify \"-o MISSING -recent=3 -ns -L /p4/1/logs/p4verify.fast_and_recent.log\" -delay 5 -y < /dev/null > /p4/1/logs/load.log 2>&1 &

	EXAMPLE 8: Multi Pass Replay of Checkpoints and Journals

	In this example, we want to use a multi-pass procedure involving
	replay of a checkpoint at one point in time, and then replay
	subsequent numbered journals later. This method can be useful to
	reduce downtime required for migration procedures involving a
	checkpoint replay if the checkpoint replay takes a while, e.g. a few
	hours or more. The gist of the approach is to replay the checkpoint a
	day or so ahead of the scheduled maintenance. Then replay subsequent
	numbered journals each day after. Then in the maintenance window,
	replay just the last numbered journal from the old environment in the
	new environment.

	This approach involves a few options:
	* When the checkpoint is replayed, '-no_start' and '-no_xu'. Either
	specify the path to the checkpoint, or use '-latest'.
	* When the numbered journals are replayed in days leading up to the
	maintenance window, use the '-jo_latest' option to replay only a
	numbered journal.
	* During the maintenance window, load any final numbered journals,
	then start the service.

	Pass 1, 3 days before maintenance:

	nohup load_checkpoint.sh -latest -no_start -no_xu -r -y < /dev/null > /p4/1/logs/load.log 2>&1 &

	Pass 2, 2 days before maintenance:

	nohup load_checkpoint.sh -jo_latest -no_start -no_xu -y < /dev/null > /p4/1/logs/load.log 2>&1 &

	Pass 3, 1 day before maintenance:

	nohup load_checkpoint.sh -jo_latest -no_start -no_xu -y < /dev/null > /p4/1/logs/load.log 2>&1 &

	Pass 4, during the maintenance window:

	nohup load_checkpoint.sh -jo_latest -y < /dev/null > /p4/1/logs/load.log 2>&1 &
"
   fi

   # Clear any EXIT/signal trap before exiting. If the terminate() trap is
   # already installed, it would exit with ErrorCount (typically 0) and so
   # replace the usage exit status of 2 with a success status.
   trap - EXIT SIGINT SIGTERM
   exit 2
}

#==============================================================================
# Command Line Processing

declare -i LoadOfflineDB=1
declare -i StartP4BrokerWhenDone=0
declare -i StartP4DWhenDone=1
declare -i Interactive=1
declare -i KeepRootDBs=1
declare -i shiftArgs=0

# Unset variable checks are relaxed while parsing, because "$2" may not exist
# when an option that takes a value is the last argument.
set +u
while [[ $# -gt 0 ]]; do
   case $1 in
      (-i) SDPInstance="$2"; shiftArgs=1;;
      (-s) SetServerID="$2"; shiftArgs=1;;
      (-t) ReplicaTypeTag="$2"; shiftArgs=1;;
      (-verify)
         DoVerify=1;
         VerifyOptions="$2"
         shiftArgs=1
      ;;
      (-delay) VerifyDelay="$2"; shiftArgs=1;;
      (-c) GenSSLCertsIfNeeded=0;;
      (-k) dbg "Ignoring obsolete -k option.";; # '-k' was replaced by '-R'.
      (-R) KeepRootDBs=0;;
      (-F)
         SafetyFactor="$2"
         shiftArgs=1
      ;;
      (-l) StartWithoutLicenseFile=1;;
      (-r) LoadOfflineDB=0;;
      (-b) StartP4BrokerWhenDone=1;;
      (-no_start) StartP4DWhenDone=0;;
      (-jo)
         ReplayJournalsOnly=1
         LoadOfflineDB=0
      ;;
      (-jo_latest)
         GetLatestJournals=1
         ReplayJournalsOnly=1
         LoadOfflineDB=0
      ;;
      (-no_xu) DoSchemaUpgrade=0;;
      (-y) Interactive=0;;
      (-latest) GetLatestCheckpoint=1;;
      (-latest_jnls) GetLatestCheckpoint=1; GetLatestJournals=1;;
      (-h) usage -h;;
      (-man) usage -man;;
      (-V) usage -h;; # Deprecated option.
      (-L) Log="$2"; shiftArgs=1;;
      (-si) SilentMode=1;;
      (-d) Debug=1;;
      (-D) Debug=1; set -x;; # Debug; use bash 'set -x' extreme debugging mode.
      (-*) usage -h "Unknown option ($1).";;

      # For path arguments, we need to figure out what is provided.
      # If '-jo_latest' is specified, no path arguments are allowed.
      # If neither '-latest' nor '-latest_jnls' is provided:
      #    If there is one path argument: It should be a checkpoint file or
      #    directory.
      #    If there are multiple path args: First is a checkpoint, the rest are journals
      #    (which may or may not end with the P4JOURNAL).
      # If '-latest' is specified, any path args are journals, which may or may not end
      # with the P4JOURNAL file.
      # If '-latest_jnls' is specified, there should be no path args, or exactly one, and
      # if a path is provided, it must be the P4JOURNAL.
      (*)
         [[ "$ReplayJournalsOnly" -eq 1 && "$GetLatestJournals" -eq 1 ]] && \
            usage -h "A path argument was provided ($1), but no path args are expected when '-jo_latest' is used."

         # Unless '-latest' was specified, the first path argument is the user-specified checkpoint.
         if [[ "$GetLatestCheckpoint" -eq 0 && -z "$Checkpoint" && "$ReplayJournalsOnly" -eq 0 ]]; then
            Checkpoint="$1"
         # Unless '-latest_jnls' is specified, additional path arguments are user-specified journals.
         elif [[ "$GetLatestJournals" -eq 0 ]]; then
            Journals[JournalCount]="$1"
            JournalCount+=1
         fi
      ;;
   esac

   # Shift (modify $#) the appropriate number of times.
   shift; while [[ $shiftArgs -gt 0 ]]; do
      [[ $# -eq 0 ]] && usage -h "Incorrect number of arguments."
      # shiftArgs has the integer attribute, so this is evaluated arithmetically.
      shiftArgs=$shiftArgs-1
      shift
   done
done
set -u

#==============================================================================
# Command Line Verification

[[ -n "$Log" ]] || \
   Log="${LOGS:-/tmp}/${ThisScript%.sh}.$(date +'%Y%m%d-%H%M%S').log"

if [[ -z "$SDPInstance" ]]; then
   if [[ "${SDP_INSTANCE:-Unset}" != Unset ]]; then
      SDPInstance="$SDP_INSTANCE"
   else
      usage -h "SDP Instance parameter is missing and SDP_INSTANCE is undefined."
   fi
fi

if [[ -n "$ReplicaTypeTag" && -z "$SetServerID" ]]; then
   usage -h "The '-t' was used without '-s'. If '-t <Type>' is specified, the '-s <ServerID>' argument is required."
fi

if [[ -n "$VerifyDelay" && "$DoVerify" -eq 0 ]]; then
   usage -h "The '-delay <delay>' requires '-verify' to also be used. See info for the '-verify' option."
fi

if [[ "$StartP4DWhenDone" -eq 0 && "$DoVerify" -eq 1 ]]; then
   usage -h "The '-verify' and '-no_start' options are mutually exclusive."
fi

if [[ -n "$VerifyOptions" && "$VerifyOptions" == default ]]; then
   VerifyOptions="$DefaultVerifyOptions"
fi

[[ -n "$VerifyDelay" ]] || VerifyDelay="$DefaultVerifyDelay"

[[ "$ReplayJournalsOnly" -eq 1 && "$GetLatestJournals" -eq 0 && "$JournalCount" -eq 0 ]] && \
   usage -h "The '-jo' option was specified, but no journals were provided. Provide paths to journals, or consider using '-jo_latest'."

# Enable debugging if SDP_DEBUG is set to something other than 0. (A value of
# 0 means "off", consistent with how Debug is initialized from SDP_DEBUG.)
if [[ "${SDP_DEBUG:-0}" != 0 ]]; then
   Debug=1
fi

#==============================================================================
# Main Program

trap terminate EXIT SIGINT SIGTERM

# Initialize logging.
touch "$Log" || bail "Couldn't touch log file [$Log]."

# Redirect stdout and stderr to a log file. Unless in silent mode, output is
# also shown on the terminal by way of 'tee'.
if [[ "$SilentMode" -eq 0 ]]; then
   exec > >(tee "$Log")
   exec 2>&1
else
   exec >"$Log"
   exec 2>&1
fi

msg "${H1}\\nLog is: $Log"

ThisUser=$(id -n -u)
msg "Started $ThisScript v$Version as $ThisUser@$ThisHost on $(date) as:\\n$CmdLine\\n"

# shellcheck disable=SC2153
SDPInstanceVarsFile="/p4/common/config/p4_${SDPInstance}.vars"
[[ -r "$SDPInstanceVarsFile" ]] ||\
   bail "Missing SDP instance vars file. Typo in instance name? Expected file: $SDPInstanceVarsFile"

# shellcheck disable=SC1091
source /p4/common/bin/p4_vars "$SDPInstance"
# shellcheck disable=SC1091
source /p4/common/bin/ps_functions.sh ||\
   bail "Failed to load ps_functions.sh."
#shellcheck disable=SC1091
source /p4/common/bin/edge_vars ||\
   bail "Failed to load edge_vars."

[[ "$ReplayJournalsOnly" -eq 0 && "$GetLatestCheckpoint" -eq 0 && -z "$Checkpoint" ]] && \
   usage -h "No checkpoint specified. A checkpoint must be specified unless '-latest', '-jo', or '-jo_latest' is used."

#------------------------------------------------------------------------------
msg "${H2}\\nPhase 1 - Preflight Checks."
# Preflight: classify the checkpoint given on the command line (if any) as a
# serial checkpoint file or a parallel checkpoint directory.
if [[ "$ReplayJournalsOnly" -eq 0 ]]; then
   if [[ -r "$Checkpoint" ]]; then
      if [[ -f "$Checkpoint" ]]; then
         msg "Checkpoint is a file [$Checkpoint] - Checkpoint Replay will be serial."
      elif [[ -d "$Checkpoint" ]]; then
         msg "Checkpoint is a directory [$Checkpoint] - Checkpoint Replay will be parallel."
         ParallelCheckpointDir="$Checkpoint"
         DoParallelCheckpoints=1

         # For this load_checkpoint.sh script, we don't rely on configuration to
         # indicate whether to use a serial or parallel checkpoint -- we simply
         # use the latest checkpoint available or the one specified by the user.
         # However, if parallel checkpoints are configured, we use that setting
         # to determine the desired number of threads, defaulting to 4 if we
         # can't find a configuration to the contrary. A value of exactly 1
         # also results in the default of 4 threads.
         #
         # The regex only accepts values without leading zeros, so the value
         # can be used as-is. The ':-' default keeps 'set -u' from aborting
         # the script if DO_PARALLEL_CHECKPOINTS is not defined.
         Threads=4
         if [[ "${DO_PARALLEL_CHECKPOINTS:-}" =~ ^[1-9][0-9]*$ && "${DO_PARALLEL_CHECKPOINTS:-}" != 1 ]]; then
            Threads="$DO_PARALLEL_CHECKPOINTS"
         fi
      else
         bail "Unknown file type for checkpoint [$Checkpoint]. It is neither a file nor directory or symlink to a file or directory."
      fi
   # Bail unless the user specified '-latest', in which case no checkpoint
   # is provided.
   elif [[ "$GetLatestCheckpoint" -eq 0 ]]; then
      bail "Specified checkpoint does not exist: $Checkpoint"
   fi
fi

if [[ "$ReplayJournalsOnly" -eq 0 ]]; then
   [[ -z "$Checkpoint" && "$GetLatestCheckpoint" -eq 0 ]] &&\
      usage -h "Missing parameter. Specify a checkpoint file/directory, or use '-latest' or '-latest_jnls'."
fi

# Set or check ServerID value.
if [[ -n "$SetServerID" ]]; then
   if [[ -f "$P4ROOT/server.id" ]]; then
      # Require a literal, whole-line match. A plain 'grep -q' is a regex
      # substring match, which would accept e.g. '-s p4d_ha' against an
      # existing server.id containing 'p4d_ha2_nyc'.
      if grep -qxF -- "$SetServerID" "$P4ROOT/server.id"; then
         msg "Verified: $P4ROOT/server.id value matches set value: $SetServerID"
      else
         bail "Existing $P4ROOT/server.id file is not set to $SetServerID, but to $(cat "$P4ROOT"/server.id). If you are certain you want to use the ServerID of $SetServerID specified with '-s', remove the existing $P4ROOT/server.id file, e.g. with:\\n\\trm -f $P4ROOT/server.id\\n\\nAnd then try again. Aborting for now."
      fi
   else
      msg "Setting ServerID $SetServerID in $P4ROOT/server.id"
      echo "$SetServerID" > "$P4ROOT/server.id" ||\
         bail "Failed to write $P4ROOT/server.id. Aborting"
      export SERVERID="$SetServerID"
   fi

   # Cross-check any explicit '-t <Type>' against the type implied by the
   # ServerID name.
   if [[ "$SetServerID" == "p4d_"* ]]; then
      InferredReplicaTypeTag=$(infer_replica_type_tag "$SetServerID")
      msg "Inferred '-t $InferredReplicaTypeTag' from '-s $SetServerID'."
      if [[ -z "$ReplicaTypeTag" ]]; then
         ReplicaTypeTag="$InferredReplicaTypeTag"
      elif [[ "$ReplicaTypeTag" == "$InferredReplicaTypeTag" ]]; then
         msg "Verified: Value specified with '-t $InferredReplicaTypeTag' matches value inferred from '-s $SetServerID'."
      else
         usage -h "The replica type '-t $ReplicaTypeTag' was specified but does not match the value '$InferredReplicaTypeTag' inferred from '-s $SetServerID'."
      fi
   fi
else
   [[ -n "${SERVERID:-}" ]] || bail "SERVERID not detected. Ensure there is a valid $P4ROOT/server.id file."

   if [[ -z "$ReplicaTypeTag" ]]; then
      # shellcheck disable=SC2153
      ReplicaTypeTag=$(infer_replica_type_tag "$SERVERID")
      if [[ -n "$ReplicaTypeTag" ]]; then
         msg "Inferred '-t $ReplicaTypeTag' from ServerID [$SERVERID]."
      else
         warnmsg "Unable to infer a replica type from ServerID [$SERVERID]."
      fi
   fi
fi

# The replica type may have been set with '-t <Type>' or inferred from
# '-s <ServerID>'. In either case, disable the license check if it's
# not needed.
if [[ -n "$ReplicaTypeTag" ]]; then
   # If a replica, the license file is not required by default unless
   # the type tag indicates it is a type used for High Availability,
   # which should have a license file to support 'p4 failover' (unless
   # it is an HA for an edge server).
   case "$ReplicaTypeTag" in
      (ha|ham) true;;
      (fs|fsm) true;;
      (*) LicenseFileNeeded=0;;
   esac
fi

# Another method of disabling unnecessary license checks based on SERVERID.
case "${SERVERID:-}" in
   (p4d_fs_edge_*) LicenseFileNeeded=0;;
   (p4d_fsm_edge_*) LicenseFileNeeded=0;;
   (p4d_ha_edge_*) LicenseFileNeeded=0;;
   (p4d_ham_edge_*) LicenseFileNeeded=0;;
   (p4d_edge_*) LicenseFileNeeded=0;;
   (p4d_ffr_*) LicenseFileNeeded=0;;
   (p4d_fr_*) LicenseFileNeeded=0;;
esac

# Determine where checkpoints and numbered journals live for this server.
# shellcheck disable=SC2153
if [[ "$P4MASTER_ID" == "$SERVERID" ]]; then
   CheckpointsDir="$CHECKPOINTS"
   dbg "On commit server, CheckpointsDir=$CHECKPOINTS"
else
   # Figure out if we should be using the Second Form of the journalPrefix per the
   # standard, and set CheckpointsDir accordingly.
   if [[ "$ReplicaTypeTag" =~ ^(ha_edge|ham_edge|fs_edge|fsm_edge) ]]; then
      # A standby of an edge uses the checkpoints directory of the edge it
      # targets, so strip the standby type prefix (including its trailing
      # underscore) to get the target edge's short ServerID, e.g.
      # 'fsm_edge_syd' becomes 'edge_syd'.
      ShortServerID="${SERVERID#p4d_}"
      TargetServerID=$(echo "$ShortServerID" | sed -E 's@^(ha|ham|fs|fsm)_@@')
      CheckpointsDir="/p4/${SDP_INSTANCE}/checkpoints.$TargetServerID"
      JournalPrefix="${CheckpointsDir}/${P4SERVER}.$TargetServerID"
      dbg "CheckpointsDir=$CheckpointsDir (on $ReplicaTypeTag)."
      dbg "JournalPrefix=$JournalPrefix"
   elif [[ "$ReplicaTypeTag" =~ ^(edge|ffr) ]]; then
      ShortServerID="${SERVERID#p4d_}"
      CheckpointsDir="/p4/${SDP_INSTANCE}/checkpoints.$ShortServerID"
      JournalPrefix="${CheckpointsDir}/${P4SERVER}.$ShortServerID"
      dbg "CheckpointsDir=$CheckpointsDir (on $ReplicaTypeTag)."
      dbg "JournalPrefix=$JournalPrefix"
   else
      CheckpointsDir="$CHECKPOINTS"
      JournalPrefix="$CHECKPOINTS/$P4SERVER"
      dbg "CheckpointsDir=$CHECKPOINTS"
      dbg "JournalPrefix=$JournalPrefix"
   fi
fi

# If the user requested to find the latest checkpoint, see if we can find one. Find the latest *.md5
# file, and then determine if there is an associated checkpoint file or directory. Pick whatever is
# associated with the latest *.md5, which may be a checkpoint file or parallel checkpoint directory.
# With '-latest' or '-latest_jnls', select the checkpoint associated with the
# most recently modified *.md5 file in the checkpoints directory.
if [[ "$GetLatestCheckpoint" -eq 1 ]]; then
   # shellcheck disable=SC2012
   CheckpointMD5=$(ls -t "$CheckpointsDir"/*.md5 2>/dev/null | head -1)

   if [[ -n "$CheckpointMD5" ]]; then
      # Use suffix removal ('%.md5') rather than '/.md5/.gz' substitution so
      # that only the trailing '.md5' is affected, even if '.md5' appears
      # elsewhere in the path.
      if [[ -d "${CheckpointMD5%.md5}" ]]; then
         ParallelCheckpointDir="${CheckpointMD5%.md5}"
         DoParallelCheckpoints=1
         Checkpoint="$ParallelCheckpointDir"
         msg "With '-latest', selected parallel checkpoint dir: $ParallelCheckpointDir"
      elif [[ -r "${CheckpointMD5%.md5}" ]]; then
         Checkpoint="${CheckpointMD5%.md5}"
         msg "With '-latest', selected serial checkpoint file: $Checkpoint"
      elif [[ -r "${CheckpointMD5%.md5}.gz" ]]; then
         Checkpoint="${CheckpointMD5%.md5}.gz"
         msg "With '-latest', selected serial checkpoint file: $Checkpoint"
      else
         bail "The '-latest' option was used, but only an MD5 file was found ($CheckpointMD5) -- no corresponding checkpoint file or directory was found: ${CheckpointMD5%.md5} (file or directory) or ${CheckpointMD5%.md5}.gz (file) expected."
      fi
   else
      bail "The '-latest' or '-latest_jnls' option was used, but no *.md5 file is available in $CheckpointsDir."
   fi
fi

# If the user requested to find the latest journals, see if we can find any.
if [[ "$GetLatestJournals" -eq 1 ]]; then
   # We need to determine the journal counter. If we are about to load a checkpoint,
   # get the journal counter from the data set we're about to load. If the checkpoint
   # was loaded in an earlier pass, extract the journal counter from the data set.
   if [[ "$ReplayJournalsOnly" -eq 1 ]]; then
      # Extract JournalCounter from existing data set in P4ROOT.
      if [[ -r "$P4ROOT/db.counters" ]]; then
         JournalCounter=$("$P4DBIN" -r "$P4ROOT" -k db.counters -jd - 2>/dev/null | grep '@journal@' | cut -d@ -f8)
         if [[ ! "$JournalCounter" =~ ^[0-9]+$ ]]; then
            bail "Journal Counter extracted from $P4ROOT/db.counters is invalid: $JournalCounter"
         fi
      else
         bail "Could not determine journal counter; missing file $P4ROOT/db.counters."
      fi
   else
      if [[ "$DoParallelCheckpoints" -eq 1 ]]; then
         # Extract JournalCounter from the parallel checkpoint directory we are about to load.
         if [[ -r "$ParallelCheckpointDir/db.counters.ckp.gz" ]]; then
            JournalCounter=$(zgrep @journal@ "$ParallelCheckpointDir/db.counters.ckp.gz" | cut -d@ -f8)
         elif [[ -r "$ParallelCheckpointDir/db.counters.ckp" ]]; then
            JournalCounter=$(grep @journal@ "$ParallelCheckpointDir/db.counters.ckp" | cut -d@ -f8)
         else
            bail "Could not determine journal counter from db files in parallel checkpoint dir: [$ParallelCheckpointDir]."
         fi
         # Validate, as is done for the other sources of the journal counter;
         # an empty or garbled value would otherwise silently find no journals.
         if [[ ! "$JournalCounter" =~ ^[0-9]+$ ]]; then
            bail "Journal Counter extracted from parallel checkpoint dir [$ParallelCheckpointDir] is invalid: $JournalCounter"
         fi
      else
         # Extract JournalCounter from the checkpoint file we are about to load.
         JournalCounter=${Checkpoint##*.ckp.}
         JournalCounter=${JournalCounter%.gz}
         if [[ ! "$JournalCounter" =~ ^[0-9]+$ ]]; then
            bail "Journal Counter extracted from checkpoint file name [$Checkpoint] is invalid: $JournalCounter"
         fi
      fi
   fi

   msg "Searching for journals $CheckpointsDir/$P4SERVER.jnl.* with JournalCounter >= $JournalCounter."
   JournalCount=0

   # Find numbered journals, which may or may not be compressed. Collect a
   # contiguous run starting at JournalCounter and stop at the first gap.
   while true; do
      jnl="${CheckpointsDir}/${P4SERVER}.jnl.${JournalCounter}"
      if [[ -r "$jnl" ]]; then
         Journals[JournalCount]="$jnl"
      elif [[ -r "${jnl}.gz" ]]; then
         # Record the name of the file that actually exists (with '.gz');
         # the journal preflight check below requires each listed path to
         # be readable.
         Journals[JournalCount]="${jnl}.gz"
      else
         break
      fi
      JournalCount=$((JournalCount+1))
      JournalCounter=$((JournalCounter+1))
   done

   if [[ "$JournalCount" -ne 0 ]]; then
      msg "Found $JournalCount numbered journals to replay."
   else
      # If '-latest_jnls' was specified, and we found no journals, that's OK.
      # If '-jo_latest' was specified and we found no journals, that's an error.
      if [[ "$ReplayJournalsOnly" -eq 1 ]]; then
         bail "The '-jo_latest' option was specified, but no numbered journals were found to replay. Aborting."
      else
         msg "Found no numbered journals to replay."
      fi
   fi
fi

# Preflight - check that specified journals exist. Also, determine if the live
# P4JOURNAL file is on the list of journals to be replayed. Files are compared
# by inode so that different paths to the same file are detected.
if [[ "$JournalCount" -gt 0 ]]; then
   if [[ -r "$P4JOURNAL" ]]; then
      # shellcheck disable=SC2012
      INode1="$(ls -i "$P4JOURNAL" 2>/dev/null|awk '{print $1}')"
   fi

   for jnl in "${Journals[@]}"; do
      if [[ -r "$jnl" ]]; then
         if [[ -n "$INode1" ]]; then
            # shellcheck disable=SC2012
            INode2="$(ls -i "$jnl" 2>/dev/null|awk '{print $1}')"
            [[ "$INode1" == "$INode2" ]] && LoadLiveJournal=1
         fi
      else
         bail "Specified journal file does not exist: $jnl"
      fi
   done

   # Extra sanity check: If the live $P4JOURNAL is specified, it
   # must be the last one in the list. Give an error if it's
   # not the last path argument in the list.
   if [[ "$LoadLiveJournal" -eq 1 ]]; then
      j=${#Journals[@]}
      if [[ "$j" -ne 1 ]]; then
         # Check every entry except the last one.
         j=$((j-1))
         for ((i=0; i<j; i++)); do
            jnl=${Journals[$i]}
            # shellcheck disable=SC2012
            INode2="$(ls -i "$jnl" 2>/dev/null|awk '{print $1}')"
            if [[ "$INode1" == "$INode2" ]]; then
               bail "The live P4JOURNAL appears in a list of path arguments. If a list of journals are provided including the live P4JOURNAL, that live journal must appear as the last file in the list. The order of journals does matter, and the live journal must always be the last replayed."
            fi
         done
      fi
   fi
fi

# shellcheck disable=SC2153
P4DInitScript="$P4HOME/bin/p4d_${SDP_INSTANCE}_init"
P4BrokerInitScript="$P4HOME/bin/p4broker_${SDP_INSTANCE}_init"

# Work from P4ROOT, and confirm it is the expected SDP path before any
# cleanup commands with relative globs are run.
cd "$P4ROOT" || bail "Could not cd to P4ROOT [$P4ROOT]."

[[ "$PWD" == "/p4/${SDPInstance}/root" ]] || bail "Unexpected P4ROOT value of: $P4ROOT"

if [[ "$ReplayJournalsOnly" -eq 0 ]]; then
   # The MD5 file is named either <checkpoint>.md5, or for a compressed
   # serial checkpoint, <checkpoint minus .gz>.md5. Suffix removal is used
   # so that a checkpoint path without '.gz' can never resolve to the
   # checkpoint itself and be mistaken for its MD5 file.
   CheckpointMD5="${Checkpoint}.md5"
   if [[ ! -r "$CheckpointMD5" ]]; then
      CheckpointMD5="${Checkpoint%.gz}.md5"
      [[ -r "$CheckpointMD5" ]] ||\
         bail "Could not find MD5 file for checkpoint. Tried ${Checkpoint}.md5 and ${Checkpoint%.gz}.md5"
   fi
   msg "Verified: MD5 file for checkpoint exists: $CheckpointMD5"

   if [[ "$LicenseFileNeeded" -eq 1 ]]; then
      msg "Checking for license file."
      if [[ ! -r "$P4ROOT/license" ]]; then
         if [[ "$StartWithoutLicenseFile" -eq 1 ]]; then
            warnmsg "No $P4ROOT/license file found. Continuing due to '-l'."
         else
            bail "No $P4ROOT/license file found. If none needed, specify '-l' and try again."
         fi
      fi
   fi
fi

msg "\\nPreflight Summary:"
msg "  ServerID:                 $SERVERID"
msg "  Type:                     $ReplicaTypeTag\\n"

if [[ "$ReplayJournalsOnly" -eq 0 ]]; then
   echo -n "  Checkpoint to replay is:  $Checkpoint "
   if [[ "$DoParallelCheckpoints" -eq 1 ]]; then
      echo "(Parallel Checkpoint Replay, $Threads threads)"
   else
      echo "(Serial Checkpoint Replay)"
   fi
fi

if [[ "$JournalCount" -gt 0 ]]; then
   for jnl in "${Journals[@]}"; do
      msg "  Journal to replay:        $jnl"
   done
fi

echo -n "  Upgrade Schema:           "
if [[ "$DoSchemaUpgrade" -eq 1 ]]; then echo YES; else echo NO; fi
echo -n "  Start P4D When Done:      "
if [[ "$StartP4DWhenDone" -eq 1 ]]; then echo YES; else echo NO; fi
echo -n "  Start Broker When Done:   "
if [[ "$StartP4BrokerWhenDone" -eq 1 ]]; then echo YES; else echo NO; fi
echo -n "  License Needed:           "
if [[ "$LicenseFileNeeded" -eq 1 ]]; then echo YES; else echo NO; fi

msg "\\n  P4D Version:              $P4D_VERSION"

user_confirmation_and_warning "$Interactive" ||\
   bail "User confirmation not verified. Aborting."

#------------------------------------------------------------------------------
msg "${H2}\\nPhase 2 - Prepare to Load Checkpoint."

# For an SSL-enabled P4PORT, make sure certificates exist, generating them
# unless '-c' was used to disable that.
if [[ "$ReplayJournalsOnly" -eq 0 ]]; then
   if [[ "$P4PORT" =~ ^ssl[46]*: && ! -r "$P4SSLDIR/certificate.txt" ]]; then
      if [[ "$GenSSLCertsIfNeeded" -eq 1 ]]; then
         if [[ ! -d "$P4SSLDIR" ]]; then
            mkdir -p "$P4SSLDIR" ||\
               bail "Failed to create P4SSLDIR [$P4SSLDIR]."
         fi
         chmod 700 "$P4SSLDIR" ||\
            bail "Failed to do 'chmod 700 $P4SSLDIR'."
         "$P4DBIN" -Gc ||\
            bail "There was a problem generating SSL certificates with '$P4DBIN -Gc'."
      else
         bail "The P4PORT has SSL enabled, but there is no certificate file\\n[$P4SSLDIR/certificate.txt], and the '-c' flag was specified, requiring SSL certificates to be in place.\\n"
      fi
   fi
fi

[[ -x "$P4DInitScript" ]] ||\
   bail "Aborting: P4D Init script is missing or not executable: $P4DInitScript"

msg "Checking to see if broker is configured."
# A host-specific broker config takes precedence over the generic one.
if [[ -r "/p4/common/config/p4_${SDPInstance}.broker.${ThisHost}.cfg" ]]; then
   P4BrokerCfg="/p4/common/config/p4_${SDPInstance}.broker.${ThisHost}.cfg"
   msg "Broker configured with: $P4BrokerCfg"
elif [[ -r "/p4/common/config/p4_${SDPInstance}.broker.cfg" ]]; then
   P4BrokerCfg="/p4/common/config/p4_${SDPInstance}.broker.cfg"
   msg "Broker configured with: $P4BrokerCfg"
else
   msg "No broker config detected."
fi

P4BrokerServiceName="${P4BROKERBIN##*/}"
if [[ -x "$P4BrokerInitScript" && -r "$P4BrokerCfg" ]]; then
   # Use systemd to manage the broker only if systemctl exists and reports
   # something for this service name.
   if [[ -n "$(command -v systemctl)" ]]; then
      if [[ -n "$(systemctl is-enabled "$P4BrokerServiceName" 2>/dev/null)" ]]; then
         UseSystemdForP4Broker=1
      fi
   fi

   msg "Checking p4broker status."
   if "$P4BrokerInitScript" status > /dev/null 2>&1; then
      msg "Shutting down p4broker."
      if [[ "$UseSystemdForP4Broker" -eq 1 ]]; then
         sudo systemctl stop "$P4BrokerServiceName" ||\
            bail "Failed to execute: sudo systemctl stop $P4BrokerServiceName"
      else
         "$P4BrokerInitScript" stop
      fi
   else
      msg "Verified: $P4BrokerServiceName is down."
   fi
fi

P4DServiceName="${P4DBIN##*/}"
if [[ -n "$(command -v systemctl)" ]]; then
   if [[ -n "$(systemctl is-enabled "$P4DServiceName" 2>/dev/null)" ]]; then
      UseSystemdForP4D=1
   fi
fi

msg "Checking p4d status."
if "$P4DInitScript" status > /dev/null 2>&1; then
   MaxStopDelay=${SDP_MAX_STOP_DELAY_P4D:-43200}
   msg "\\nShutting down p4d."
   if [[ "$UseSystemdForP4D" -eq 1 ]]; then
      sudo systemctl stop "$P4DServiceName" ||\
         bail "Failed to execute: sudo systemctl stop $P4DServiceName"

      # With systemd, we must independently confirm service stop,
      # waiting if needed. Poll once per second, up to MaxStopDelay seconds.
      StopVerified=0
      i=0; while [[ "$i" -lt "$MaxStopDelay" ]]; do
         if "$P4DInitScript" status > /dev/null 2>&1; then
            sleep 1
         else
            StopVerified=1
            break
         fi
         i=$((i+1))
      done

      if [[ "$StopVerified" -eq 1 ]]; then
         msg "Verified: p4d has shutdown."
      else
         errmsg "Server $P4DServiceName did not stop after $MaxStopDelay seconds. Tailing $P4LOG:"
         tail "$P4LOG"
         bail "Aborting due to failed p4d stop."
      fi
   else
      "$P4DInitScript" stop
   fi
else
   msg "Verified: $P4DServiceName is down."
fi

#------------------------------------------------------------------------------
if [[ "$ReplayJournalsOnly" -eq 0 ]]; then
   msg "${H2}\\nPhase 3 - Cleanup dbs, logs, journals, state* files, etc."

   # Marker file indicating P4ROOT is mid-load; removed after replay completes.
   echo "P4ROOT is not available during load_checkpoint.sh processing." > "$P4ROOT/P4ROOT_not_usable.txt"

   # Move Aside or Remove db.* files, but not server.id or license files.
   Datestamp=$(date +'%Y-%m-%d-%H%M%S')
   if [[ "$KeepRootDBs" -eq 1 ]]; then
      # Try to avoid an out-of-diskspace issue: since the old databases are
      # kept, estimate the space needed as the current db size times the
      # safety factor, and compare with the space available.
      if [[ "$SafetyFactor" != "0" && -n "$(command -v bc)" ]]; then
         if [[ -n "$(ls -t db.* rdb.* 2>/dev/null)" ]]; then
            DBSizes=$(du -bc db.* rdb.lbr* 2>/dev/null | grep total | cut -f1)
            dbg "DBSizes1=[$DBSizes] (in bytes)."
            DBSizes=$(echo "$DBSizes*$SafetyFactor/1024"|bc 2>/dev/null)
            dbg "DBSizes2=[$DBSizes] (in K, with Safety factor of $SafetyFactor)."
            DiskSpaceAvail=$(df -k . 2>/dev/null | grep / | awk '{print $4}')
            dbg "DiskSpaceAvail=[$DiskSpaceAvail]."

            if [[ "$DBSizes" =~ ^[0-9.]+$ && "$DiskSpaceAvail" =~ ^[0-9]+$ ]]; then
               if [[ "$(echo "$DBSizes >= $DiskSpaceAvail" | bc -l)" == "1" ]]; then
                  if [[ -n "$(command -v numfmt)" ]]; then
                     bail "There may be insufficient disk space to replay a checkpoint; $(echo "$DBSizes" | numfmt --to=iec --from-unit=1024) is the estimated need, and $(echo "$DiskSpaceAvail" | numfmt --to=iec --from-unit=1024) is available. Consider using the '-R' or '-F' options, or find a way to increase available disk space."
                  else
                     bail "There may be insufficient disk space to replay a checkpoint; ${DBSizes}K is the estimated need, and ${DiskSpaceAvail}K is available. Consider using the '-R' or '-F' options, or find a way to increase available disk space."
                  fi
               else
                  dbg "Verified: Sufficient disk space is estimated to be available."
               fi
            else
               warnmsg "Skipping disk space check; could not get sizes using: du -bc db.* rdb.*"
            fi
         else
            msg "Skipping disk space check; no DBs found."
         fi
      else
         dbg "Skipping disk space check; Safety Factor set to 0 or bc not available."
      fi

      MovedDBDir="MovedDBs.$Datestamp"
      mkdir "$MovedDBDir" || bail "In $PWD could not do: mkdir \"$MovedDBDir\""
      msg "Moving aside various db and other files in $PWD:"
      find "$PWD"/ -maxdepth 1 -type f -name "db.*" -print -exec mv -f {} "$MovedDBDir/" \;
      find "$PWD"/ -maxdepth 1 -type f -name "rdb.*" -print -exec mv -f {} "$MovedDBDir/" \;
      find "$PWD"/ -maxdepth 1 -type f -name "state*" -print -exec mv -f {} "$MovedDBDir/" \;
      find "$PWD"/ -maxdepth 1 -type f -name "server.pid" -print -exec mv -f {} "$MovedDBDir/" \;
   else
      if [[ -n "$(ls -t db.* rdb.* 2>/dev/null)" ]]; then
         Cmd='/bin/rm -f db.* rdb.*'
         msg "Cleaning up databases with this command in $PWD:\\n$Cmd"
         eval "$Cmd" || bail "Failed to clean up old databases in $P4ROOT"
      else
         msg "Skipping cleanup of DBs; none found."
      fi

      # Remove state* files.
      if [[ -n "$(ls -t state* server.pid 2>/dev/null)" ]]; then
         Cmd='/bin/rm -f state* server.pid'
         msg "Cleaning up state* and server.pid files with this command in $PWD:\\n$Cmd"
         eval "$Cmd" || bail "Failed to clean up old state files in $P4ROOT"
      else
         msg "Skipping cleanup of state* and server.pid files; none found."
      fi
   fi

   if [[ -d server.locks ]]; then
      Cmd="/bin/rm -rf server.locks"
      msg "Cleaning up server.locks directory with this command in $PWD:\\n$Cmd"
      eval "$Cmd" || bail "Failed to clean up old server.locks directory in $P4ROOT"
   else
      dbg "Skipping cleanup of server.locks directory (not found)."
   fi

   msg "Directory listing in $PWD after cleanup:\\n$(ls -lrt)\\n"

   # Move P4LOG aside if it exists.
   if [[ -r "$P4LOG" ]]; then
      Cmd="mv $P4LOG $P4LOG.moved.$(date +'%Y%m%d-%H%M')"
      msg "Moving P4LOG aside with this command:\\n$Cmd"
      eval "$Cmd" || bail "Failed to move P4LOG aside."
   fi

   # Move P4JOURNAL aside if it exists, unless the live P4JOURNAL was on
   # the list of journals to replay.
   if [[ "$LoadLiveJournal" -eq 0 && -r "$P4JOURNAL" ]]; then
      Cmd="mv $P4JOURNAL $P4JOURNAL.moved.$(date +'%Y%m%d-%H%M')"
      msg "Moving P4JOURNAL aside with this command:\\n$Cmd"
      eval "$Cmd" || bail "Failed to move P4JOURNAL aside."
   fi

   # Move journal.NNN files (for standby replicas) aside if they exist.
   cd "$LOGS" || bail "Could not cd to: $LOGS"
   # Glob rather than parse 'ls' output. If nothing matches, the literal
   # pattern 'journal.*' fails the regex below and is skipped.
   for f in journal.*; do
      [[ "$f" =~ ^journal\.[0-9]+$ ]] || continue
      Cmd="mv $f $f.moved.$Datestamp"
      msg "Moving file $f aside with this command:\\n$Cmd"
      eval "$Cmd" || bail "Failed to move file $f aside."
   done
   cd - > /dev/null || bail "Could not cd to: $OLDPWD"

   # For checkpoint replay operations, determine whether case sensitivity
   # flag is needed, and whether '-z' is needed. The case mode is read from
   # the 5th space-delimited field of the '@nx@' header on the first line.
   if [[ "$Checkpoint" == *".gz" ]]; then
      # On macOS (Darwin), zcat is given the file on stdin.
      if [[ "$(uname -s)" == "Darwin" ]]; then
         CaseMode=$(zcat < "$Checkpoint" | head -1 | grep -E '^@nx@ (0|2) ' | cut -d ' ' -f5)
      else
         CaseMode=$(zcat "$Checkpoint" | head -1 | grep -E '^@nx@ (0|2) ' | cut -d ' ' -f5)
      fi

      # In P4D 2018.1, the '-z' flag should be dropped, as the need for
      # compression/decompression is determined by P4D when replaying,
      # and thus '-z' should not be used (though it is supported by P4D
      # for backward compatibility, and we use it here if using an older
      # server).
      # shellcheck disable=SC2072
      if [[ "$P4D_VERSION" < "2018.1" ]]; then
         CompressFlag='-z'
      fi
   elif [[ -d "$Checkpoint" ]]; then
      if [[ -r "$Checkpoint/db.config.ckp.gz" ]]; then
         if [[ "$(uname -s)" == "Darwin" ]]; then
            CaseMode=$(zcat < "$Checkpoint/db.config.ckp.gz" | head -1 | grep -E '^@nx@ (0|2) ' | cut -d ' ' -f5)
         else
            CaseMode=$(zcat "$Checkpoint/db.config.ckp.gz" | head -1 | grep -E '^@nx@ (0|2) ' | cut -d ' ' -f5)
         fi
         dbg "CaseMode from compressed parallel checkpoint dir detected as $CaseMode."
      elif [[ -r "$Checkpoint/db.config.ckp" ]]; then
         CaseMode=$(head -1 "$Checkpoint/db.config.ckp"| grep -E '^@nx@ (0|2) ' | cut -d ' ' -f5)
         dbg "CaseMode from uncompressed parallel checkpoint dir detected as $CaseMode."
      else
         warnmsg "Could not determine case for checkpoint dir [$Checkpoint]. Assuming CaseMode=1."
         CaseMode=1
      fi
   else
      # Determine case sensitivity from uncompressed checkpoint file.
      CaseMode=$(head -1 "$Checkpoint"| grep -E '^@nx@ (0|2) ' | cut -d ' ' -f5)
   fi

   # Apply the detected case mode uniformly to serial files and parallel
   # checkpoint directories, so a case-insensitive parallel checkpoint is
   # also replayed with '-C1'.
   if [[ "$CaseMode" == "2" ]]; then
      CaseFlag='-C1'
      msg "Case-insensitive checkpoint detected."
   else
      msg "Case-sensitive checkpoint detected."
   fi
else
   msg "Skipping Phase 3; replaying journals only."
fi

#------------------------------------------------------------------------------
msg "${H2}\\nPhase 4 - Load Checkpoint in P4ROOT."

if [[ "$ReplayJournalsOnly" -eq 0 ]]; then
   Cmd="$P4DBIN -r $P4ROOT $CompressFlag $CaseFlag"

   # For edge servers, ignore tables specified by $ExcludedTables, defined in
   # /p4/common/bin/edge_vars.
   if [[ "$ReplicaTypeTag" == "edge" ]]; then
      # shellcheck disable=SC2154
      Cmd+=" -K $ExcludedTables"
   fi

   if [[ "$DoParallelCheckpoints" -eq 1 ]]; then
      Cmd+=" -N $Threads -jrp $ParallelCheckpointDir"
      msg "Replaying parallel checkpoint directory to P4ROOT with this command:\\n$Cmd"
   else
      Cmd+=" -jr $Checkpoint"
      msg "Replaying checkpoint file to P4ROOT with this command:\\n$Cmd"
   fi

   eval "$Cmd" || bail "Checkpoint replay to P4ROOT failed."
fi

# Replay journals in the order listed.
if [[ "$JournalCount" -gt 0 ]]; then
   for jnl in "${Journals[@]}"; do
      Cmd="$P4DBIN -r $P4ROOT -f"
      # As with compressed checkpoints, servers older than 2018.1 need '-z'
      # to replay a compressed journal.
      # shellcheck disable=SC2072
      if [[ "$jnl" == *".gz" && "$P4D_VERSION" < "2018.1" ]]; then
         Cmd+=" -z"
      fi
      if [[ "$ReplicaTypeTag" == "edge" ]]; then
         # shellcheck disable=SC2154
         Cmd+=" -K $ExcludedTables"
      fi
      Cmd+=" -jr $jnl"
      msg "Replaying journal to P4ROOT with this command:\\n$Cmd"
      eval "$Cmd" || bail "Replay of journal to P4ROOT failed for journal: $jnl"
   done
fi

rm -f "$P4ROOT/P4ROOT_not_usable.txt"

#------------------------------------------------------------------------------
msg "${H2}\\nPhase 5 - Upgrade Schema."

if [[ "$DoSchemaUpgrade" -eq 1 ]]; then
   Cmd="$P4DBIN -r $P4ROOT -t localhost:0.0.0.0 -xu"
   msg "Ensuring databases are upgraded with this command:\\n$Cmd"
   eval "$Cmd" || bail "Database upgrade in P4ROOT failed."
else
   msg "Skipping P4ROOT database schema upgrade ('p4d -xu') due to '-no_xu'."
fi

#------------------------------------------------------------------------------
msg "${H2}\\nPhase 6 - Start Services."

if [[ "$StartP4DWhenDone" -eq 1 ]]; then
   msg "Starting p4d."
   if [[ "$UseSystemdForP4D" -eq 1 ]]; then
      sudo systemctl start "$P4DServiceName" ||\
         bail "Failed to execute: sudo systemctl start $P4DServiceName."
   else
      "$P4DInitScript" start
   fi

   # Delay a bit to give p4d time to start. As it comes up,
   # ensure the local SSL connection is trusted.
   i=0; while [[ "$i" -lt 10 ]]; do
      sleep 2
      if p4 info -s > /dev/null 2>&1; then
         break
      elif [[ "$P4PORT" =~ ^ssl[46]*: ]]; then
         p4 trust -f -y > /dev/null 2>&1
      fi
      i=$((i+1))
   done

   if p4 info -s > /dev/null 2>&1; then
      msg "The P4D Server started OK."
   else
      errmsg "The P4D Server did not start. Consider reviewing the P4LOG [$P4LOG] and/or $LOGS/p4d_init.log."
   fi

   msg "Logging in."
   msg "Running: $P4CBIN/p4login -v"
   "$P4CBIN/p4login" -v
   msg "Running: $P4CBIN/p4login -p $P4MASTERPORT -v"
   "$P4CBIN/p4login" -p "$P4MASTERPORT" -v

   # Now that we have loaded a checkpoint, re-load the shell environment to
   # ensure that P4REPLICA is set correctly, as some db.config settings are
   # determined dynamically.
   # shellcheck disable=SC1091
   source /p4/common/bin/p4_vars "$SDPInstance"

   if [[ "$P4REPLICA" == TRUE ]]; then
      msg "Logging in service user."
      msg "Running: $P4CBIN/p4login -v -service"
      "$P4CBIN/p4login" -v -service
      sleep 3
      Cmd="$P4BIN -s pull -lj"
      msg "Checking replication status with this command:\\n$Cmd"
      eval "$Cmd" || bail "Replica is not replicating properly."
   fi

   #---------------------------------------------------------------------------
   if [[ "$DoVerify" -eq 1 ]]; then
      msg "${H2}\\nPhase 7A - Optional Post Op - Verify."
      Cmd="p4verify.sh $SDPInstance $VerifyOptions"

      if [[ "$VerifyDelay" -ne 0 ]]; then
         msg "Sleeping $VerifyDelay seconds before starting p4verify.sh ..."
         sleep "$VerifyDelay"
      fi

      msg "\\nPerforming archive verification with this command, launched as a\\nbackground process (fire and forget):\\nnohup $Cmd < /dev/null > /dev/null 2>&1 &"
      # Word splitting of $Cmd is intended here.
      # shellcheck disable=SC2086
      nohup $Cmd < /dev/null > /dev/null 2>&1 &
      sleep 1
      msg "\\nThe p4verify.sh log is: $LOGS/p4verify.log"
   fi
else
   msg "Skipping start of p4d due to '-no_start'."
fi

if [[ -x "$P4BrokerInitScript" && -r "$P4BrokerCfg" ]]; then
   msg "${H2}\\nPhase 7B - Optional Post Op / Start Broker."
   if [[ "$StartP4BrokerWhenDone" -eq 1 ]]; then
      if [[ "$UseSystemdForP4Broker" -eq 1 ]]; then
         sudo systemctl start "$P4BrokerServiceName" ||\
            bail "Failed to execute: sudo systemctl start ${P4BrokerServiceName}."
      else
         "$P4BrokerInitScript" start
      fi

      # Retry the login through the broker up to 10 times, 3 seconds apart.
      i=0; while [[ "$i" -lt 10 ]]; do
         sleep 3
         msg "Running (try $((i+1))): $P4CBIN/p4login -p $P4BROKERPORT -v"
         "$P4CBIN/p4login" -p "$P4BROKERPORT" -v && break
         i=$((i+1))
      done
   else
      if [[ "$UseSystemdForP4Broker" -eq 1 ]]; then
         msg "NOT starting broker. Start it manually when ready with:\\n\\tsudo systemctl start $P4BrokerServiceName\\n"
      else
         msg "NOT starting broker. Start it manually when ready with:\\n\\t$P4BrokerInitScript start\\n"
      fi
   fi
fi

msg "Checkpoint load processing took $((SECONDS/3600)) hours $((SECONDS%3600/60)) minutes $((SECONDS%60)) seconds.\\n"

#------------------------------------------------------------------------------
# Load the offline_db.
if [[ "$LoadOfflineDB" -eq 1 ]]; then
   msg "${H2}\\nPhase 7C - Optional Post Op / Load offline_db."
   OfflineDB=${P4ROOT/root/offline_db}
   OfflineDBUsableFile="$OfflineDB/offline_db_usable.txt"
   cd "$OfflineDB" || bail "Could not cd to offline_db dir [$OfflineDB]."

   # Confirm the expected SDP path before removing files with relative globs.
   [[ "$PWD" == "/p4/${SDPInstance}/offline_db" ]] || bail "Unexpected OfflineDB value of: $OfflineDB"

   # Remove db.* and state* files, but not server.id or license files.
   Cmd="/bin/rm -f db.* state* $OfflineDBUsableFile"
   msg "Cleaning up databases and offline_db_usable.txt file with this command in $PWD:\\n$Cmd"
   eval "$Cmd" || bail "Failed to clean up old databases and state files in $OfflineDB"

   Cmd="$P4DBIN -r $OfflineDB $CompressFlag $CaseFlag"

   # For edge servers, ignore tables specified by $ExcludedTables, defined in
   # /p4/common/bin/edge_vars.
   if [[ "$ReplicaTypeTag" == "edge" ]]; then
      # shellcheck disable=SC2154
      Cmd+=" -K $ExcludedTables"
   fi

   if [[ "$DoParallelCheckpoints" -eq 1 ]]; then
      Cmd+=" -N $Threads -jrp $ParallelCheckpointDir"
   else
      Cmd+=" -jr $Checkpoint"
   fi

   msg "Replaying checkpoint to offline_db with this command:\\n$Cmd"
   eval "$Cmd" || bail "Checkpoint replay to offline_db failed."

   if [[ "$DoSchemaUpgrade" -eq 1 ]]; then
      Cmd="$P4DBIN -r $OfflineDB -t localhost:0.0.0.0 -xu"
      msg "Ensuring databases in offline_db are upgraded with this command:\\n$Cmd"
      eval "$Cmd" || bail "Database upgrade in offline_db failed."
   else
      msg "Skipping offline database schema upgrade ('p4d -xu') due to '-no_xu'."
   fi

   echo "Database restored successfully." > "$OfflineDBUsableFile" ||\
      bail "Failed to write this file: $OfflineDBUsableFile"
fi

# Final summary, based on the error and warning counts accumulated by
# errmsg() and warnmsg().
if [[ "$ErrorCount" -eq 0 && "$WarningCount" -eq 0 ]]; then
   msg "\\nAll processing completed successfully.\\n"
elif [[ "$ErrorCount" -eq 0 ]]; then
   warnmsg "\\nProcessing completed successfully with no errors but $WarningCount warnings:\\n$(grep ^Warning: "$Log")\\n"
else
   errmsg "\\nProcessing completed, but $ErrorCount errors and $WarningCount warnings were reported. Review this log for more context. Summary of errors and warnings:\\n$(grep -E '^(Error|Warning):' "$Log")\\n"
fi

msg "\\nTime: Checkpoint load processing on $ThisHost took $((SECONDS/3600)) hours $((SECONDS%3600/60)) minutes $((SECONDS%60)) seconds.\\n"

# See the terminate() function where this script actually exits.
exit "$ErrorCount"
# | Change | User | Description | Committed | |
---|---|---|---|---|---|
#48 | 30848 | C. Thomas Tyler |
Semantically separated DoParallelCheckpoints into two variables, CreateParallelCheckpoint and LoadParallelCheckpoint, which are logically independent. |
||
#47 | 30679 | C. Thomas Tyler |
Completed logic to detect case sensitivity of compressed and uncompressed parallel checkpoints. Also corrected text in unrelated debug messages. |
||
#46 | 30678 | C. Thomas Tyler |
Fixed bug related to detection of case sensitivity for parallel checkpoints dir, resulting in error like this sample: head: error reading '/p4/1/checkpoints.edge_syd/p4_1.edge_syd.ckp.689': Is a directory |
||
#45 | 30668 | C. Thomas Tyler | Fixed bug detecting journal counter from serial checkpoint file. | ||
#44 | 30654 | C. Thomas Tyler |
Revised logic replaying checkpoints to select the most recent checkpoint regardless of whether it is parallel or not. This logic now applies consistently across scripts (even some not in this changelist that call functions in backup_functions.sh), such as sync_replica.sh, load_checkpoint.sh, recover_edge.sh, etc. The edge_dump.sh script now creates parallel checkpoints if parallel checkpoints are configured. The load_checkpoint.sh script now reliably detects the most recent checkpoint when '-latest' is used, serial or parallel. This script now also finds checkpoints for a standby of an edge. Also fixed bug cleaning up old *.OK files from earlier parallel checkpoints. The recover_edge.sh script similarly detects the latest checkpoint correctly, serial or parallel. This change was tested with a new regression test suite that operates in a Battle School Lab environment, allowing for more sophisticated testing of sequences of operations. #review-30655 |
||
#43 | 30600 | C. Thomas Tyler | Silenced a shellcheck concern. | ||
#42 | 30456 | C. Thomas Tyler | Added P4D_VERSION to output. | ||
#41 | 30350 | C. Thomas Tyler | Tweaked example. | ||
#40 | 30349 | C. Thomas Tyler |
Fix to load_checkpoint.sh so '-jo' and 'jo_latest' do not imply '-no_xu', and documented the fact that the '-jo*' options do imply '-r'. |
||
#39 | 30340 | C. Thomas Tyler |
load_checkpoint.sh v3.0.0: Made suitable for multi-pass replay of a checkpoint and subsequent numbered journals. This enables a technique useful in reducing downtime for migrations involving replay of a checkpoint possibly days ahead of the cutover (one pass), and then later replaying numbered journals. #review-30341 |
||
#38 | 30236 | C. Thomas Tyler |
load_checkpoint.sh: Add '-no_start' option. #review-30237 |
||
#37 | 30100 | C. Thomas Tyler |
Fixed doc issues in load_checkpoint.sh. The '-v<N>' verbosity control option was shown in the usage synopsis but was not implemented. The '-d/-D' options (which are documented and work as advertised) make the '-v' option (a carry over from an old script template) obsolete. Fixed other doc typos. No functional changes. #review-30101 @mark_zinthefer |
||
#36 | 29949 | C. Thomas Tyler | Added '-latest' option to synopsis for load_checkpoint.sh | ||
#35 | 29899 | C. Thomas Tyler | load_checkpoint.sh: Fixed warnings from 'find' call and fixed typo in error message. | ||
#34 | 29883 | C. Thomas Tyler | Added safety feature to defend against using '< /dev/null' and forgetting '-y'. | ||
#33 | 29881 | C. Thomas Tyler |
Added '-latest' option to load_checkpoint.sh. Selects latest checkpoint, be it a file (for serial checkpoints) or a directory (for parallel checkpoints). #review-29882 |
||
#32 | 29864 | C. Thomas Tyler |
load_checkpoint.sh v2.9: * Added support for replaying parallel checkpoint directories. * Changed order of operations so the replica service is started before the (optional) replay to the offline_db. * Fixed exit code bug where 'EXIT_CODE: 0' could be displayed even if there were errors. * Deprecated '-k' option to keep databases. Keeping metadata is now the default behavior. Added '-R' option to remove metadata. * Added a disk space safety check, projecting need based on size of previously existing db.* files. Added a '-F' safety factor option to control or bypass the safety check. * Fixed some errors that could be encountered in some conditions when moving aside existing db.* and other files. * Enhanced audit trail and generally nicer output. * The server.locks directory is now removed. * The rdb.lbr table and server.pid files are now handled (moved or removed depending on -R, along with db.* and other files). * Various doc improvements. * Added doc example, loading a parallel checkpoint on an edge. * Added doc tip re: the common practice of creating a log file named 'load.log' for non-interactive usage. * Enhanced so logic to infer the type tag, which had been applied to ServerID's specified with '-s', is now applied even if '-s' is not used. This results in less need to specify '-l' (to skip the license check) on server types like edges that don't need a license. * General internal improvements, moving toward the (not yet published) SDP Bash coding standard. #review-29865 |
||
#31 | 29316 | C. Thomas Tyler |
Enhanced robustness of start_p4d() and stop_p4d() interaction with systemd, and also in similar logic in load_checkpoint.sh. #review-29317 |
||
#30 | 29307 | C. Thomas Tyler |
load_checkpoint.sh: Added '-delay' option as a workaround for server job job079842. #review-29308 |
||
#29 | 29195 | C. Thomas Tyler | Fixed order-of-operations issue seen without standard shell environment. | ||
#28 | 29193 | C. Thomas Tyler |
load_checkpoint.sh v2.7.0: * If live P4JOURNAL is specified to replay, don't move it. * Added '-k' option to keep rather than delete db.* files. Updated test case to exercise new options. With these changes, this script, originally focused on loading checkpoints on replicas/edge servers, is now also suited to support recovery. See also: SDP-582, for a separate script focused on recovery scenarios, which can call this script as needed. #review-29164 @robert_cowham |
||
#27 | 28836 | C. Thomas Tyler |
load_checkpoint.sh v2.6.2: * Added logic to avoid errors if broker starts slowly. * Format clarification to usage message for '-verify' option. * Fixed typos in doc and code comments. |
||
#26 | 28641 | C. Thomas Tyler |
Tweaked scripts to support IPv6 SSL prefixes. Added test script to test bash code snippets. First test is the snippet to check if SSL is enabled, and if so get the SSL prefix. |
||
#25 | 28587 | C. Thomas Tyler |
load_checkpoint.sh v2.6.0: * Added '-verify' option to call p4verify.sh from load_checkpoint.sh. Default is to do a 'fast verify' if possible (if p4d is 2021.1+). Other options to p4verify.sh can be passed. #review-28588 |
||
#24 | 28585 | C. Thomas Tyler |
load_checkpoint.sh v2.5.2: * Added logic to load edge servers. * Added '-t <Type>' flag to specify replica types, with logic to infer the value from '-s <ServerID>' if the SDP Server Spec Naming Standard is followed. * Fixed bug with failing to detect start of server with an untrusted SSL connection. * Refined logic for handling license file checks, eliminating needless warnings and errors. * Refined docs, with extra info for handling edge servers and filtered forwarding replicas. #review-28586 |
||
#23 | 28584 | C. Thomas Tyler |
load_checkpoint.sh v2.4.4: * Enhanced error message for case where '-s <ServerID>' disagrees with server.id file. |
||
#22 | 28219 | C. Thomas Tyler |
Fixed issue with slow p4d start causing 'p4login' failures. #review-28220 |
||
#21 | 28203 | C. Thomas Tyler | Optimization to avoid referencing Journals array if empty. | ||
#20 | 28168 | C. Thomas Tyler |
Enhanced load_checkpoint.sh to accept a list of journal files to replay following the checkpoint. Fixed issue with 'zcat' on Mac. |
||
#19 | 27722 | C. Thomas Tyler |
Refinements to @27712: * Resolved one out-of-date file (verify_sdp.sh). * Added missing adoc file for which HTML file had a change (WorkflowEnforcementTriggers.adoc). * Updated revdate/revnumber in *.adoc files. * Additional content updates in Server/Unix/p4/common/etc/cron.d/ReadMe.md. * Bumped version numbers on scripts with Version= def'n. * Generated HTML, PDF, and doc/gen files: - Most HTML and all PDF are generated using Makefiles that call an AsciiDoc utility. - HTML for Perl scripts is generated with pod2html. - doc/gen/*.man.txt files are generated with .../tools/gen_script_man_pages.sh. #review-27712 |
||
#18 | 27064 | C. Thomas Tyler |
Fixed issue where 'source p4_vars' hangs if load_checkpoint.sh is running. Added new semaphore file, $P4ROOT/P4ROOT_not_usable.txt. This is used in a way similar to 'offline_db_usable.txt' in the offline_db, except that this file only exists when the databases in P4ROOT are not usable. This is the opposite of how offline_db_usable.txt works, because P4ROOT is expected to be usable 99.9% of the time. p4d_base will refuse to start p4d if this file exists, protecting against possible operator errors (like trying to start p4d when a checkpoint is still loading). Added check_file_dne() function to verify_sdp.sh to confirm a named file does not exist. Added checks in verify_sdp.sh that P4ROOT_not_usable.txt does not exist in P4ROOT or offline_db. Modified switch_db_files() (called by refresh_P4ROOT_from_offline_db.sh) to properly use the new P4ROOT_not_usable.txt safety file. Fixed bugs in p4d_base that could cause p4d_init.log to be overwritten if error output was generated. Removed call to 'backup_functions.sh' in p4d_base, as on balance it added more complexity than needed. #review-27065 |
||
#17 | 26940 | C. Thomas Tyler |
Fixed issue with load_checkpoint.sh not clearing journal.NNN file when working on a standby (journalcopy) replica, which prevented replication from automatically starting. These files are now moved aside. Removed undocumented edge processing logic determined to be unnecessary. Minor doc improvements. |
||
#16 | 26886 | C. Thomas Tyler |
Fixed issue in load_checkpoint.sh where the critical 'p4d -xu' step can fail due to lack of a license file with enough users for the data set. With this fix (borrowed from upgrade.sh), the critical upgrade step succeeds, and the user is not left with a failure in the middle of processing that would require manual cleanup. The p4d server will still refuse to start without a proper license later in the process, but at that point we're at a cleaner break point in processing, and the failure mode is clearer to admins. |
||
#15 | 26825 | C. Thomas Tyler | Fixed order of operations issue when '-s <ServerID>' was specified. | ||
#14 | 26649 | Robert Cowham |
More SDP Doc tidy up. Removed some command summary files. |
||
#13 | 26457 | C. Thomas Tyler | Fixed typo for '-L' flag. | ||
#12 | 26444 | C. Thomas Tyler |
load_checkpoint.sh 2.3.x: * Patch for bug with missing variable initializations. * Does trusts and logins for services users as needed after loading checkpoint. * Enhanced preflight check. ServerID file now required (can be specified) * Now generates SSL certs if needed. * Enhanced docs. |
||
#11 | 26400 | C. Thomas Tyler |
Added refresh_P4ROOT_from_offline_db.sh. Updated backup_functions.sh to support functionality for db refresh. Upgrade start_p4d() and stop_p4d() to use systemd if available, else use the underlying SysV init scripts. Updated verify_sdp.sh to be called from other scripts (sans its own logging). Added many checks to verify_sdp.sh to support P4ROOT/offline_db swap. Logic in P4ROOT/offline_db swap is more careful about what gets swapped. Added start_p4broker() and stop_p4broker() that behave similarly. More shellcheck compliance. #review-26401 |
||
#10 | 25949 | C. Thomas Tyler | Fixed typo in variable name. | ||
#9 | 25938 | C. Thomas Tyler | Fixed bug in safety check. | ||
#8 | 25920 | C. Thomas Tyler |
Added new 'edge_vars' file to dynamically set list of edge-specific db tables based on current P4D version. Updated edge_dump.sh, recover_edge.sh, and load_checkpoint.sh to use the new edge_vars file. Made edge_dump.sh and recover_edge.sh shellcheck v0.6.0 compliant, along with load_checkpoint.sh. |
||
#7 | 25790 | C. Thomas Tyler |
load_checkpoint.sh v2.0.5: * Added support for handling case-insensitive and case sensitive checkpoints. This works if $P4DBIN is a symlink rather than a wrapper supplying '-C1', as the need for '-C1' is dynamically detected. (This was driven by a need to incorporate load_checkpoint.sh into case conversion orchestration scripts using p4migrate). * Added '-r' flag to load checkpoint into P4ROOT only, skipping offline_db. * Added '-b' flag to restart broker at the end. * Normalized to template.sh structure, with typical benefits (enhanced command line parsing, standard usage message, better documentation). * Added check for missing certificate.txt if using SSL. * Made shellcheck v0.6.0 compliant. |
||
#6 | 24602 | C. Thomas Tyler | Fixed a few obvious typos. | ||
#5 | 24599 | C. Thomas Tyler | Fixed obvious typo. | ||
#4 | 24482 | C. Thomas Tyler |
Fixed obvious typo. Removed redundant time as it clutters the output and introduces an unnecessary dependency. Change to new/unreleased script, bypassing review. |
||
#3 | 24374 | C. Thomas Tyler | Added actual file content. | ||
#2 | 24368 | C. Thomas Tyler |
load_checkpoint.sh v1.0.6: * Added safety warning and interactive confirmation. * Supports systemd init mechanism rather than SysV init scripts if systemd *.service files are configured. Uses 'sudo systemctl start/stop' calls for p4d and p4broker. Use SysV init scripts on other systems. * Fixed bug with cleanup steps not executing. TO DO: Normalize into template.sh style with auto-logging and standard command line flags. |
||
#1 | 24198 | C. Thomas Tyler |
Added utility script to load a specified checkpoint into both /p4/N/root and /p4/N/offline_db, including logic to manage p4d/p4broker processes, ensure no db.* files exist prior to replaying the checkpoint, manage the offline_db_usable.txt file, etc. |