#!/bin/bash
#==============================================================================
# Copyright and license info is available in the LICENSE file included with
# the Server Deployment Package (SDP), and also available online:
# https://swarm.workshop.perforce.com/projects/perforce-software-sdp/view/main/LICENSE
#------------------------------------------------------------------------------

set -u

# Global Variables.
export P4DInitScript=
export P4DSystemdServiceFile=
export P4BrokerInitScript=
export P4BrokerSystemdServiceFile=
export P4ProxyInitScript=
export P4ProxySystemdServiceFile=
export CKPTMPDIR=
export OFFLINE_DB=
export EDGESERVER=
export STANDBYSERVER=

# Common functions used in various SDP scripts.

#------------------------------------------------------------------------------
# Verify key variables in the shell environment exist, or else abort.
#
# If checks in this function fail, this function does an 'echo' and 'exit 1'
# rather than calling 'log' or 'die', as this function is generally called
# early in processing, before the log is initialized.
#------------------------------------------------------------------------------
function check_vars () {

   local CheckVarsPreflightOK=1
   local CommonVars="SDP_INSTANCE P4HOME P4PORT P4ROOT P4JOURNAL P4BIN P4DBIN P4TICKETS P4TRUST KEEPCKPS KEEPJNLS KEEPLOGS CHECKPOINTS LOGS OSUSER"
   local InstanceVars="P4MASTER_ID P4MASTERPORT"

   # First, check vars that should be set in /p4/common/bin/p4_vars.
   for var in $CommonVars; do
      # Detect unset variables, using ':-' to avoid 'unbound variable' errors.
      # shellcheck disable=SC1083
      if [[ -z "$(eval echo \${"$var":-})" ]]; then
         echo "Error: Required variable \$$var is NOT set. It should be set in /p4/common/bin/p4_vars."
         CheckVarsPreflightOK=0
      fi
   done

   # Next, check vars that should be set in /p4/common/config/p4_N.instance.
   # For some variables, provide additional details that help users correct
   # the problem.
   for var in $InstanceVars; do
      # shellcheck disable=SC1083
      if [[ -z "$(eval echo \${"$var":-})" ]]; then
         echo "Error: Required variable \$$var is NOT set. It should be set in /p4/common/config/p4_N.vars, where N is the SDP instance name."
         if [[ "$var" == "P4MASTER_ID" ]]; then
            echo "The value for P4MASTER_ID should be the name of the ServerID of the master server."
         fi
         CheckVarsPreflightOK=0
      fi
   done

   if [[ "$CheckVarsPreflightOK" -eq 0 ]]; then
      echo "Use p4master_run or source p4_vars before calling this script."
      echo "Aborting due to errors in shell environment preflight checks."
      exit 1
   fi
}

#------------------------------------------------------------------------------
# is_edge ($ServerID, $RootDir)
#
# Determine if a given ServerID is an edge server or not, checking a given
# database root dir (e.g. $P4ROOT or $OFFLINE_DB).
#
# Input:
# $1 - ServerID (required)
# $2 - RootDir (optional, defaults to $P4ROOT)
#
# Output YES if an edge server, NO otherwise.
#
#------------------------------------------------------------------------------
function is_edge () {

   local ServerID=${1:-Unset}
   local RootDir=${2:-$P4ROOT}
   local ServicesData=
   local EdgeCheck=

   # Extract a slice of db.server referencing the given ServerID,
   # and then grab the field containing Services data.
   ServicesData=$("$P4DBIN" -r "$RootDir" -J off -L /dev/null -k db.server -jd - 2>&1 |\
      "$GREP" "@db.server@ @${ServerID}@" | "$CUT" -d '@' -f 13)

   # Do a bitwise operation to determine if the ServicesData value indicates
   # this is an edge server.
   if [[ -n "$ServicesData" ]]; then
      EdgeCheck=$((ServicesData & 4096))

      if [[ "$EdgeCheck" -gt 0 ]]; then
         echo YES
      else
         echo NO
      fi
   else
      echo NO
   fi
}

#------------------------------------------------------------------------------
# is_replica ($ServerID, $RootDir)
#
# Determine if a given ServerID is a replica server or not, checking a given
# database root dir (e.g. $P4ROOT or $OFFLINE_DB).
#
# Input:
# $1 - ServerID (required)
# $2 - RootDir (optional, defaults to $P4ROOT)
#
# Output YES if a replica server, NO otherwise.
#
#------------------------------------------------------------------------------
is_replica () {
   local ServerID="${1:-Unset}"
   local RootDir="${2:-$P4ROOT}"
   local ServicesData=

   # Extract a slice of db.server referencing the given ServerID,
   # and then grab the field containing Services data.
   ServicesData=$("$P4DBIN" -r "$RootDir" -J off -L /dev/null -k db.server -jd - 2>&1 |\
      "$GREP" "@db.server@ @${ServerID}@" | "$CUT" -d '@' -f 13 | tr -d ' ')

   # Do a check to see if the ServicesData value indicates this is a
   # replica server.
   if [[ -n "$ServicesData" ]]; then
      if [[ "$ServicesData" -eq '2533' ]]; then
         echo YES
      else
         echo NO
      fi
   else
      echo NO
   fi
}

#------------------------------------------------------------------------------
# is_standby ($ServerID, $RootDir)
#
# Determine if a given ServerID is a standby server or not, checking a given
# database root dir (e.g. $P4ROOT or $OFFLINE_DB).
#
# Input:
# $1 - ServerID (required)
# $2 - RootDir (optional, defaults to $P4ROOT)
#
# Output YES if a standby server, NO otherwise.
#
#------------------------------------------------------------------------------
function is_standby () {

   local ServerID="${1:-Unset}"
   local RootDir="${2:-$P4ROOT}"
   local ServicesData=

   # Extract a slice of db.server referencing the given ServerID,
   # and then grab the field containing Services data.
   ServicesData=$("$P4DBIN" -r "$RootDir" -J off -L /dev/null -k db.server -jd - 2>&1 |\
      "$GREP" "@db.server@ @${ServerID}@" | "$CUT" -d '@' -f 13 | tr -d ' ')

   # Do a check to see if the ServicesData value indicates this is a
   # standby server.
   if [[ -n "$ServicesData" ]]; then
      if [[ "$ServicesData" -eq '35141' || "$ServicesData" -eq '35301' ]]; then
         echo YES
      else
         echo NO
      fi
   else
      echo NO
   fi
}

#------------------------------------------------------------------------------
# Set variables for use in various scripts:
# OFFLINE_DB=path to offline db directory
# EDGESERVER=1 if this is an edge server, 0 otherwise.
# STANDBYSERVER=1 if this is a standby server, 0 otherwise.
#
# This must be called after loading the standard shell environment by
# doing:
# source /p4/common/bin/p4_vars N
#
# This sets P4HOME, SERVERID, etc. needed by this function.
#------------------------------------------------------------------------------
function set_vars () {

   P4DInitScript="$P4HOME/bin/p4d_${SDP_INSTANCE}_init"
   P4DSystemdServiceFile="/etc/systemd/system/p4d_${SDP_INSTANCE}.service"
   P4BrokerInitScript="$P4HOME/bin/p4broker_${SDP_INSTANCE}_init"
   P4BrokerSystemdServiceFile="/etc/systemd/system/p4broker_${SDP_INSTANCE}.service"
   P4ProxyInitScript="$P4HOME/bin/p4p_${SDP_INSTANCE}_init"
   P4ProxySystemdServiceFile="/etc/systemd/system/p4p_${SDP_INSTANCE}.service"

   OFFLINE_DB="${P4HOME}/offline_db"
   CKPTMPDIR="${CHECKPOINTS}/ckp_tmp"

   # shellcheck disable=SC2153
   if [[ -n "$SERVERID" ]]; then
      if [[ "$(is_edge "$SERVERID")" == YES ]]; then
         export EDGESERVER=1
      else
         export EDGESERVER=0
      fi
   else
      export EDGESERVER=0
   fi

   if [[ -n "$SERVERID" ]]; then
      if [[ "$(is_replica "$SERVERID")" == YES ]]; then
         export REPLICASERVER=1
         # Get commit server from P4TARGET setting in database
      else
         export REPLICASERVER=0
      fi
   else
      export REPLICASERVER=0
   fi

   if [[ -n "$SERVERID" ]]; then
      if [[ "$(is_standby "$SERVERID")" == YES ]]; then
         export STANDBYSERVER=1
         # Get commit server from P4TARGET setting in database
      else
         export STANDBYSERVER=0
      fi
   else
      export STANDBYSERVER=0
   fi

   # Ensure that SDP_ADMIN_PASSWORD_FILE is set, using existing value if set (e.g.
   # in p4_vars), otherwise set it to the SDP standard value.
   export SDP_ADMIN_PASSWORD_FILE="${SDP_ADMIN_PASSWORD_FILE:-Unset}"

   if [[ "$SDP_ADMIN_PASSWORD_FILE" == Unset ]]; then
      export SDP_ADMIN_PASSWORD_FILE="$P4CCFG/.p4passwd.${P4SERVER}.admin"
   fi
}

#------------------------------------------------------------------------------
# Check if user is running as required OS user.
#------------------------------------------------------------------------------
function check_uid () {
   local user
   user=$(id -un)
   if [[ "${user}" != "${OSUSER}" ]]; then
      die "Must be run by user: ${OSUSER}. Abort!"
   fi
}

#------------------------------------------------------------------------------
# Function log() - echo message to logfile or stdout.
#
# If $LOGFILE is defined, write message to the log file only; nothing goes to
# stdout. Prepend a datestamp.
# If $LOGFILE isn't defined, just echo to stdout, w/o timestamp.
# In all cases, support '-e' formatting.
# Input:
# $1 - message to log (must be quoted).
#------------------------------------------------------------------------------
function log () {
   if [[ "${LOGFILE:-Unset}" != Unset ]]; then
      echo -n "$(date)" >> "$LOGFILE" 2>&1
      echo -e " $0: $*" >> "$LOGFILE" 2>&1
   else
      echo -e "$@"
   fi
}

#------------------------------------------------------------------------------
# Decide depending on our mail utility, how to specify sender (if we need to).
# Mail on some platforms sets sender by default.
# If the mail utility returns what looks like a version identifier
# when given the '-V' flag, use a '-S' flag. If it does not return a
# version identifier, don't set a mail sender option.
# Allow GNU Mailutils alternative flag instead.
#------------------------------------------------------------------------------ function get_mail_sender_opt () { local mail_sender_opt= local mail_ver= if [[ -n "$MAILFROM" ]]; then mail_ver=$($SDPMAIL -V 2>&1) # shellcheck disable=SC2076 if [[ "$mail_ver" =~ "GNU Mailutils" ]]; then mail_sender_opt="-aFrom:$MAILFROM" elif [[ "$mail_ver" =~ ^[0-9]+\.[0-9] ]]; then mail_sender_opt="-S from=$MAILFROM" fi fi echo "$mail_sender_opt" } #------------------------------------------------------------------------------ # Email the log file by $LOGFILE. #------------------------------------------------------------------------------ function mail_log_file () { local subject=$1 local mail_sender_opt mail_sender_opt=$(get_mail_sender_opt) $SDPMAIL -s "$subject" "$mail_sender_opt" "$MAILTO" < "$LOGFILE" } #------------------------------------------------------------------------------ # Deliver the $LOGFILE via AWS SNS. #------------------------------------------------------------------------------ function sns_log_file () { # AWS SNS has a 100 character limit for subject field local subject=$1 local short_subject="$(echo ${subject:0:100})" echo -e "Sending alert and log file contents to administrator via AWS SNS." >&2 aws --region "$AWS_DEFAULT_REGION" sns publish --topic-arn "$SNS_ALERT_TOPIC_ARN" --subject "$subject" --message "$(cat "$LOGFILE")" } #------------------------------------------------------------------------------ # Function die() - log message, send email/SNS, and exit. # If $LOGFILE is defined, write message to the log file, email/SNS log, # and exit. # If $LOGFILE is not defined, write message to the stdout, and skip # email/SNS. # If in terminal session, display message to stderr as well. #------------------------------------------------------------------------------ function die () { # mail the error (with more helpful subject line than cron) log "ERROR!!! 
- $HOSTNAME $P4SERVER $0: $*" if [[ "${LOGFILE:-Unset}" != Unset ]]; then if [[ "${SNS_ALERT_TOPIC_ARN:-Unset}" != Unset ]]; then log "Using SNS for log file delivery..." sns_log_file "ERROR!!! - $HOSTNAME $P4SERVER $0: $*" else log "Using email for log file delivery..." mail_log_file "ERROR!!! - $HOSTNAME $P4SERVER $0: $*" fi fi # if running from terminal, also send to stderr if tty >/dev/null; then echo -e "$@" >&2 fi rm -f "${LOGS}/ckp_running.txt" exit 1 } #------------------------------------------------------------------------------ # Convert various byte values (K,M,G,%) to bytes # Pass in values such as 1024K, 512M, 1G or 10% #------------------------------------------------------------------------------ function convert_to_bytes () { local value=$1 local totalsize=${2:-Undefined} local size= local unit= # Break up value into size (numeric) and unit (K,M,G) size=$("$GREP" -Eo '[[:alpha:]%]+|[0-9]+' <<< "$value" | head -1) unit=$("$GREP" -Eo '[[:alpha:]%]+|[0-9]+' <<< "$value" | tail -1) # Based on unit, convert to bytes case "$unit" in K) echo $((size * 1024)) ;; M) echo $((size * 1024**2)) ;; G) echo $((size * 1024**3)) ;; %) echo $((totalsize * size / 100)) ;; esac } #------------------------------------------------------------------------------ # Write a semaphore file, $LOGS/ckp_running.txt. This file is written at # the start of processing, and removed upon successful completion. It # prevents multiple concurrent operations from being launched accidentally # e.g. by multiple human admins, or a human inadvertently competing with a # cron job. # # It is also intended to get human admins to determine the root cause of # checkpoint failures. #------------------------------------------------------------------------------ function ckp_running() { if [[ -f "${LOGS}/ckp_running.txt" ]]; then die "Last checkpoint not complete. Check the backup process or contact support." fi echo "Checkpoint running." 
> "${LOGS}/ckp_running.txt" } #------------------------------------------------------------------------------ # Remove the ckp_running.txt semaphore file when checkpoint processing is # complete. #------------------------------------------------------------------------------ function ckp_complete() { rm -f "${LOGS}/ckp_running.txt" } #------------------------------------------------------------------------------ # Ensure key directories are writable. Abort if they are not. #------------------------------------------------------------------------------ function check_dirs () { # Check that key dirs are writable local -i dirsOK=1 dirList="$OFFLINE_DB $CHECKPOINTS $LOGS" [[ "$EDGESERVER" -eq 1 ]] && dirList+=" ${CHECKPOINTS}.${SERVERID#p4d_}" for dir in $dirList; do if [[ ! -d "$dir" || ! -w "$dir" ]]; then log "Error: Dir $dir does not exist or is not writable." dirsOK=0 fi done [[ "$dirsOK" -eq 1 ]] || die "Some expected dirs are missing or not writable. Aborting." } #------------------------------------------------------------------------------ # Add the results of df -h or df -m to the log file. #------------------------------------------------------------------------------ function check_disk_space () { log "Checking disk space..." $P4BIN diskspace >> "$LOGFILE" 2>&1 } #------------------------------------------------------------------------------ # Check value of journal; ensure it is an integer. #------------------------------------------------------------------------------ function check_journalnum () { local JNLNUM=${1:-Unset} re='^[0-9]+$' if ! [[ $JNLNUM =~ $re ]] ; then die "The journal counter value [$JNLNUM] is invalid. It must be numeric." 
fi } #------------------------------------------------------------------------------ # Check the checkpoints directory for the oldest checkpoint #------------------------------------------------------------------------------ function get_ckpnum () { if [[ "$EDGESERVER" -eq 0 ]]; then # shellcheck disable=SC2034 disable=SC2012 disable=SC2016 OLDESTCHECKPOINT=$(ls -1tr "${CHECKPOINTS}/" | "$GREP" ckp | "$GREP" -v md5 | head -n 1 | "$AWK" -F '.ckp.' '{ print $(2) }' | tr -d '.gz') else # shellcheck disable=SC2034 disable=SC2012 disable=SC2016 OLDESTCHECKPOINT=$(ls -1tr "${CHECKPOINTS}.${SERVERID#p4d_}/" | "$GREP" ckp | "$GREP" -v md5 | head -n 1 | "$AWK" -F '.ckp.' '{ print $(2) }' | tr -d '.gz') fi } #------------------------------------------------------------------------------ # Determine journal counter by checking counter in db.counters. #------------------------------------------------------------------------------ get_journalnum () { # get the current journal and checkpoint serial numbers. local nextCheckpointNum if [[ -r "$P4ROOT/db.counters" ]]; then nextCheckpointNum=$("$P4DBIN" -r "$P4ROOT" -k db.counters -jd - 2>&1 | grep @journal@ | cut -d '@' -f 8) if [[ -n "$nextCheckpointNum" ]]; then check_journalnum "$nextCheckpointNum" JOURNALNUM="$nextCheckpointNum" else # Special case: If db.counters is empty, then we have a new/empty data # set, so just set the value to 0. JOURNALNUM=0 fi else # Special case: If db.counters doesn't exist, then we have a new/empty # data set, so just set the value to 0. JOURNALNUM=0 fi # If we are on an edge server, the journal has already rotated, so we have to decrement the value # so that we replay the correct journal file and create the correct checkpoint number on the # edge server. # # In the case of a standby server, the journal rotation occurs on the master server, # so we don't need to increment the journal number again, so we decrement by 1. 
# Also, when replaying the journals to the offline db, we don't want to play to the live journal # because it is still being replicated. if [[ "$EDGESERVER" -eq 1 || "$REPLICASERVER" -eq 1 || "$STANDBYSERVER" -eq 1 ]]; then JOURNALNUM=$((JOURNALNUM - 1)) fi CHECKPOINTNUM=$((JOURNALNUM + 1)) } #------------------------------------------------------------------------------ # Determine journal space usage and minimum disk space requirement #------------------------------------------------------------------------------ get_journal_stats () { # Get minimum disk space required on server journal filesystem before server rejects commands # This will return the configured and default value, but grab the configured value which shows first # If a configured value is not present, it will use the default value # shellcheck disable=SC2034 disable=SC2016 P4JOURNALMIN=$("$P4BIN" configure show filesys.P4JOURNAL.min | "$AWK" '{ print $1 }' | $CUT -d'=' -f2 | head -1) # Get current journal free disk space # shellcheck disable=SC2034 P4JOURNALFREE=$("$P4BIN" -ztag -F "%freeBytes%" diskspace P4JOURNAL) # Get total available disk space for journal # shellcheck disable=SC2034 P4JOURNALTOTAL=$("$P4BIN" -ztag -F "%totalBytes%" diskspace P4JOURNAL) } #------------------------------------------------------------------------------ # Verify that the offline databases are usable by checking the existence # of a 'offline_db_usable.txt' file that is written only when databases # are in a known-good state, following successful recovery from a checkpoint. #------------------------------------------------------------------------------ check_offline_db_usable () { # Check it is OK if [[ ! -f "$OFFLINE_DB/offline_db_usable.txt" ]]; then die "Offline database not in a usable state. Check the backup process." fi if [[ ! -f "$OFFLINE_DB/db.counters" ]]; then die "Offline database not found. Consider creating it with live_checkpoint.sh. 
Be aware that live_checkpoint.sh locks the live system and may take a long time. Aborting." fi } #------------------------------------------------------------------------------ # Determine journal counter in offline databases. #------------------------------------------------------------------------------ get_offline_journal_num () { # Get the journal number of the offline database check_offline_db_usable OFFLINEJNLNUM=$("$P4DBIN" -r "$OFFLINE_DB" -jd - db.counters 2>&1 | grep '@journal@' | cut -d "@" -f 8 2>> "$LOGFILE") || die "Cannot get the offline journal number. Abort!" check_journalnum "$OFFLINEJNLNUM" log "Offline journal number is: $OFFLINEJNLNUM" } #------------------------------------------------------------------------------ # Cleanup old checkpoint and numbered journal files. #------------------------------------------------------------------------------ remove_old_checkpoints_and_journals () { local CheckpointsDir= local StandbyReplicaJournalsDir= local FilePrefix= local JournalPrefix= local File= if [[ "$KEEPCKPS" -eq 0 ]]; then log "Skipping cleanup of old checkpoints because KEEPCKPS is set to 0." else log "Deleting obsolete checkpoints and journals. Keeping latest $KEEPCKPS per KEEPCKPS setting in p4_vars." # For the master server, we can safely rely on the SDP standard that the journalPrefix # is of the form '/p4/N/checkpoints/p4_N'. For replicas and edge servers, determine # the values dynamically based on the current journalPrefix value for the given ServerID. if [[ "$SERVERID" == "$P4MASTER_ID" ]]; then # Refer to ckp/jnl files starting like: /p4/N/checkpoints/p4_N CheckpointsDir="${CHECKPOINTS}" FilePrefix="${P4SERVER}" else JournalPrefix="$("$P4DBIN" -r "$P4ROOT" -k db.config -jd - | grep "@${SERVERID}@ @journalPrefix@" | cut -d '@' -f 10)" if [[ -n "$JournalPrefix" ]]; then CheckpointsDir="${JournalPrefix%/*}" FilePrefix="${JournalPrefix##*/}" else log "Warning: Could not determine journalPrefix for ServerID $SERVERID." 
CheckpointsDir="${CHECKPOINTS}" FilePrefix="${P4SERVER}" fi fi if [[ -d "$CheckpointsDir" ]]; then # Remove selected checkpoint and journal files based on the KEEPCKPS # setting regardless of whether compressed or not. # We multiply KEEPCKP by 2 for the ckp files because of the md5 files. # shellcheck disable=SC2012 for File in $(ls -t "${CheckpointsDir}/${FilePrefix}".ckp.* 2>/dev/null | "$AWK" "NR > ($KEEPCKPS * 2)"); do log "rm -f $File" rm -f "$File" done # Use KEEPJNLS to allow for separate journal rotation at a higher # frequency. # shellcheck disable=SC2012 for File in $(ls -t "${CheckpointsDir}/${FilePrefix}".jnl.* 2>/dev/null | "$AWK" "NR > $KEEPJNLS"); do log "rm -f $File" rm -f "$File" done fi StandbyReplicaJournalsDir="${P4HOME}/journals.rep" if [[ -d "$StandbyReplicaJournalsDir" ]]; then # shellcheck disable=SC2012 for File in $(ls -t "$StandbyReplicaJournalsDir/${FilePrefix}".ckp.* 2>/dev/null | "$AWK" "NR > ($KEEPCKPS * 2)"); do log "rm -f $File" rm -f "$File" done # shellcheck disable=SC2012 for File in $(ls -t "${StandbyReplicaJournalsDir}/${FilePrefix}".jnl.* 2>/dev/null | "$AWK" "NR > $KEEPJNLS"); do log "rm -f $File" rm -f "$File" done fi # This is a workaround to cleanup $LOGS/journal.NNN files on standby replicas. # These files are normally removed by p4d during journal rotation on the standby # replica. Use only if standby journals are not removed due to a standby replica # sharing /hxdepots with its P4TARGET server. To use this workround, add this # line to the end of the /p4/common/config/p4_N.vars file: # # export SDP_REMOVE_STANDBY_JOURNALS=1 # if [[ "${SDP_REMOVE_STANDBY_JOURNALS:-0}" == 1 && "$(is_standby "$SERVERID")" == YES ]]; then log "Removing excess journal.NNN files due to SDP_REMOVE_STANDBY_JOURNALS=1." # shellcheck disable=SC2012 for File in $(ls -t "${LOGS}/journal."* 2>/dev/null | "$AWK" "NR > $KEEPJNLS"); do # Process only files named 'journal.NNN' in $LOGS. 
[[ "$File" =~ /journal.[0-9]+$ ]] || continue log "rm -f $File" rm -f "$File" done fi fi } #------------------------------------------------------------------------------ # Function: is_server_up ($server) # # Input: # $1 - server, one of 'p4d', 'p4p', or 'p4broker' # # Output: None # # Return Codes: # 0: Server is up. # 1: Server is down. # 2: Bad usage. # # Server up/down status is checked using the appropriate init script. #------------------------------------------------------------------------------ function is_server_up () { local server="${1:-Unset}" case "$server" in (p4d) "$P4DInitScript" status > /dev/null 2>&1 return $? ;; (p4broker) "$P4BrokerInitScript" status > /dev/null 2>&1 return $? ;; (p4p) "$P4ProxyInitScript" status > /dev/null 2>&1 return $? ;; (Unset) log "Internal Error: is_server_up(): No server type specified." return 2 ;; (*) log "Internal Error: is_server_up(): Unknown server specified: $server" return 2 ;; esac } #------------------------------------------------------------------------------ # Shutdown p4d using systemd if configured for systemd. Otherwise call the # underlying init script directly. # # Log the shutdown activity. # #------------------------------------------------------------------------------ stop_p4d () { log "Shutting down the ${P4DBIN##*/} server." local -i maxStopDelay=${SDP_MAX_STOP_DELAY_P4D:-43200} local -i stopVerified=0 local -i i=0 if [[ -r "$P4DSystemdServiceFile" ]]; then { sudo systemctl stop "${P4DBIN##*/}"; } >> "$LOGFILE" 2>&1 ||\ die "Failed to execute: sudo systemctl stop ${P4DBIN##*/}" # With systemd, we must independently confirm service stop, # waiting if needed. stopVerified=0 i=0; while [[ "$i" -lt "$maxStopDelay" ]]; do if is_server_up p4d; then sleep 1 else stopVerified=1 break fi i+=1 done else "$P4DInitScript" stop >> "$LOGFILE" 2>&1 stopVerified=1 fi if [[ "$stopVerified" -eq 1 ]]; then log "Stopped ${P4DBIN##*/} server." 
return 0 else log "Error: Server ${P4DBIN##*/} did not stop after $maxStopDelay seconds. Tailing $P4LOG:" tail "$P4LOG" >> "$LOGFILE" 2>&1 die "Aborting due to failed p4d stop." fi } #------------------------------------------------------------------------------ # Shutdown p4broker using systemd if configured for systemd. Otherwise call the # underlying init script directly. # # Log the shutdown activity. # #------------------------------------------------------------------------------ stop_p4broker () { log "Shutting down the ${P4BROKERBIN##*/} server." local -i maxStopDelay=${SDP_MAX_STOP_DELAY_P4BROKER:-600} local -i stopVerified=0 local -i i=0 if [[ -r "$P4BrokerSystemdServiceFile" ]]; then { sudo systemctl stop "${P4BROKERBIN##*/}"; } >> "$LOGFILE" 2>&1 ||\ die "Failed to execute: sudo systemctl stop ${P4BROKERBIN##*/}" # With systemd, we must independently confirm service stop, # waiting if needed. stopVerified=0 i=0; while [[ "$i" -lt "$maxStopDelay" ]]; do if is_server_up p4broker; then sleep 1 else stopVerified=1 break fi i+=1 done else "$P4BrokerInitScript" stop >> "$LOGFILE" 2>&1 stopVerified=1 fi if [[ "$stopVerified" -eq 1 ]]; then log "Stopped ${P4BROKERBIN##*/} server." return 0 else die "Server ${P4BROKERBIN##*/} did not stop after $maxStartDelay seconds." fi } #------------------------------------------------------------------------------ # Shutdown p4p using systemd if configured for systemd. Otherwise call the # underlying init script directly. # # Log the shutdown activity. # #------------------------------------------------------------------------------ stop_p4p () { log "Shutting down the ${P4PBIN##*/} server." 
local -i maxStopDelay=${SDP_MAX_STOP_DELAY_P4P:-600} local -i stopVerified=0 local -i i=0 if [[ -r "$P4ProxySystemdServiceFile" ]]; then { sudo systemctl stop "${P4PBIN##*/}"; } >> "$LOGFILE" 2>&1 ||\ die "Failed to execute: sudo systemctl stop ${P4PBIN##*/}" # With systemd, we must independently confirm service stop, # waiting if needed. stopVerified=0 i=0; while [[ "$i" -lt "$maxStopDelay" ]]; do if is_server_up p4p; then sleep 1 else stopVerified=1 break fi i+=1 done else "$P4ProxyInitScript" stop >> "$LOGFILE" 2>&1 stopVerified=1 fi if [[ "$stopVerified" -eq 1 ]]; then log "Stopped ${P4PBIN##*/} server." return 0 else die "Server ${P4PBIN##*/} did not stop after $maxStopDelay seconds." fi } #------------------------------------------------------------------------------ # Start p4d using systemd if configured for systemd. Otherwise call the # underlying init script directly. # # Log the startup activity. # # This is a do-or-die function. It returns success upon successful server # startup, or else dies. #------------------------------------------------------------------------------ function start_p4d () { log "Starting the ${P4DBIN##*/} server." local -i maxStartDelay=${SDP_MAX_START_DELAY_P4D:-120} local -i startVerified=0 local -i i=0 if [[ -r "$P4DSystemdServiceFile" ]]; then { sudo systemctl start "${P4DBIN##*/}"; } ||\ die "Failed to execute: sudo systemctl start ${P4DBIN##*/}" else "$P4DInitScript" start >> "$LOGFILE" 2>&1 fi # Confirm that p4d started, waiting if needed. startVerified=0 i=0; while [[ "$i" -lt "$maxStartDelay" ]]; do if is_server_up p4d; then startVerified=1 break else sleep 1 fi i+=1 done if [[ "$startVerified" -eq 1 ]]; then log "Server ${P4DBIN##*/} started successfully." return 0 else log "Error: Server ${P4DBIN##*/} did not start after $maxStartDelay seconds. Tailing $P4LOG:" tail "$P4LOG" >> "$LOGFILE" 2>&1 die "Aborting due to failed p4d start." 
fi } #------------------------------------------------------------------------------ # Start p4broker using systemd if configured for systemd. Otherwise call the # underlying init script directly. # # Log the startup activity. # # This is a do-or-die function. It returns success upon successful server # startup, or else dies. #------------------------------------------------------------------------------ function start_p4broker () { log "Starting the ${P4BROKERBIN##*/} server." local -i maxStartDelay=${SDP_MAX_START_DELAY_P4BROKER:-60} local -i startVerified=0 local -i i=0 if [[ -r "$P4BrokerSystemdServiceFile" ]]; then { sudo systemctl start "${P4BROKERBIN##*/}"; } ||\ die "Failed to execute: sudo systemctl start ${P4BROKERBIN##*/}" else "$P4BrokerInitScript" start >> "$LOGFILE" 2>&1 fi # Confirm that p4broker started, waiting if needed. startVerified=0 i=0; while [[ "$i" -lt "$maxStartDelay" ]]; do if is_server_up p4broker; then startVerified=1 break else sleep 1 fi i+=1 done if [[ "$startVerified" -eq 1 ]]; then log "Server ${P4BROKERBIN##*/} started successfully." return 0 else die "Server ${P4BROKERBIN##*/} did not start after $maxStartDelay seconds." fi } #------------------------------------------------------------------------------ # Start p4p using systemd if configured for systemd. Otherwise call the # underlying init script directly. # # Log the startup activity. # # This is a do-or-die function. It returns success upon successful server # startup, or else dies. #------------------------------------------------------------------------------ function start_p4p () { log "Starting the ${P4PBIN##*/} server." local -i maxStartDelay=${SDP_MAX_START_DELAY_P4P:-60} local -i startVerified=0 local -i i=0 if [[ -r "$P4ProxySystemdServiceFile" ]]; then { sudo systemctl start "${P4PBIN##*/}"; } ||\ die "Failed to execute: sudo systemctl start ${P4PBIN##*/}" else "$P4ProxyInitScript" start >> "$LOGFILE" 2>&1 fi # Confirm that p4p started, waiting if needed. 
startVerified=0 i=0; while [[ "$i" -lt "$maxStartDelay" ]]; do if is_server_up p4p; then startVerified=1 break else sleep 1 fi i+=1 done if [[ "$startVerified" -eq 1 ]]; then log "Server ${P4PBIN##*/} started successfully." return 0 else die "Server ${P4PBIN##*/} did not start after $maxStartDelay seconds." fi } #------------------------------------------------------------------------------ # Do a front-door 'p4d admin journal' command to rotate the current/active # journal file on the master server, starting a fresh new P4JOURNAL file. # # In a distributed topology with replicas/edge servers, this function must # be called only on the master/commit server. #------------------------------------------------------------------------------ function truncate_journal () { local CheckpointFile="${CHECKPOINTS}/${P4SERVER}.ckp.${CHECKPOINTNUM}.gz" local JournalFile="${CHECKPOINTS}/${P4SERVER}.jnl.${JOURNALNUM}" if [[ "$SERVERID" == "$P4MASTER_ID" ]]; then [[ -f "$CheckpointFile" ]] && \ die "Checkpoint $CheckpointFile already exists, check the backup process." [[ -f "$JournalFile" ]] && \ die "Journal $JournalFile already exists, check the backup process." log "Truncating journal..." # During journal rotation, either by a front-door 'p4 admin journal' or a # back-door 'p4d -jj', p4d does a copy-then-delete rather than an mv at # the OS level. During rotation, the perforce server will pause # responses to clients (just as with a checkpoint), but this should be # for a short period of time even for large data sets, as the journal # typically represents a single day of metadata. # Curly braces capture output of 'time'. "$P4CBIN"/p4login -p "$P4MASTERPORT" { time "$P4BIN" -p "$P4MASTERPORT" admin journal; } >> "$LOGFILE" 2>&1 || { die "Journal rotation failed. Abort!"; } # The test below waits until the journal file exists in the checkpoints directory before proceeding. 
test=1 while [[ $test != 0 ]]; do sleep 5 if [[ -f "$JournalFile" ]]; then test=0 fi done "$P4CBIN"/p4login else log "Warning: The truncate_journal () function should only be called on the master server. It is ignored on edge and replica replica servers." fi } #------------------------------------------------------------------------------ # Call 'p4d -jj' to rotate the current/active journal file on the master # server from an edge server, starting a fresh new P4JOURNAL file. # # In a distributed topology with edge and standby servers, this function can be # used to trigger a journal rotation on master/commit server. It's not meant to # be used from the master server itself. #------------------------------------------------------------------------------ function truncate_journal_on_master () { # Increment Edge journal number since the journal will increment on the master after calling journal rotation local EdgeJournalNum=$((JOURNALNUM + 1)) local StandbyJournalNum=$((JOURNALNUM + 2)) # If using journalcopy, have to add 2 since live journal is in checkpoints folder local JournalFile= if [[ "$EDGESERVER" -eq 1 ]]; then # Refer to ckp/jnl files starting like (example ServerID=p4d_edge_nyc): # /p4/N/checkpoints.edge_nyc/p4_N.edge_nyc JournalFile="${CHECKPOINTS}.${SERVERID#p4d_}/${P4SERVER}.${SERVERID#p4d_}.jnl.${EdgeJournalNum}" elif [[ "$STANDBYSERVER" -eq 1 ]]; then JournalFile="${CHECKPOINTS}/${P4SERVER}.jnl.${StandbyJournalNum}" fi if [[ "$SERVERID" != "$P4MASTER_ID" ]]; then [[ -f "$JournalFile" ]] && \ die "Journal $JournalFile already exists, check the backup process." log "Truncating journal on ${P4MASTERPORT}." # 'p4d -jj' does a copy-then-delete, instead of a simple mv. # During 'p4d -jj' the perforce server will hang the responses to clients, # this should be for a very short period of time even for large data # sets, as the journal represents a single day of metadata. # Curly braces capture output of 'time'. 
"$P4CBIN"/p4login -p "$P4MASTERPORT" { time "$P4BIN" -p "$P4MASTERPORT" admin journal; } >> "$LOGFILE" 2>&1 || { die "Journal rotation failed. Abort!"; } # The test below waits until the journal file exists in the checkpoints directory before proceeding. test=1 while [[ $test != 0 ]]; do sleep 5 if [[ -f "$JournalFile" ]]; then test=0 fi done "$P4CBIN"/p4login -service else log "Warning: truncate_journal_on_master () function should not be called on a master server. Ignoring." fi } #------------------------------------------------------------------------------ # Similar to truncate_journal() above, p4d_truncate_journal() is intended to be # usable form the p4d_base init script, to allow journal rotation on p4d # start. As it may be called from the init script, it may be called on the # master, a replica, or the edge. However, it should will only do the journal # rotation if called on the master. #------------------------------------------------------------------------------ function p4d_truncate_journal () { local JournalFile="${CHECKPOINTS}/${P4SERVER}.jnl.${JOURNALNUM}" if [[ "$SERVERID" == "$P4MASTER_ID" ]]; then [[ -f "$JournalFile" ]] && \ die "Journal $JournalFile already exists, check the backup process." log "Rotating journal prior to starting p4d." "$P4DBIN" -r "$P4ROOT" -J "$P4JOURNAL" -jj >> "$LOGFILE" 2>&1 ||\ die "Failed to rotate journal. Aborting p4d server start." else log "Warning: The p4d_truncate_journal() function has no effect if called on a server other than the master. Ignoring." fi } #------------------------------------------------------------------------------ # Replay any and all numbered journal files into the offline databases. #------------------------------------------------------------------------------ function replay_journals_to_offline_db () { local CheckpointsDir= local FilePrefix= local NumberedJournal= local JournalPrefix= log "Replay any unreplayed journals to the offline database." 
check_offline_db_usable # For the master server, we can safely rely on the SDP standard that the # journalPrefix is of the form '/p4/N/checkpoints/p4_N'. For replicas and # edge servers, determine the values dynamically based on the current journal # Prefix value for the given ServerID. if [[ "$SERVERID" == "$P4MASTER_ID" ]]; then # Refer to ckp/jnl files starting like: /p4/N/checkpoints/p4_N CheckpointsDir="${CHECKPOINTS}" FilePrefix="${P4SERVER}" else JournalPrefix="$("$P4DBIN" -r "$P4ROOT" -k db.config -jd - | grep "@${SERVERID}@ @journalPrefix@" | cut -d '@' -f 10)" if [[ -n "$JournalPrefix" ]]; then CheckpointsDir="${JournalPrefix%/*}" FilePrefix="${JournalPrefix##*/}" else log "Warning: Could not determine journalPrefix for ServerID $SERVERID." CheckpointsDir="${CHECKPOINTS}" FilePrefix="${P4SERVER}" fi fi for (( j=OFFLINEJNLNUM; j <= JOURNALNUM; j++ )); do NumberedJournal="${CheckpointsDir}/${FilePrefix}.jnl.${j}" log "Replay journal $NumberedJournal to offline db." rm -f "${OFFLINE_DB}/offline_db_usable.txt" >> "$LOGFILE" 2>&1 # Curly braces capture output of 'time'. { time "$P4DBIN" -r "$OFFLINE_DB" -jr -f "$NumberedJournal"; } >> "$LOGFILE" 2>&1 || { die "Offline journal replay failed. Abort!"; } echo "Offline db file restored successfully." > "${OFFLINE_DB}/offline_db_usable.txt" done } #------------------------------------------------------------------------------ # Replay the live, active P4JOURNAL file into the offline database. #------------------------------------------------------------------------------ function replay_active_journal_to_offline_db () { log "Replay active journal to offline db." local ActiveJournal= # On a standby server, the current/active journal is named /p4/N/logs/journal.<jnlNum>. # On the master and other server types, the active journal is $P4JOURNAL. 
# NOTE(review): the lines below are the tail of a function whose header is
# above this chunk (it replays the active journal into offline_db). Left
# byte-identical; only line breaks and comments are restored here.
   if [[ "$STANDBYSERVER" -eq 1 ]]; then
      local _JNLNUM
      # On a standby, the active journal lives under $LOGS as journal.N; derive
      # N from the live db.counters '@journal@' record.
      _JNLNUM=$("$P4DBIN" -r "$P4ROOT" -jd - db.counters 2>&1 | grep '@journal@' | cut -d "@" -f 8 2>> "$LOGFILE") || die "Cannot get $P4ROOT journal number. Abort!"
      ActiveJournal="$LOGS/journal.$_JNLNUM"
   else
      ActiveJournal="$P4JOURNAL"
   fi

   # Curly braces capture output of 'time'.
   { time "$P4DBIN" -r "$OFFLINE_DB" -jr -f "${ActiveJournal}"; } >> "$LOGFILE" 2>&1 || { die "Active Journal replay failed. Abort!"; }
}

#------------------------------------------------------------------------------
# Recreate offline databases from the latest checkpoint.
#------------------------------------------------------------------------------
function recreate_offline_db_files () {
   local CheckpointsDir=
   local FilePrefix=
   local LastCheckpointMD5=
   local LastCheckpoint=

   # For the master server, we can safely rely on the SDP standard that the journalPrefix
   # is of the form '/p4/N/checkpoints/p4_N'. For replicas and edge servers, determine
   # the values dynamically based on the current journalPrefix value for the given ServerID.
   if [[ "$SERVERID" == "$P4MASTER_ID" ]]; then
      # Refer to ckp/jnl files starting like: /p4/N/checkpoints/p4_N
      CheckpointsDir="${CHECKPOINTS}"
      FilePrefix="${P4SERVER}"
   else
      # Pull this server's journalPrefix from db.config; dir and file prefix
      # are derived from it.
      JournalPrefix="$("$P4DBIN" -r "$P4ROOT" -k db.config -jd - | grep "@${SERVERID}@ @journalPrefix@" | cut -d '@' -f 10)"
      if [[ -n "$JournalPrefix" ]]; then
         CheckpointsDir="${JournalPrefix%/*}"
         FilePrefix="${JournalPrefix##*/}"
      else
         log "Warning: Could not determine journalPrefix for ServerID $SERVERID."
         CheckpointsDir="${CHECKPOINTS}"
         FilePrefix="${P4SERVER}"
      fi
   fi

   if [[ -z "$(ls "${CheckpointsDir}/${FilePrefix}".ckp.*.md5)" ]]; then
      ckp_complete
      if [[ "$SERVERID" == "$P4MASTER_ID" ]]; then
         die "No checkpoints found in $CheckpointsDir with prefix $FilePrefix. Consider running 'live_checkpoint.sh $SDP_INSTANCE'."
      else
         die "No checkpoints found in $CheckpointsDir with prefix $FilePrefix."
      fi
   fi

   # shellcheck disable=SC2012
   LastCheckpointMD5=$(ls -t "${CheckpointsDir}/${FilePrefix}".ckp.*.md5 | head -1)

   [[ -n "$LastCheckpointMD5" ]] || \
      die "Could not find *.md5 file for latest checkpoint. Abort!"

   # Clear out the offline_db contents; the usable-marker file is removed first
   # so a partial restore is never mistaken for a good offline_db.
   # shellcheck disable=SC2129
   rm -f "${OFFLINE_DB}"/offline_db_usable.txt >> "$LOGFILE" 2>&1
   rm -f "${OFFLINE_DB}"/db.* >> "$LOGFILE" 2>&1
   rm -f "${OFFLINE_DB}"/save/db.* >> "$LOGFILE" 2>&1

   # Account for the idiosyncrasy that MD5 files for checkpoints may look
   # like p4_N.ckp.gz.md5 or p4_N.ckp.md5.
   if [[ "$LastCheckpointMD5" == *".gz.md5" ]]; then
      LastCheckpoint="${LastCheckpointMD5%.md5}"
   else
      LastCheckpoint="${LastCheckpointMD5%.md5}.gz"
   fi

   [[ -r "$LastCheckpoint" ]] || \
      die "Missing last checkpoint file: $LastCheckpoint. Abort!"

   log "Recovering from last full checkpoint, $LastCheckpoint."

   # Curly braces capture output of 'time'.
   { time "$P4DBIN" -r "$OFFLINE_DB" -jr -z "${LastCheckpoint}"; } >> "$LOGFILE" 2>&1 || { die "Restore of checkpoint to $OFFLINE_DB failed!"; }

   echo "Offline db file restored successfully." > "${OFFLINE_DB}/offline_db_usable.txt"
}

#------------------------------------------------------------------------------
# Take a live checkpoint from db.* files in P4ROOT.
#------------------------------------------------------------------------------
function checkpoint () {
   local CheckpointsDir=
   local FilePrefix=

   log "Create a new checkpoint from live db files in $P4ROOT."

   if [[ "$SERVERID" == "$P4MASTER_ID" ]]; then
      # Refer to ckp/jnl files starting like: /p4/N/checkpoints/p4_N
      CheckpointsDir="${CHECKPOINTS}"
      FilePrefix="${P4SERVER}"
   else
      die "Live checkpoints can only be run on the master server."
   fi

   # Curly braces capture output of 'time'.
   { time "$P4DBIN" -r "$P4ROOT" -jc -Z "${CheckpointsDir}/${FilePrefix}"; } >> "$LOGFILE" 2>&1 || { die "ERROR - New live checkpoint failed!"; }
}

#------------------------------------------------------------------------------
# Take a checkpoint from the ROOTDIR, typically either /p4/N/root or
# /p4/N/offline_db.
#------------------------------------------------------------------------------
function dump_checkpoint () {
   local CheckpointsDir=
   local NewCheckpoint=
   local NewCheckpointMD5=
   local FilePrefix=
   local JournalPrefix=
   local -i DoSnapshot=0
   local -i SnapshotOK=1
   local -i CheckpointOK=1

   # shellcheck disable=SC2153
   log "Dump out new checkpoint from db files in $ROOTDIR."

   # For the master server, we can safely rely on the SDP standard that the journalPrefix
   # is of the form '/p4/N/checkpoints/p4_N'. For replicas and edge servers, determine
   # the values dynamically based on the current journalPrefix value for the given ServerID.
   if [[ "$SERVERID" == "$P4MASTER_ID" ]]; then
      # Refer to ckp/jnl files starting like: /p4/N/checkpoints/p4_N
      CheckpointsDir="${CHECKPOINTS}"
      FilePrefix="${P4SERVER}"
   else
      JournalPrefix="$("$P4DBIN" -r "$P4ROOT" -k db.config -jd - | grep "@${SERVERID}@ @journalPrefix@" | cut -d '@' -f 10)"
      if [[ -n "$JournalPrefix" ]]; then
         CheckpointsDir="${JournalPrefix%/*}"
         FilePrefix="${JournalPrefix##*/}"
      else
         log "Warning: Could not determine journalPrefix for ServerID $SERVERID."
         CheckpointsDir="${CHECKPOINTS}"
         FilePrefix="${P4SERVER}"
      fi
   fi

   NewCheckpoint="${CheckpointsDir}/${FilePrefix}.ckp.${CHECKPOINTNUM}.gz"
   NewCheckpointMD5="${NewCheckpoint}.md5"

   # Idempotency guard: if this checkpoint (and its MD5) already exists, skip.
   if [[ -r "$NewCheckpoint" && -r "$NewCheckpointMD5" ]]; then
      log "Warning: Skipping generation of existing checkpoint $NewCheckpoint.\\nVerified MD5 file exists: $NewCheckpointMD5."
      return
   fi

   # Curly braces capture output of 'time'.
   if { time "$P4DBIN" -r "$ROOTDIR" -jd -z "${CheckpointsDir}/${FilePrefix}.ckp.${CHECKPOINTNUM}.gz"; } >> "$LOGFILE" 2>&1; then
      CheckpointOK=1
   else
      CheckpointOK=0
   fi

   # Optionally run a site-specific snapshot hook; overall success/failure
   # reporting below accounts for both the dump and the snapshot.
   if [[ -n "${SNAPSHOT_SCRIPT:-}" ]]; then
      DoSnapshot=1
      log "Calling site-specific snapshot script: $SNAPSHOT_SCRIPT"
      if "$SNAPSHOT_SCRIPT" >> "$LOGFILE" 2>&1; then
         SnapshotOK=1
      else
         SnapshotOK=0
      fi
   fi

   if [[ "$DoSnapshot" -eq 0 ]]; then
      if [[ "$CheckpointOK" -eq 1 ]]; then
         log "New checkpoint dump succeeded."
      else
         die "New checkpoint dump FAILED."
      fi
   else
      if [[ "$CheckpointOK" -eq 0 && "$SnapshotOK" -eq 0 ]]; then
         die "Both checkpoint dump and snapshot FAILED."
      elif [[ "$CheckpointOK" -eq 1 && "$SnapshotOK" -eq 0 ]]; then
         die "New checkpoint dump succeeded, but snapshot FAILED."
      elif [[ "$CheckpointOK" -eq 0 && "$SnapshotOK" -eq 1 ]]; then
         die "New checkpoint dump FAILED, but snapshot succeeded."
      else
         log "New checkpoint dump and snapshot succeeded."
      fi
   fi
}

#------------------------------------------------------------------------------
# Compare journal numbers between live and offline databases, to ensure
# they can be safely swapped out.
#------------------------------------------------------------------------------
function compare_journal_numbers () {
   local _OFFLINEJNLNUM
   _OFFLINEJNLNUM=$("$P4DBIN" -r "$OFFLINE_DB" -jd - db.counters 2>&1 | grep '@journal@' | cut -d "@" -f 8 2>> "$LOGFILE") || die "Cannot get $OFFLINE_DB journal number. Abort!"
   check_journalnum "$_OFFLINEJNLNUM"

   # Get the journal number of the root database
   if [[ ! -f "$P4ROOT/db.counters" ]]; then
      die "$P4ROOT database not found. Something is seriously wrong since the server was just running a minute ago! Contact support@perforce.com"
   fi
   local _JNLNUM
   _JNLNUM=$("$P4DBIN" -r "$P4ROOT" -jd - db.counters 2>&1 | grep '@journal@' | cut -d "@" -f 8 2>> "$LOGFILE") || die "Cannot get $P4ROOT journal number. Abort!"
   check_journalnum "$_JNLNUM"

   # The offline_db must be at least as current as the live db to allow a swap.
   if [[ "$_JNLNUM" -gt "$_OFFLINEJNLNUM" ]]; then
      log "$P4ROOT journal number is: $_JNLNUM"
      log "$OFFLINE_DB journal number is: $_OFFLINEJNLNUM"
      die "$OFFLINE_DB journal number is less than $P4ROOT, cannot switch."
   fi
}

#------------------------------------------------------------------------------
# Swap out live db.* database files in P4ROOT with those in offline_db.
#------------------------------------------------------------------------------
function switch_db_files () {
   # Verify server health before making any changes.
   local verifyCmd="$P4CBIN/verify_sdp.sh -skip crontab,excess,license,masterid,version -L off"

   log "Calling 'verify_sdp.sh' before swapping db.* files:\\n$verifyCmd"
   $verifyCmd >> "$LOGFILE" 2>&1 ||\
      die "Error: Cannot confirm all is well with $P4CBIN/verify_sdp.sh. Aborting"

   # Compare the Offline and Master journal numbers before switching to make
   # sure they match.
   compare_journal_numbers

   log "Switching root and offline_db links."
   [[ -d "${P4ROOT}"/save ]] || mkdir -p "${P4ROOT}"/save >> "$LOGFILE" 2>&1

   # Drop marker files in both dirs so nothing trusts P4ROOT mid-swap.
   # shellcheck disable=SC2129
   echo "P4ROOT is not available during switch_db_files() processing." > "$P4ROOT/P4ROOT_not_usable.txt" 2>> "$LOGFILE"
   echo "P4ROOT is not available during switch_db_files() processing." > "$OFFLINE_DB/P4ROOT_not_usable.txt" 2>> "$LOGFILE"

   # Park the live db.* files in save/, and move server-identity/state files
   # into the dir that is about to become the new P4ROOT.
   # shellcheck disable=SC2129
   rm -f "${P4ROOT}"/save/db.* >> "$LOGFILE" 2>&1
   rm -rf "${P4ROOT}"/server.locks >> "$LOGFILE" 2>&1
   mv "${P4ROOT}"/db.* "${P4ROOT}"/save/. >> "$LOGFILE" 2>&1
   if [[ -r "$P4ROOT"/license ]]; then
      mv "${P4ROOT}"/license "${OFFLINE_DB}/." >> "$LOGFILE" 2>&1
   fi
   if [[ -n "$(ls "$P4ROOT"/license* 2>/dev/null)" ]]; then
      mv "${P4ROOT}"/license* "${OFFLINE_DB}/." >> "$LOGFILE" 2>&1
   fi
   if [[ -r "${P4ROOT}"/rdb.lbr ]]; then
      mv "${P4ROOT}"/rdb.lbr "${OFFLINE_DB}/." >> "$LOGFILE" 2>&1
   fi
   if [[ -n "$(ls "$P4ROOT"/state* 2>/dev/null)" ]]; then
      mv "${P4ROOT}"/state* "${OFFLINE_DB}/." >> "$LOGFILE" 2>&1
   fi
   if [[ -r "${P4ROOT}"/server.id ]]; then
      mv "${P4ROOT}"/server.id "${OFFLINE_DB}/." >> "$LOGFILE" 2>&1
   fi
   if [[ -n "$(ls "$P4ROOT"/server.id* 2>/dev/null)" ]]; then
      mv "${P4ROOT}"/server.id* "${OFFLINE_DB}/." >> "$LOGFILE" 2>&1
   fi
   rm -f "${OFFLINE_DB}/offline_db_usable.txt" >> "$LOGFILE" 2>&1

   # Swap the two symlink targets so offline_db becomes the live root.
   LinkOfflineDB="$(readlink "$OFFLINE_DB")"
   LinkP4ROOT="$(readlink "$P4ROOT")"
   unlink "$OFFLINE_DB"
   unlink "$P4ROOT"
   ln -s "$LinkOfflineDB" "$P4ROOT" >> "$LOGFILE" 2>&1 ||\
      die "Link of $LinkOfflineDB to $P4ROOT failed."
   ln -s "$LinkP4ROOT" "$OFFLINE_DB" >> "$LOGFILE" 2>&1 ||\
      die "Link of $LinkP4ROOT to $OFFLINE_DB failed."
   rm -f "$P4ROOT/P4ROOT_not_usable.txt" >> "$LOGFILE" 2>&1
   rm -f "$OFFLINE_DB/P4ROOT_not_usable.txt" >> "$LOGFILE" 2>&1
}

#------------------------------------------------------------------------------
# Function: trim_log_file ($LogToTrim, $MaxLines)
#
# For log files expected to be short, keep them at a max size.
#
# When they get too big, trim them from the top first, so the most recent
# output is retained.
#
# This function treats its processing as non-essential; most errors are
# silently ignored and output discarded. Only error output related to replacing
# the original log is retained in LOGFILE or displayed.
#------------------------------------------------------------------------------
function trim_log_file () {
   local LogToTrim="${1:-}"
   local MaxLines="${2:-5000}"
   local TmpFile=
   local Lines=

   [[ -w "$LogToTrim" ]] || return

   # Abort if MaxLines isn't numeric.
   [[ "$MaxLines" =~ ^[0-9]+$ ]] || return

   TmpFile="${LogToTrim}.trimming.$$.$RANDOM"
   Lines=$(wc -l "$LogToTrim")
   # Strip the filename from 'wc -l' output, keeping only the count.
   Lines=${Lines%% *}

   # Confirm Lines is a number, else just abort.
   [[ "$Lines" =~ ^[0-9]+$ ]] || return

   # If the file isn't big enough to need trimming, abort.
   [[ "$Lines" -gt "$MaxLines" ]] || return

   log "Trimming log $LogToTrim from $Lines to $MaxLines lines."

   # If the trimming fails, discard output and just return.
   if tail -"$MaxLines" "$LogToTrim" > "$TmpFile" 2>/dev/null; then
      if [[ -n "${LOGFILE:-}" ]]; then
         mv -f "$TmpFile" "$LogToTrim" >> "$LOGFILE" 2>&1
      else
         mv -f "$TmpFile" "$LogToTrim"
      fi
   else
      return
   fi
}

#------------------------------------------------------------------------------
# Rotate specified log files, and compress with gzip.
#------------------------------------------------------------------------------
function rotate_log_file () {
   local LogToRotate="${1:-}"
   local GzExt="${2:-}"
   local -i i=1
   local Datestamp=
   local RotatedLog=
   local RotatedZippedLog=

   [[ -n "$LogToRotate" ]] || return

   # The two branches below differ only in whether output is captured in
   # $LOGFILE; LOGFILE may not be defined yet when this is called early.
   if [[ -n "${LOGFILE:-}" ]]; then
      pushd "$LOGS" > /dev/null 2>> "$LOGFILE" || die "Could not cd to: $LOGS"
   else
      pushd "$LOGS" > /dev/null || die "Could not cd to: $LOGS"
   fi

   Datestamp=$(date +'%Y-%m-%d_%H-%M-%S')
   RotatedLog="${LogToRotate}.${Datestamp}"

   if [[ -f "${LogToRotate}" ]]; then
      if [[ -n "${LOGFILE:-}" ]]; then
         mv -f "${LogToRotate}" "${RotatedLog}" >> "$LOGFILE" 2>&1
         if [[ -n "$GzExt" ]]; then
            RotatedZippedLog="${RotatedLog}${GzExt}"
            # If needed, move existing zipped log aside.
            if [[ -e "$RotatedZippedLog" ]]; then
               while [[ -e "${LogToRotate}.${Datestamp}.${i}${GzExt}" ]]; do
                  i+=1
               done
               log "Moving pre-existing $RotatedZippedLog aside to ${LogToRotate}.${Datestamp}.${i}${GzExt}" >> "$LOGFILE" 2>&1
               mv -f "$RotatedZippedLog" "${LogToRotate}.${Datestamp}.${i}${GzExt}" >> "$LOGFILE" 2>&1
            fi
            gzip "$RotatedLog" >> "$LOGFILE" 2>&1
         fi
      else
         mv -f "${LogToRotate}" "${RotatedLog}"
         if [[ -n "$GzExt" ]]; then
            RotatedZippedLog="${RotatedLog}${GzExt}"
            # If needed, move existing zipped log aside.
            if [[ -e "$RotatedZippedLog" ]]; then
               while [[ -e "${LogToRotate}.${Datestamp}.${i}${GzExt}" ]]; do
                  i+=1
               done
               log "Moving pre-existing $RotatedZippedLog aside to ${LogToRotate}.${Datestamp}.${i}${GzExt}"
               mv -f "$RotatedZippedLog" "${LogToRotate}.${Datestamp}.${i}${GzExt}"
            fi
            gzip "$RotatedLog"
         fi
      fi
   fi

   if [[ -n "${LOGFILE:-}" ]]; then
      popd > /dev/null 2>> "$LOGFILE" || die "Could not cd to: $OLDPWD"
   else
      popd > /dev/null || die "Could not cd to: $OLDPWD"
   fi
}

#------------------------------------------------------------------------------
# At the start of each run for live_checkpoint.sh, daily_checkpoint.sh, and
# recreate_db_checkpoint.sh, before *any* logging activity occurs, rotate the
# logs from the most recent prior run, always named "checkpoint.log" or "log".
#------------------------------------------------------------------------------
function rotate_last_run_logs () {
   # Rotate prior log file for the current script.
   rotate_log_file "$LOGFILE"

   # Rotate prior server log.
   rotate_log_file "log" ".gz"

   # Rotate prior broker log.
   rotate_log_file "p4broker.log" ".gz"

   # Rotate prior audit log.
   rotate_log_file "audit.log" ".gz"
}

#------------------------------------------------------------------------------
# Remove log files matching a specified name prefix, preserving a specified
# number of the recent logs.
#------------------------------------------------------------------------------
function remove_log_files () {
   REMOVE_LOGNAME=$1
   KEEPNUM=$2

   # List newest-first; everything past the first $KEEPNUM entries is removed.
   # shellcheck disable=SC2012
   for I_LOGFILE in $(ls -t "${REMOVE_LOGNAME:?}"* 2>/dev/null | $AWK "NR > $KEEPNUM"); do
      log "rm -f $I_LOGFILE"
      rm -f "$I_LOGFILE"
   done
}

#------------------------------------------------------------------------------
# Remove old logs.
#------------------------------------------------------------------------------
function remove_old_logs () {
   # Remove old Checkpoint Logs
   # Use KEEPJNLS rather than KEEPLOGS, so we keep the same number
   # of checkpoint logs as we keep checkpoints.
   pushd "$LOGS" > /dev/null 2>> "$LOGFILE" || die "Could not cd to: $LOGS"

   if [[ "$KEEPJNLS" -eq 0 ]]; then
      log "Skipping cleanup of old checkpoint logs because KEEPJNLS is set to 0."
   else
      log "Deleting old checkpoint logs. Keeping latest $KEEPJNLS, per KEEPJNLS setting in p4_vars."
      remove_log_files "checkpoint.log" "$KEEPJNLS"
   fi

   if [[ "$KEEPLOGS" -eq 0 ]]; then
      log "Skipping cleanup of old server logs because KEEPLOGS is set to 0."
   else
      log "Deleting old server logs. Keeping latest $KEEPLOGS, per KEEPLOGS setting in p4_vars."
      remove_log_files "log" "$KEEPLOGS"
      remove_log_files "p4broker.log" "$KEEPLOGS"
      remove_log_files "broker_rotate.log" "$KEEPLOGS"
      remove_log_files "audit.log" "$KEEPLOGS"
      remove_log_files "sync_replica.log" "$KEEPLOGS"
      remove_log_files "replica_status.log" "$KEEPLOGS"
      remove_log_files "replica_cleanup.log" "$KEEPLOGS"
      remove_log_files "request_checkpoint.log" "$KEEPLOGS"
      remove_log_files "recreate_offline_db.log" "$KEEPLOGS"
      remove_log_files "edge_shelf_replicate.log" "$KEEPLOGS"
      remove_log_files "upgrade.log" "$KEEPLOGS"
      remove_log_files "p4login" "$KEEPLOGS"
      remove_log_files "p4verify.log" "$KEEPLOGS"
      remove_log_files "journal_watch.log" "$KEEPLOGS"
      remove_log_files "refresh_P4ROOT_from_offline_db.log" "$KEEPLOGS"
      remove_log_files "purge_revisions.log" "$KEEPLOGS"
   fi
   popd > /dev/null 2>>"$LOGFILE" || die "Could not cd to: $OLDPWD"
}

#------------------------------------------------------------------------------
# Set the SDP Checkpoint counter to indicate last successful SDP checkpoint
# operation. For standby servers, set the SDP Checkpoint counter on the master.
#------------------------------------------------------------------------------
function set_counter() {
   "$P4CBIN/p4login"
   # Edge and standby servers write the counter on the master's port; all
   # others write it locally.
   if [[ "$EDGESERVER" -eq 1 || "$STANDBYSERVER" -eq 1 ]]; then
      "$P4BIN" -u "$P4USER" -p "$P4MASTERPORT" counter "LastSDPCheckpoint.$SERVERID" "$(date +'%s (%Y/%m/%d %H:%M:%S %z %Z)')" > /dev/null
   else
      "$P4BIN" -u "$P4USER" -p "$P4PORT" counter "LastSDPCheckpoint.$SERVERID" "$(date +'%s (%Y/%m/%d %H:%M:%S %z %Z)')" > /dev/null
   fi
}

#------------------------------------------------------------------------------
# This is the function that is called to run the individual checkpoint
# dump or restores during a parallel run.
#------------------------------------------------------------------------------
function parallel_checkpoint_cmd () {
   # NOTE(review): writes to $cmd_log and sets CkpFailed as globals supplied by
   # the calling function; not safe to call outside the parallel ckp flow.
   echo "=== Running $* on $(date)." >> "$cmd_log"
   "$@" >> "$cmd_log" 2>&1
   status=$?
   if [[ "$status" -ne 0 ]]; then
      CkpFailed=1
   fi
   echo "=== $* completed on $(date)." >> "$cmd_log"
}

#------------------------------------------------------------------------------
# This function checks for running processes as part of the parallel dump and restore
#------------------------------------------------------------------------------
function check_running () {
   sleep 30
   #loop thread process id's and see if any have finished.
   spot=0
   run=()
   for p in "${ids[@]}"; do
      if [[ -n "$p" ]]; then
         # shellcheck disable=SC2009
         running=$(ps cax | grep "$p")
      fi
      # NOTE(review): 'running' is only reassigned when $p is non-empty, so an
      # empty slot reuses the previous iteration's value — confirm intended.
      if [[ -n "$running" ]]; then
         run[$spot]=$p
         spot=$((spot+1))
      else
         thread=$((thread-1))
      fi
   done
   if [[ "$spot" -ne 0 ]]; then
      ids=("${run[@]}")
   else
      ids=()
   fi
}

#------------------------------------------------------------------------------
# Dump db files in parallel from offline_db
#------------------------------------------------------------------------------
function dump_parallel_ckp () {
   db_files=() # Clear array
   thread=0 # Set current threads to 0
   cd "$OFFLINE_DB" || die "Could not cd to: $OFFLINE_DB"
   [[ -d "${CKPTMPDIR}" ]] || mkdir "${CKPTMPDIR}"
   rm -f "${CKPTMPDIR:?}"/*

   # Build array of db_files in offline_db
   for f in db.*; do
      db_files+=( "$f" ) # Append db file to the array
   done

   # loop db_files running the number of them in parallel that is specified by the command line
   for f in "${db_files[@]}"; do
      # Loop to see if we are over our thread count. If so wait until we drop below it again
      # shellcheck disable=SC2154
      while [[ $thread -ge "$Threads" ]]; do
         check_running
      done

      CkpCmd="${P4DBIN} -r ${OFFLINE_DB} -jd ${CKPTMPDIR}/$P4SERVER.ckp.${f} $f"
      echo "$CkpCmd" > greppattern.txt
      cmd_log="${LOG}-${f}.log"
      parallel_checkpoint_cmd "${P4DBIN}" -r "${OFFLINE_DB}" -jd "${CKPTMPDIR}/$P4SERVER.ckp.${f}" "$f" &
      sleep 1
      # NOTE(review): with 'ps -ef' the first awk column is the UID, not the
      # PID — verify this actually captures the intended process ID.
      # shellcheck disable=SC2009
      pid=$(ps -ef | grep -F -f greppattern.txt | awk '{print $1;}')
      if [[ -n "$pid" ]]; then
         ids[$thread]=$pid # add the process ID into the array of running processes
         thread=$((thread+1)) # add one to the thread count and start a new verify
      fi
   done
   rm greppattern.txt

   # now that we have started all of them wait until all of our processes have finished before continuing.
   while [[ $thread -gt 0 ]]; do
      check_running
   done

   cd "$CKPTMPDIR" || die "Could not cd to: $CKPTMPDIR"
   rm -f ./*.md5

   # now that the processes have finished combine all of the log file together
   for f in "${db_files[@]}"; do
      if [[ -f "${LOG}-${f}.log" ]]; then
         cat "${LOG}-${f}.log" >> "$LOGFILE"
         rm -f "${LOG}-${f}.log"
      fi
   done

   if [[ "$CkpFailed" -ne 0 ]]; then
      # shellcheck disable=SC2034
      StatusMessage="Error: Checkpoint failed. Review the log [$LOGFILE]."
      ExitCode=1
   fi
   if [[ "$ExitCode" -ne 0 ]]; then
      die "New checkpoint dump failed!"
   fi
   msg "Completed parallel checkpoint at $(date)."
}

#------------------------------------------------------------------------------
# Restore from db files that have been extracted from a parallel checkpoint tgz file.
#------------------------------------------------------------------------------
function restore_parallel_ckp () {
   db_files=() # Clear array
   thread=0 # Set current threads to 0
   [[ -d "${CKPTMPDIR}" ]] || die "$CKPTMPDIR doesn't exist! Restore failed."
   cd "$CKPTMPDIR" || die "Could not cd to: $CKPTMPDIR"
   rm -f "${OFFLINE_DB}"/offline_db_usable.txt >> "$LOGFILE" 2>&1
   rm -f "${OFFLINE_DB}"/db.* >> "$LOGFILE" 2>&1

   # Build array of db_files in checkpoint temp dir
   for f in *; do
      db_files+=( "$f" ) # Append db file to the array
   done

   # loop db_files running the number of them in parallel that is specified by the command line
   for f in "${db_files[@]}"; do
      # Loop to see if we are over our thread count. If so wait until we drop below it again
      while [[ $thread -ge "$Threads" ]]; do
         check_running
      done
      CkpCmd="${P4DBIN} -r ${OFFLINE_DB} -jr ${CKPTMPDIR}/${f}"
      echo "$CkpCmd" > greppattern.txt
      cmd_log="${LOG}-${f}.log"
      parallel_checkpoint_cmd "${P4DBIN}" -r "${OFFLINE_DB}" -jr "${CKPTMPDIR}/${f}" &
      sleep 1
      # NOTE(review): same UID-vs-PID concern as in dump_parallel_ckp.
      # shellcheck disable=SC2009
      pid=$(ps -ef | grep -F -f greppattern.txt | awk '{print $1;}')
      if [[ -n "$pid" ]]; then
         ids[$thread]=$pid # add the process ID into the array of running processes
         thread=$((thread+1)) # add one to the thread count and start a new verify
      fi
   done
   rm greppattern.txt

   # now that we have started all of them wait until all of our processes have finished before continuing.
   while [[ $thread -gt 0 ]]; do
      check_running
   done

   # now that the processes have finished combine all of the log file together
   for f in "${db_files[@]}"; do
      if [[ -f "${LOG}-${f}.log" ]]; then
         cat "${LOG}-${f}.log" >> "$LOGFILE"
         rm -f "${LOG}-${f}.log"
      fi
   done

   if [[ "$CkpFailed" -ne 0 ]]; then
      # shellcheck disable=SC2034
      StatusMessage="Error: Checkpoint Restore failed. Review the log [$LOGFILE]."
      ExitCode=1
   fi
   if [[ "$ExitCode" -ne 0 ]]; then
      die "Restore of checkpoint dump failed!"
   fi
   echo "Offline db file restored successfully." > "${OFFLINE_DB}/offline_db_usable.txt"
   msg "Completed parallel checkpoint restore at $(date)."
}

#------------------------------------------------------------------------------
# Create a tgz of the temporary checkpoint folder that contains individually dumped database checkpoints.
#------------------------------------------------------------------------------
function create_tar_ckp () {
   cd "$CHECKPOINTS" || die "Could not cd to: $CHECKPOINTS"
   Ckptgz=${P4SERVER}.ckp.parallel.${CHECKPOINTNUM}.tgz
   [[ -f "$Ckptgz" ]] && die "$Ckptgz file already exists. Check the backup process!"
   { time tar cvzf "$Ckptgz" "${CKPTMPDIR}"; } >> "$LOGFILE" 2>&1 || { die "Failed to create tgz checkpoint file!"; }
   rm -rf "${CKPTMPDIR:?}"/*
}

#------------------------------------------------------------------------------
# Extract a tgz of the temporary checkpoint folder that contains individually dumped database checkpoints.
#------------------------------------------------------------------------------
function extract_tar_ckp () {
   cd "$CHECKPOINTS" || die "Could not cd to: $CHECKPOINTS"
   Ckptgz=$1
   [[ -f $Ckptgz ]] || die "$Ckptgz doesn't exist!"
   { time tar xvzf "$Ckptgz"; } >> "$LOGFILE" 2>&1 || { die "Failed to extract $Ckptgz checkpoint file!"; }
}
# | Change | User | Description | Committed | |
---|---|---|---|---|---|
#155 | 30880 | C. Thomas Tyler |
Fixed bad grammar in comments. Non-functional change. |
||
#154 | 30860 | C. Thomas Tyler |
Tweaked number of logs to keep. Fixed typo in output message. |
||
#153 | 30857 | C. Thomas Tyler | Clarified text in comments. | ||
#152 | 30856 | C. Thomas Tyler |
Adapted to cleanup obsolete parallel checkpoint directories that had only *.OK files and no *.md5 files (created by a small range of p4d versions that did not create top-level *.md5 files for parallel checkpoint directories). |
||
#151 | 30855 | C. Thomas Tyler |
Completed safety feature to ensure checkpoints and journals can never be deleted by remove_old_checkpoints_and_journals() even if it were called at the wrong time. This is for future-proofing (and making custom SDP-derived scripts safer). |
||
#150 | 30854 | C. Thomas Tyler | Refined logic to find highest journal counter among *.md5 files. | ||
#149 | 30852 | C. Thomas Tyler |
Adapted logic to find latest checkpoint with an MD5 file to search by highest journal counter number rather than timestamp on the *.md5 file. In most situations, this will give the same result. However, in cases where checkpoints and MD5 files are copied across machines, it is possible that timestamps can be incorrect. Journal counters are less likely to be incorrect, and would only be incorrect in situations involving manual interactions. |
||
#148 | 30848 | C. Thomas Tyler |
Semantically separated DoParallelCheckpoints into two variables, CreateParallelCheckpoint and LoadParallelCheckpoint, which are logically independent. |
||
#147 | 30654 | C. Thomas Tyler |
Revised logic replaying checkpoints to select the most recent checkpoint regardless of whether it is parallel or not. This logic now applies consistently across scripts (even some not in this changelist that call functions in backup_functions.sh), such as sync_replica.sh, load_checkpoint.sh, recover_edge.sh, etc. The edge_dump.sh script now creates parallel checkpoints if parallel checkpoints are configured. The load_checkpoint.sh now reliably detects the most recent checkpoint when '-latest' is used, serial or parallel. This script now also finds checkpoints for a standby of an edge. Also fixed bug cleaning up old *.OK files from earlier parallel checkpoints. The recover_edge.sh script similarly detects the latest checkpoint correctly, serial or parallel. This change was tested with a new regression test suite that operates in a Battle School Lab environment, allowing for more sophisticated testing of sequences of operations. #review-30655 |
||
#146 | 30372 | Will Kreitzmann | as per Tom request remove echos | ||
#145 | 30370 | Will Kreitzmann | (SDP-1065) SNS fix's though I'd like to mow the lawn a bit more on this | ||
#144 | 30305 | C. Thomas Tyler |
Refined JDTmpDir implementation to handle case of empty data sets consistently. #review-30306 @robert_cowham |
||
#143 | 30269 | Robert Cowham | Fix typo in variable usage | ||
#142 | 30267 | Robert Cowham |
Copy files to be dumped via p4d -jd to tmp dir first to avoid locks on P4ROOT (or offline_db) SDP-1087 |
||
#141 | 30210 | C. Thomas Tyler |
Adjusted set_counter() so the checkpoint counter is set consistently on any p4d server (commit, edge, standby, filtered forwarding replica, etc.). Also enhanced auditability of counter setting. #review-30211 |
||
#140 | 30175 | C. Thomas Tyler |
request_replica_checkpoint.sh can request parallel checkpoints. Added safety check to avoid running 'p4 admin checkpoint' command against the commit. #review-30176 |
||
#139 | 30072 | C. Thomas Tyler |
Added '-v track=-1' to all 'p4d -jd' commands to prevent cron.d from sending bogus emails containing only tracking data if 'track' (performance tracking) was enabled. #review-30073 @robert_cowham |
||
#138 | 30071 | lbarbier |
Fix FROM email function While previous implementation with -Sfrom= do make emails appear to be sent from the $MAILFROM, it's still sent as <user>@<fqdn> of the p4 server. Exchange will see this in the headers and consider it's sent from an untrusted place/domain. |
||
#137 | 29992 | C. Thomas Tyler |
p4verify.sh: Added support for handling the 'trait' depot (new in 2023.2). Refactored get_old_log_timestamp() function, moving it from backup_functions.sh into new log_functions.sh file. #review-29993 |
||
#136 | 29911 | Robert Cowham |
Changed format of data in log() function in backup_functions.sh to something easier to parse mechanically. We lose the time zone in the log with this format, but the inconsistent format of the timezone was part of the problem with parsing. Note this is parsed by "date -d" so original change to fix didn't work! #review-29800 @tom_tyler |
||
#135 | 29898 | Will Kreitzmann |
addition log files. p4p.log rotate_proxy.log p4pcm.log |
||
#134 | 29871 | C. Thomas Tyler |
Added support for s-nail email utility. Thanks for the contribution, Laurent! :) |
||
#133 | 29863 | C. Thomas Tyler | Fixed quoting issue affecting simulated email mode. | ||
#132 | 29799 | C. Thomas Tyler |
Changed format of data in log() function in backup_functions.sh to something easier to parse mechanically. We lose the time zone in the log with this format, but the inconsistent format of the timezone was part of the problem with parsing. #review-29800 @robert_cowham |
||
#131 | 29795 | C. Thomas Tyler |
Fixed bug where proxy_rotate.sh and broker_rotate.sh called check_dirs() looking for p4d directories, reporting errors. Fixed by adding optional ServerType parameter: 1 (default) - check for dirs for a p4d server 2 - check for dirs for a standalone proxy or broker Also addressed style issues to achieve compliance with ShellCheck v0.9.0. #review @robert_cowham @karl_wirth |
||
#130 | 29595 | C. Thomas Tyler |
Fixed so, if 'mail' program is not in PATH, a nice 'Simulated email' echo is displayed rather than an ugly error message. |
||
#129 | 29592 | C. Thomas Tyler |
Added support for P4D r23.1+ multi-file per-table parallel checkpoints. If DO_PARALLEL_CHECKPOINTS=4 (for example) is set in the instance vars file, live checkpoints are created with 'p4d -jcpm' and daily checkpoints are created with 'p4d -jdpm'. Replays are done with 'p4d -jrp' (same as in P4D r22.2; there is no '-jrpm'). With this change, live_checkpoint.sh and daily_checkpoint.sh take advantage of parallel checkpoint features. To Do: * Add doc coverage in SDP Guide for the full lifecycle of parallel checkpoints, including updates to recovery procedures. * Other checkpoint-handling scripts: - load_checkpoint.sh - mkrep.sh - edge_dump.sh - recover_edge.sh - sync_replica.sh #review-29593 @brent_schiestl |
||
#128 | 29590 | C. Thomas Tyler |
Another refinement of sync_replica.sh logic to handle all intended use cases: * HA of commit server, with and without NFS sharing (SHAREDDATA). * HA of edge server, with and without NFS sharing (SHAREDDATA). When NFS sharing, the rsync from the target server is disabled, as before. When rsycing for full replicas (not NFS-sharing, metadata-only replicas), the sync_replica.sh script now always rsyncs from a checkpoints directory based on the journalPrefix of the P4TARGET server. This is correct for all scenarios. If that cannot be determined, the script now does a die() call to avoid rsyncing to a possibly incorrect path. When rebuilding the local offline_db, the checkpoints directory based on the journalPrefix of the P4TARGET server is always used. This directory should exist, whether due to rsync from the target, or NFS-sharing. Logic to remove old checkpoints and journals now only cleans in folders written to by the local replica, to avoid removing files on an NFS-shared upstream server. Auditability of checkpoint operations in backup_functions.sh is improved. #review-29591 |
||
#127 | 29578 | C. Thomas Tyler |
Fixed issue with log rotation login on some platforms. #review-29579 @Domenic |
||
#126 | 29576 | C. Thomas Tyler |
Enhanced sync_replica.sh to support operation on a 'ham' type replica (HA, Metadata-only). A 'ham' type replica replicates only metadata, and shares the /hxdepots volume (via NFS) with its target server. In this configuration, the SHAREDDATA=TRUE value is set, and this corresponds to a p4d configuration setting for the replica of lbr.replication=shared. In this configuration, the journalPrefix value of the replica server will differ from that of its target server. For example, the commit server will may have the First Form jour the journalPrefix, while an HA of the commit will have the Second Form. See 'The journalPrefix Standard': https://swarm.workshop.perforce.com/projects/perforce-software-sdp/view/main/doc/SDP_Guide.Unix.html#_the_journalprefix_standard As another example, for an edge server and HA of that edge, both servers will use the Second Form of the journalPrefix, the form which incorporates a shortened form of the ServerID into the journalPrefix value. But since the ServerIDs are different, the actual journalPrefix values will be different, even though both are of the Second Form. The common pattern is that, when configured for NFS sharing, the sync_replica.sh script should use the journalPrefix of its target server when determining where to look for a checkpoint and numbered journal to load into the offline_db. #review @mark_zinthefer @robert_cowham |
||
#125 | 29567 | Andy Boutte | Adding option to delivery log and alerts via PagerDuty | ||
#124 | 29519 | C. Thomas Tyler |
Fixed issue with parallel checkpoints not working if the 'bc' utility was not available. |
||
#123 | 29434 | C. Thomas Tyler |
Tweaked code deriving Threads values from user-defined DO_PARALLEL_CHECKPOINT. Added a stand-alone test script for tweaking that snippet of code. |
||
#122 | 29420 | C. Thomas Tyler |
For parallel checkpoints, provide a way to specify '-N' parallel threads. #review-29421 |
||
#121 | 29363 | kathy_rayburn | Add rotation of p4triggers.log to rotate_last_run_logs | ||
#120 | 29316 | C. Thomas Tyler |
Enhanced robustness of start_p4d() and stop_p4d() interaction with systemd, and also in similar logic in load_checkpoint.sh. #review-29317 |
||
#119 | 29247 | C. Thomas Tyler |
Added a single file to indicate that a parallel checkpoint is complete. Added a check for that file before replaying from a parallel checkpoint directory. #review-29248 |
||
#118 | 29232 | C. Thomas Tyler |
Added regression test for new parallel checkpoint feature. #review-29220 |
||
#117 | 29136 | C. Thomas Tyler |
Fixed log name in recreate_offline_sb.sh to avoid colons in name, and changed to typical SDP behavior where most recent log name has no timestamp. |
||
#116 | 29031 | C. Thomas Tyler |
Added 'monitor_metrics.log' to list of rotated logs. Also made minor non-functional Shellcheck compliance tweaks. #review-29032 |
||
#115 | 28771 | C. Thomas Tyler |
Changed email address for Perforce Support. #review-28772 @amo @robert_cowham |
||
#114 | 28624 | C. Thomas Tyler |
Removed unused functions related to parallel checkpoint processing. Parallel checkpoint processing is expected to be added in p4d itself in future releases. This is a non-functional change. #review-28625 |
||
#113 | 28621 | C. Thomas Tyler |
Added support to sync_replica.sh for operation on a replica of an edge server. Fixed related error with erroneous output in backup_functions.sh. #review-28622 |
||
#112 | 28430 | Andy Boutte | forgot to make SNS use short_subject to overcome the SNS subject character limit | ||
#111 | 28421 | C. Thomas Tyler |
verify_sdp.sh v5.20.0: * New checks: /p4/N/bin/p4{d,p,broker}_N need correct symlink target, and must exist if the corresponding _init script exists. For p4d, it can be a symlink (for a case-sensitive instance) or script (for a case-insensitive instance to pass the C1 flag). Either way the target is checked. These checks cannot be skipped or converted to warnings. * Added check that /p4/N/bin/p4{d,p,broker}_N_init scripts have content that matches templates. This can be skipped with '-skip' or reported as mere warnings (with '-warn') with a new and documented 'init' category of test skipping/warning. #review-28422 |
||
#110 | 28277 | Andy Boutte |
Adding support for delivering alert notifications via AWS SNS #review https://jira.perforce.com:8443/browse/CLD-14 |
||
#109 | 28210 | ashaikh |
Fix checkpoint error on forwarding replicas When running the daily_checkpoint.sh script on forwarding replicas, you get the following error: Offline journal replay failed. Abort! This is because the daily_checkpoint.sh is trying to replay a journal that doesn't exist (hasn't been rotated yet). I've created SDP-563 job for this issue. https://swarm.workshop.perforce.com/jobs/SDP-563 These changes have been tested on several forwarding replicas. |
||
#108 | 28191 | C. Thomas Tyler |
Enhanced to remove old $LOGS/journal.NNN files on standby replicas. This is an optional behaviour which must be explicitly enabled by adding a setting to the bottom of p4_N.vars: export SDP_REMOVE_STANDBY_JOURNALS=1 This is useful if a standby replica shares /hxdepots with its P4TARGET server over NFS, and has the same journalPrefix value as the P4TARGET server. In this scenario, the standby will not rotate its journal as the file already exists, having been rotated on the P4TARGET server sharing the same NFS volume. When the journal rotation does not occur, the journal.NNN file is not removed. The feature implemented in this change is a workaround. A better solution could be a p4d enhancement for this specific scenario: job107633 - Add new value of 2 to rpl.journalcopy.location for standby replicas. |
||
#107 | 28188 | C. Thomas Tyler |
Minor internal cosmetic and coding style tweaks in backup_functions.sh. No functional changes. |
||
#106 | 28064 | Robert Cowham | Add some missing log files to remove_old_logs() function | ||
#105 | 27750 | C. Thomas Tyler |
upgrade.sh v4.6.9: * Fixed issue where where patch-only upgrades of instances after the first in a multi-instance environment are skipped. * Corrected error message for scenario where downgrades are attempted; the logic was correct but error message was confusing. verify_sdp.sh v5.17.3: * Extended '-skip version' meaning to also skip new live binary version comparison checks. Related updates: * A call to verify_sdp.sh in the switch_db_files() function in backup_functions.sh now skips the version check. * A call to daily_checkpoint.sh now skips the version check. #review-27743 |
||
#104 | 27722 | C. Thomas Tyler |
Refinements to @27712: * Resolved one out-of-date file (verify_sdp.sh). * Added missing adoc file for which HTML file had a change (WorkflowEnforcementTriggers.adoc). * Updated revdate/revnumber in *.adoc files. * Additional content updates in Server/Unix/p4/common/etc/cron.d/ReadMe.md. * Bumped version numbers on scripts with Version= def'n. * Generated HTML, PDF, and doc/gen files: - Most HTML and all PDF are generated using Makefiles that call an AsciiDoc utility. - HTML for Perl scripts is generated with pod2html. - doc/gen/*.man.txt files are generated with .../tools/gen_script_man_pages.sh. #review-27712 |
||
#103 | 27651 | Russell C. Jackson (Rusty) | Added forwarding-standby number to is_standby check function. | ||
#102 | 27455 | C. Thomas Tyler |
Normalized usage of calls to verify_sdp.sh from other scripts. In some contexts, we may desire "squeaky clean," while in other contexts we may only need assurance that core functions are operating OK. Fixed typo in variable name used by verify_sdp.sh in call from upgrade.sh that prevented local VERIFY_SDP_SKIP_TEST_LIST def'n from having any effect. |
||
#101 | 27207 | C. Thomas Tyler |
upgrade.sh v4.3.0: * Greatly enhanced documentation and examples. * Before doing second journal rotation: - Added wait for 'p4 storage -w' (for 'to or thru' P4D 2019.1). - Added wait for 'p4 upgrades|grep -v completed' (for P4D 2020.2+) * Added check for whether p4broker and p4p were online at the start of upgrade processing. Only start those services that were running at the beginning of processing are now started after the binaries and symlinks are updated. For the broker, only the broker with the default configuration is stopped and started; DFM brokers are ignored by this script (thus making this script compatible with using DFM brokers). * Fixed bug where '-c' (Protections table comment conversion) would have failed due to 'p4d' addition of 'Update:' field to the Protections table. Also generally enhanced logic to convert Protections table comments. * Added support for operation on proxy-only and broker-only hosts. Processing of upgrades for p4d occur only if /p4/N/bin/p4d_N_init script exists on the machine. * Refined lexicographical P4D version comparison checks. verify_sdp.sh v5.12.0: * Added support for proxy-only and broker-only hosts. The existence of a *_init script in the instance bin dir for any of the Helix server binaries p4d/p4p/p4broker indicate they are configured, determining what tests are executed or skipped. * Added check_file_x() function to check for execute bit on files. In backup_functions.sh, fixed is_server_up() to avoid displaying output. #review-27208 |
||
#100 | 27199 | C. Thomas Tyler |
Made is_server_running() function in backup_functions.sh aware of the nuances of checking whether p4broker, p4d, and p4p are up. For the broker in particular, only the default broker configuration is checked. This will avoid process compatibility issues with customers using DFM (Down for Maintenance) brokers as part of the upgrade procedure, either manually or with the Helix Management System (HMS). This was done by deferring to the *_base scripts for each service type, as these scripts now return reliable exit codes for status. Made p4d_base not write anything to p4d_init.log if only a 'status' check is done. Fixed bug in 'p4d base' status check where it would return a wrong exit code if p4d was up but SSL trust was not established. Removed redundant logic in start_p4[d,p,broker] functions for service status checking. Enhanced p4d_base to clean up temp dir when done. #review-27200 |
||
#99 | 27197 | C. Thomas Tyler |
Fixed an issue where p4d start would fail to recognize that p4d was up if 'p4 trust' needed to be established. Fix to unreleased behavior. No job filed. #review-27198 |
||
#98 | 27192 | C. Thomas Tyler |
Fixed hang issue during journal rotation if journalPrefix is wrong. The verify_sdp.sh check is now incorporated into daily_checkpoint.sh. When verify_sdp.sh is called by other scripts, '-skip excess,crontab' options are supplied to avoid issues during maintenance windows with possibly overzealous/nitpicky verifications. #review-27193 |
||
#97 | 27188 | C. Thomas Tyler |
Changed start_p4[p,broker]() functions to check for pids rather than use a 'p4 info' check, to avoid dependency on 'p4d' being up. Style normalization: The formal bash 'function' declaration is now used for all function definitions. Removed obsolete logic to follow up a systemd stop for p4d with a SysV init stop call. That had been done to work around systemd status reliability issues, but is no longer needed due to robustness improvements in systemd handling in this release, and also changes to disallow SysV init scripts from being used when systemd is in play. Fix to unreleased behavior; no job filed. |
||
#96 | 27153 | C. Thomas Tyler |
Fixed bug where 'offline_db_usable.txt' file was not maintained correctly by rotate_journal.sh. Removed redundant call to check_offline_db_usable() in compare_journal_numbers(). Fixed some missing log capture redirects. #review-27154 |
||
#95 | 27110 | C. Thomas Tyler |
Stop/start robustness enhancements for start/stop_p4*() functions. Added wait loop to start_p4d(), start_p4broker(), and start_p4p() functions and corresponding stop_p4{d/p/broker}() functions. Delays are 1-second intervals up to a configurable maximum number of seconds. Defaults for start are 120 for p4d, 60 each for p4broker and p4p. Defaults for stop are 600 each for p4broker and p4p, but for 'p4d' the wait could cause a hang. This is deemed preferable over other options. Delays are only incurred as needed. Services that do not start/stop within the allotted time are deemed to have failed to start/stop. Note that while this change is a general robustness enhancement, it is especially important with systemd due to its 'fire and forget' nature when doing a 'sudo systemctl start'. The systemctl command returns immediately and happily even if the p4d startup fails. All 3 start_p4*() and all 3 stop_p4*() functions now have the same "do or die" behavior; they call a 'die' if the service did not stop as requested. When they return, the requested start/stop can be assumed to have completed successfully. Added documentation for new settings in instance_vars.template. Also added doc and example of SDP_AUTOMATION_USERS. #review-27111 |
||
#94 | 27070 | C. Thomas Tyler |
Added backup_functions.sh back into p4d_base. Fixed failing regression tests. |
||
#93 | 27064 | C. Thomas Tyler |
Fixed issue where 'source p4_vars' hangs if load_checkpoint.sh is running. Added new semaphore file, $P4ROOT/P4ROOT_not_usable.txt. This is used in a way similar to 'offline_db_usable.txt' in the offline_db, except that this file only exists when the databases in P4ROOT are not usable. This is the opposite of how offline_db_usable.txt works, because P4ROOT is expected to be usable 99.9% of the time. p4d_base will refuse to start p4d if this file exists, protecting against possible operator errors (like trying to start p4d when a checkpoint is still loading). Added check_file_dne() function to verify_sdp.sh to confirm a named file does not exist. Added checks in verify_sdp.sh that P4ROOT_not_usable.txt does not exist in P4ROOT or offline_db. Modified switch_db_files() (called by refresh_P4ROOT_from_offline_db.sh) to properly use the new P4ROOT_not_usable.txt safety file. Fixed bugs in p4d_base that could cause p4d_init.log to be overwritten if error output was generated. Removed call to 'backup_functions.sh' in p4d_base, as on balance it added more complexity than needed. #review-27065 |
||
#92 | 27043 | C. Thomas Tyler |
Made 'No checkpoints found' check simpler and more reliable. Fixed bug impacting live_checkpoint.sh operation on platforms where 'find' utility does not support '-printf' option (including Mac OSX High Sierra). #review-27044 |
||
#91 | 26995 | C. Thomas Tyler |
Further refinements to log rotation to handle an edge case where pre-existing gzip log files cause gzip to go interactive. |
||
#90 | 26994 | C. Thomas Tyler |
Fixed issue (encountered in repetitive test suite operations) where a log file called too quickly causes gzip to go interactive. Also refined pushd/popd logic interaction w/LOGFILE defined/undef. |
||
#89 | 26988 | C. Thomas Tyler |
Tweak to pushd/popd to discard stdout. Any error output is retained. |
||
#88 | 26928 | Robert Cowham |
Fix problem with line breaks when testing for journal corruption. Also softened the error message to avoid unnecessary alarm for users! Observed "cd -" not working on AWS Linux box. Changed to pushd/popd. |
||
#87 | 26619 | C. Thomas Tyler |
Silenced 'shellcheck' complaint about quoting right-hand-side of an '=~' test, as testing confirms the code works as intended with the quotes in our usage, and indeed fails with a syntax error if the quotes are removed. No functional change; only added a shellcheck comment. |
||
#86 | 26494 | C. Thomas Tyler |
Updated remove_old_logs() to cleanup refresh_P4ROOT_from_offline_db.sh. remove_old() |
||
#85 | 26478 | C. Thomas Tyler | Enhanced to work with replicas that don't set journalPrefix. | ||
#84 | 26472 | C. Thomas Tyler |
Patch to use dynamic journalPrefix detection. #review-26473 |
||
#83 | 26468 | C. Thomas Tyler |
Adjusted refresh_P4ROOT_from_offline_db.sh to work on master/edge/replicas. Added logic in functions like p4d_truncate_journal() to make them safe to call on any host, master, replica, or edge. When called on any host other than the master, appropriate behavior is expected. Approving to deliver patch. |
||
#82 | 26456 | C. Thomas Tyler |
Patch to fix issue with refresh_P4ROOT_from_offline_db.sh behavior on replicas. Adjusted behavior for other scripts to ensure proper behavior when run on replicas vs. edge servers vs. the master server. Approving patch for testing. |
||
#81 | 26407 | C. Thomas Tyler | Working around service stop reliability issue with systemd. | ||
#80 | 26406 | C. Thomas Tyler | Fixed issue where tail of server log is not displayed if server fails on startup. | ||
#79 | 26400 | C. Thomas Tyler |
Added refresh_P4ROOT_from_offline_db.sh. Updated backup_functions.sh to support functionality for db refresh. Upgrade start_p4d() and stop_p4d() to use systemd if available, else use the underlying SysV init scripts. Updated verify_sdp.sh to be called from other scripts (sans its own logging). Added many checks to verify_sdp.sh to support P4ROOT/offline_db swap. Logic in P4ROOT/offline_db swap is more careful about what gets swapped. Added start_p4broker() and stop_p4broker() that behave similarly. More shellcheck compliance. #review-26401 |
||
#78 | 26394 | C. Thomas Tyler |
Provide snapshot hook for daily_checkpoint.sh. Perhaps the most common SDP customization is a tweak to integrate the SDP checkpoint mechanism with site-local capability of the underlying hardware, so that the snapshot is created at the ideal point in time, right after the metadata checkpoint is created. The intent of this change is to reduce the need for such customization by making a well defined hook in the daily checkpoint process for calling a site-specific custom checkpoint script. Key elements: * Made an illustrative sample change to instance_vars.template, to show setting a SNAPSHOT_SCRIPT variable defined only if running on the master. This logic would be customized to meet local needs, e.g. perhaps checking hostname if snapshots can only be run on certiain machines. The custom logic would live in the /p4/common/config/p4_N.vars file. As a for-example, The sample logic shows a commented-out call to an Amazon Web Services (AWS) Elastic Block Store (EBS) snapshot. (This could just as easily reference a NetApp filer snapshot for an on-prem installation.) * A change to dump_checkpoint() in backup_functions.sh to call the snapshot script defined by the $SNAPSHOT_SCRIPT variable if defined. As coded, a failure of either the checkpoint or the snapshot will cause the script to report a failure. However, a failure of the checkpoint will not prevent the snapshot from being attempted. Possible future tweaks: Consider adding a LIMIT_ONE_DAILY_SNAPSHOT setting (akin to the LIMIT_ONE_DAILY_CHECKPOINT setting in the Windows SDP). |
||
#77 | 26221 | C. Thomas Tyler | Corrected typo in comments only; no functional change. | ||
#76 | 26156 | C. Thomas Tyler |
Shellcheck v0.6.0 and style compliance changes. Fixed minor bugs related to capturing output, driven by shellcheck changes. Fixed sync_replica.sh for standby replicas with the configurable rpl.journalcopy.location=1 (SDP-424), removing an unnecessary and broken check. Fixed test for pre-existing checkpoints in function recreate_offline_db_files() so that it checks only for the master server, fixing an issue where it would report "No checkpoints found - run live_checkpoint.sh" when used on a replica where checkpoints might legatimately not exist. Also fixed the actual test itself. Replaced P4COMMITSERVER variable with P4MASTERPORT to support daisy chain scenarios, removing the assumption that all servers target only the master. (This assumption was made only in journal_watch.sh). Enhanced check_vars() to report individual missing environment variables, and to add more info on how to fix environment problems (e.g. adding to p4_vars or p4_N.vars files). Fixed bug in check_dirs() where a missing directory check intended to result in a die() call would result in a syntax error instead. These files have been field tested. |
||
#75 | 25594 | C. Thomas Tyler |
Removed journalPrefix as command line parameter during journal rotation, deferring to db.config values. Removed explicit specification of journalPrefix as a command line argument to 'p4 admin journal' and 'p4d -jj' commands. Specifying the prefix is redundant as journalPrefix values are defined for the master server and any/all replicas/edge servers in db.config, and db.config is the One Source of Truth for journalPrefix. The SDP defines best practice values for journalPrefix, with elements of this in mkrep.sh (setting configurables for replicas) and mkdirs.sh (preparing directories during initial installation). Note that verify_sdp.sh verifies that journalPrefix values are correct as per SDP standards. Also cleaned up comments. #review @mshields |
||
#74 | 25374 | Russell C. Jackson (Rusty) |
New script for performing a parallel checkpoint. Run as follows: parallel_ckp.sh <instance> -P <threads> New script to restore a parallel checkpoint file to the offline database in case a recovery is needed. Run as follows: parallel_ckp_restore.sh <instance> -f <parallel_ckp_file.tgz> -P <threads> |
||
#73 | 25276 | ashaikh |
Fix syntax errors for SDP_ADMIN_PASSWORD_FILE The current backup_functions.sh script isn't working correctly due to syntax errors. When I try to run 'run_if_master.sh 1 echo a' on a master server, nothing happens. After I made the following fixes, it worked as expected. |
||
#72 | 24357 | C. Thomas Tyler |
Added SDP_ADMIN_PASSWORD_FILE variable in p4_vars.template, and also added a default value in backup_functions.sh. Also added comments in p4_vars explaining the 'set +u' bit. Adding SDP_ADMIN_PASSWORD_FILE is an enabling change for an upcoming change to mkrep.sh. Updated p4login and p4login-super.sh to reference this variable. Normalized p4login-super.sh to accept SDP instance parameter, which (as with other scripts) is optional of SDP_INSTANCE is already defined, else required. Also chmod +x p4login-super.sh. #review @robert_cowham |
||
#71 | 24330 | ashaikh |
Add a script to archive/purge revisions based on number of days. This script will allow you to archive files and optionally purge files based on a configurable number of days and minimum revisions that you want to keep. This is useful if you want to keep a certain number of days worth of files instead of a specific number of revisions. This script currently only accepts paths to specific files, it does not support globbing or wildcards. |
||
#70 | 24191 | C. Thomas Tyler |
Submit on behalf of ashaikh after merging/resolving with current tip revision. |
||
#69 | 24189 | ashaikh |
Add a new SDP script to warn and/or rotate live journal due to configurable low disk space condition A new journal_watch.sh script is being introduced with alerting and journal truncate functionality for environments with high Perforce activity. Two separate thresholds can be configured, one for a warn alert (notify via email - optional) and another to trigger a journal rotation. Values for the threshold can be defined via K, M, G or a percentage of disk space. These values represent the available disk space on the journal volume (free space minus filesys.P4Journal.min) before the script takes action. Another optional feature is to enable KEEPJNL override which dynamically calculates the number of journals to keep based on the oldest checkpoint on disk. When this is enabled, KEEPJNL is temporarily overridden during the journal_watch.sh run to prevent removing any journals that may be required to recover from a checkpoint on disk and all the journals required to bring the server back to a current state. This script can be run on a master/commit, edge and replica server. If the edge or replica server is running low on journal space, it will trigger a journal rotation on the master/commit server (based on P4TARGET value), which then will cause the journals to rotate on the edge/replica servers. <code>#review</code> |
||
#68 | 23848 | Robert Cowham |
Missed a fix for the crontab spamming when track=1 If you do p4d -jd - you need to pipe 2>&1 |
||
#67 | 23639 | Robert Cowham | Remove trailing spaces on all lines - important to make line continuations work. | ||
#66 | 23637 | Robert Cowham | Avoid spamming emails from crontab when track=1 | ||
#65 | 23429 | Robert Cowham | Fix a couple of shellcheck warnings | ||
#64 | 23266 | C. Thomas Tyler |
Fixes and Enhancements: * Enabled daily_checkpoint.sh to operate on edge servers, to keep /p4/N/offline_db current on those hosts for site-local recovery w/o requiring a site-local replica (though having a site-local replica can still be useful). * Disabled live_checkpoint.sh for edge servers. * More fully support topologies using edge servers, in both geographically distributed and horizontal scaling "workspace server" solutions. * Fix broken EDGESERVER value definition. * Modified name of SDP counter that gets set when a checkpoint is taken to incorporate ServerID, so now the counter name will look like lastSDPCheckpoint.master.1, or lastSDPCheckpoint.p4d_edge_sfo, rather than just lastSDPCheckpoint. There will be multiple such counters in a topology that uses edge servers, and/or which takes checkpoints on replicas. * Added comments for all functions. For the master server, journalPrefix remains: /p4/N/checkpoints/p4_N The /p4/N/checkpoints is reserved for writing by the master/commit server only. For non-standby (possibly filtered) replicas and edge servers, journalPrefix is: /p4/N/checkpoints.<ShortServerID>/p4_N.<ShortServerID> Here, ShortServerID is just the ServerID with the 'p4d_' prefix trimmed, since it is redundant in this context. See mkrep.sh, which enshrines a ServerID (server spec) naming standard, with values like 'p4d_fr_bos' (forwarding replica in Boston) and p4d_edge_blr (Edge server in Bangalore). So the journalPrefix for the p4d_edge_bos replica would be: /p4/N/checkpoints.edge_bos/p4_N.edge_bos For "standby" (aka journalcopy) replicas, journalPrefix is set to /p4/N/journals.rep. which is written to the $LOGS volume, due to the nature of standby replicas using journalPrefix to write active server logs to pre-rotated journals. Some take-away to be updated in docs: * The /p4/N/checkpoints folder must be reserved for checkpoints that originate on the master. 
It should be safe to rsync this folder (with --delete if desired) to any replica or edge server. This is consistent with the current SDP. * I want to change 'journals.rep' to 'checkpoints.<ShortServerID>' for non-standby replicas, to ensure that checkpoints and journals taken on those hosts are written to a volume where they are backed up. * In sites with multiple edge servers, some sharing archive files ('workspace servers'), multiple edge servers will share the same SAN. So we want one checkpoints dir per ServerID, and we want that dir to be on the /hxdepots volume. Note that the journalPrefix for replicas was a fixed /p4/N/journals.rep. This was on the /hxlogs volume - a presumably fast-for-writes volume, but typically NOT backed up and not very large. This change puts it under /p4/N/checkpoints.* for edge servers and non-standby replicas, but ensures other replica types and edge servers can generate checkpoints to a location that is backed up and has plenty of storage capacity. For standby replicas only (which cannot be filtered), the journalPrefix remains /p4/N/journals.rep on the /hxlogs volume. |
||
#63 | 23228 | Russell C. Jackson (Rusty) | Adding back a fix that I put in revsion 43 that somehow disappered in revision 44. | ||
#62 | 23031 | C. Thomas Tyler |
Simplified stop_p4d() in backup functions to just call the init script. Previously stop_p4d() had 'wait' logic to wait for p4d to stop before trying to stop with a 'kill' signal, but this is no longer needed since the modern init script (with logic in p4d_base) now does a 'kill' anyway, and also has the 'wait' logic and will exit only when p4d is well and truly down. In upgrade.sh, fixed issue where start/stop of p4broker and p4p went directly to the screen instead of the log. Also changed to call start/stop init scripts for p4d directly, just as for other services. Also enhanced upgrade.sh logging: * Log file name to incorporate SDP instance (redundant but nice). * Remove pesky ':' characters from the datestamp in log file name, as ':' chars in file names wreak havoc with 'scp' commands and require escaping on the command line. * Added log comments indicating which databases are being upgraded ($P4ROOT and $OFFLINE_DB). Also added common explaining use of '-t' flag in 'p4d -xu' call for offline databases. |
||
#61 | 23022 | Sven Erik Knop |
Simple fix for daily_checkpoint if the master server has its own server spec. If the master server has a server spec matching the serverid name, the script will check if the server is an edge server. This is done to avoid off-by-1 errors in the journal counter. The logic for the check was broken, though, resulting in checkpoints failing with "checkpoint.xxx.gz already exist, check setup" This fix avoids the problem. Bash script arithmetic is not as easy. |
||
#60 | 22802 | Russell C. Jackson (Rusty) | Fixed the test to check for the variable and do the bitwise mask test correctly. | ||
#59 | 22800 | Russell C. Jackson (Rusty) |
Correct the logic on checking for an edge server. Removed () from the set_vars call that got in via a cut and paste. |
||
#58 | 22679 | Russell C. Jackson (Rusty) | Removed duplicate line that was producing a cron message. | ||
#57 | 22658 | Russell C. Jackson (Rusty) |
Added line to remove the ckp_running.txt file when the checkpoint fails through the die function because the checkpoint is no longer running, and this file prevents the next checkpoint from running successfully. |
||
#56 | 22633 | Russell C. Jackson (Rusty) | Removed Debug and extra echo of journal number to eliminate cron messages. | ||
#55 | 22387 | Robert Cowham |
Fix journal rotation off-by-one error Also make sure that -jj rotation specifies prefix. |
||
#54 | 22345 | C. Thomas Tyler | Another tweak. | ||
#53 | 22343 | C. Thomas Tyler |
Fixed off-by-one error in new offline journal counter calculation logic. Bypassing pre-commit review until test suite runs clean again. #review-22344 |
||
#52 | 22277 | C. Thomas Tyler | Debugging. | ||
#51 | 22276 | C. Thomas Tyler | Debugging. | ||
#50 | 22274 | C. Thomas Tyler |
Fixed bug where detection of journal number fails for new/empty data set. Removed msg() and bail() functions, and changed approach to make the existing log() and die() functions behave correctly regardless of whether $LOGFILE is defined. If $LOGFILE is defined, log() silently write to the log file, otherwise writes to the screen (stdout). If $LOGFILE is defined, die() writes to the log file and sends an email, otherwise writes to the screen (stdout). If on a tty, error is duplicated in stderr. To Do: Improve in-code comments. Bypassing pre-commit review until tests pass. #review-22275 |
||
#49 | 22272 | C. Thomas Tyler |
Enhanced error message in check_journalnum() in backup_functions.sh. Bypassing pre-commit review until tests pass. #review-22273 |
||
#48 | 22270 | C. Thomas Tyler |
Attempting fix of build failure. Bypassing pre-commit review. #review-22271 |
||
#47 | 22250 | C. Thomas Tyler |
Further refinements to the new 'rotate journal on p4d start' change: * Fixed p4d_truncate_journal so it has less environment dependencies (e.g. doesn't depend on LOGFILE, etc.) and doesn't try sending email. * Introduced msg() and bail(), counterparts to log() and die() which don't try to write to LOGFILE and don't try to send email. * Added call to get_journalnum() before call to p4d_truncate_journal(). * Fixed logic in get_journalnum() so it gets the journal number w/o needing p4d to be up. * I think I fixed the syntax error in bitwise operator check when setting EDGE_SERVER. It works on a non-edge server (sets EDGESERVER=0). For now I have it doing an 'echo EDGESERVER=$EDGESERVER', but need to test that it correctly sets EDGESERVER=1 on an edge server. TO DO: Remove that 'echo EDGESERVER=$EDGESERVER' once we verify it correctly sets the value for $EDGESERVER. (Or not?) |
||
#46 | 22239 | Russell C. Jackson (Rusty) |
Change set_vars to look up the edge server directly in the database so the server does not have to be on-line to check. Fix for Job: SDP-223 |
||
#45 | 22066 | Russell C. Jackson (Rusty) | Added rotate for p4verify.log instead of just deleting the prior one. | ||
#44 | 21624 | C. Thomas Tyler |
Fixed issue with mail sending a usage error on Ubuntu, which does not accept the '-V' flag to check the version. |
||
#43 | 21580 | Russell C. Jackson (Rusty) |
Changed compare journal numbers function to only fail if root journal number is greater than offline_db. The not equal check was preventing the recreate_db_sync_replica.sh script from being used to fix a replica that was out of sync with the master. |
||
#42 | 21322 | Russell C. Jackson (Rusty) |
#review-21323 Forgot server.id |
||
#41 | 21318 | Russell C. Jackson (Rusty) |
#review-21319 Added commands to move license*, rdb.lbr and state* from P4ROOT to OFFLINE_DB before switching the links. Added command to remove the db.* files from offline_db/save as well before trying to recreate the offline database. |
||
#40 | 21178 | Russell C. Jackson (Rusty) |
Change the SDP so that root and offline_db can be on different volumes and still accomplish a fast database recovery using recreate_db_checkpoint.sh and recreate_db_sync_replica.sh. This is done by switching the links now rather than moving the db files. |
||
#39 | 20970 | Russell C. Jackson (Rusty) |
Changed to use the standard remove log function on the p4login log. We don't need to keep anymore than the keeplogs specified number of these logs around. It doesn't matter if they are all in the last hour or the last seven days. The only need for a p4login log is for debugging something not working. Anyone that needs long term tracking of logins can turn on the auth structured log to track the logins. |
||
#38 | 20964 | adrian_waters | Include removal of the p4login.*.log files in daily cleanup | ||
#37 | 20940 | Russell C. Jackson (Rusty) |
Drop JOURNALNUM from the rotated log names because it forces you to wait to rotate the prior logs until you get the journal number and creates a problem where the error that you couldn't get the journal number ends up at the end of the previous days log file, and that is what gets email out. That causes confusion for the person trying to see what the error is. Moved all rotate_last_run_logs up to the point right after we set the environment. |
||
#36 | 20822 | C. Thomas Tyler |
Change logic to use p4d init script only from /p4/N/bin. The current logic sets a variable essentially preferring the p4d init script in /etc/init.d, using the one in /p4/N/bin only if the one in /etc/init.d doesn't exist as a file (and would not be selected if it was a symlink). Reasons: * Referencing the file/symlink in /etc/init.d introduces potentially complex and confusing behavior. If there were a file in /etc/init.d rather than symlink'd, that could be bad if it doesn't get updated with new versions of the SDP, where stuff in /p4/N/bin should be reliably updated. * I just expect the SDP to always use its own files in /p4/N/bin, under direct control of the perforce user, rather than external references to it. In a proper SDP deployment on Linux, /etc/init.d should contain symlinks for SDP init scripts anyway. But why trust that if there's no need? * If there is a file in /etc/init.d and it's different than /p4/N/bin for some reason, we should prefer the one in /p4/N/bin. * The symlinks in /etc/init.d are outside the direct control of the perforce user, and could point to who-knows-where. |
||
#35 | 20749 | C. Thomas Tyler |
Approved and committed, but I believe that the shared data setting is always set to false on the master and we should look at fixing that in another change. Enhanced p4login again. Improvements: Default behavior with no arguments gives the desired results. For example, if run on a master, we login on the super user P4USER to P4PORT. If run on a replica/edge and auth.id is set, we login P4USER to the P4TARGET port of the replica. All other login functionality, such as logging in the replication service user on a replica, logging in supplemental automation users, is now accessed via new flags. A usage message is now available via '-h' and '-man' options. The new synopsis is: p4login [<instance>] [-p <port> | -service] [-automation] [-all] The <instance> parameter is the only non-flag positional parameter, and can be omitted if SDP_INSTANCE is already defined (as is typical when called by scripts). With this change, several other scripts calling either the 'p4login' script or 'p4 login' commands were normalized to call p4login as appropriate given the new usage. Reviewer Note: Review p4login first, then other files. Most changes are in p4login. In other scripts calling p4login, calls similar to: $P4BIN -u $P4USER -p $P4PORT login < /path/to/pwd are replaced with: $P4CBIN/p4login In other scripts calling p4login, calls similar to: $P4BIN -p $P4MASTERPORT login < /path/to/pwd are replaced with: $P4CBIN/p4login -p $P4MASTERPORT Note that, if auth.id is set, calling 'p4login' actually has the same behavior as 'p4login -p $P4MASTERPORT', since p4login called on a replica with auth.id set will just login to the master port anyway. Depending on intent, sometimes $P4BIN/p4login -service is used. == Misc Cleanup == In doing the cleanup: * Fixed a hard-coding-to-instance-1 bug in broker_rotate.sh. 
* Fixed an inconsistency in recreate_db_sync_replica.sh, where it did just a regular login rather than a login -a as done in other places (for compatibility with some multi-interface NIC card configs). == p4login Call Normalization == Code cleanup was done to normalize calls to p4login, such that: 1) the call starts with $P4CBIN/p4login (not the hard-coded path), and 2) logic to redirect stdout/stderr to /dev/null was removed, since it's not necessary with p4login. (And if p4login ever does generate any unwanted output, we only fix it in one place). == Tweak to instance_vars.template == This change includes a tweak to set P4MASTERPORT dynamically on a replica to ensure the value precisely matches P4TARGET for the given replica. This will reduce a source of problems when SSL is used, as it is particularly sensitive to the precise P4PORT values used, and will also help for environments which have not yet set auth.id. If the port cannot be determined dynamically, we fall back to the old logic using the assigned value. == Tweak to SDP_ALWAYS_LOGIN behavior == This used to default to 1, now it defaults to 0. At this point we should no longer need to force logins, and in fact doing so can get into a 'p4 login' hang situation with auth.id set. Best to avoid unnecessary logins if we already have a valid ticket. (I think the need to force a login may have gone away with p4d patches). == Obsolete Script == With this change, svclogin.sh is now obsolete. All it was doing was a few redundant 'p4 login' commands followed by a call to p4login anyway. == Testing == Our test suite doesn't fully cover this change, so additional manual testing was done in the Battle School lab environment. |
||
#34 | 20637 | Russell C. Jackson (Rusty) |
Fixed the real cause of the problem and put the redirects to LOGFILE back. The actual cause of the problem was that we were rotating the sync_replica.log file twice within that function because of the call to rotate $LOGFILE and a second call to rotate "sync_replica.log". I removed the 2nd call to rotate the sync_replica.log. |
||
#33 | 20636 | Russell C. Jackson (Rusty) | Changed mv and gzip in rotate log to go to /dev/null to avoid stomping on the file we just rotated. | ||
#32 | 20170 | Russell C. Jackson (Rusty) |
Moved password and users into the config directory to allow for instance specific users and passwords. Ran into a case where two different teams were sharing the same server hardware and needed this type of differentiation. Surprised that we haven't hit this sooner. Also defaulted mkdirs to use the numeric ports since this is the most common installation. |
||
#31 | 19851 | Robert Cowham |
Check for usable offline_db before creating checkpoint work file. This avoids an error right at the start locking out the utility which will fix said error! |
||
#30 | 19768 | UnstoppableDrew |
@tom_tyler @russell_jackson Bug fix for running p4master_run as root, and some comment header cleanup. Job 000543 p4master_run: Preserve original arguments list and use this when exec'ing as $OSUSER. backup_functions.sh: Add text about sourcing p4_vars yourself instead of using p4master_run. update_limits.py: Run p4login directly without p4master_run since p4login calls p4_vars now. everything else: Remove comment block about needing to run with p4master_run. Reword comment about SDP_INSTANCE since it is not always an integer value. |
||
#29 | 19523 | Russell C. Jackson (Rusty) |
Added a KEEPJNLS variable to allow you to keep more journals than checkpoints in case you rotate the journal more frequently than you run checkpoints. |
||
#28 | 19113 | Russell C. Jackson (Rusty) |
Changed name of daily_backup.sh to daily_checkpoint.sh Changed name of weekly_backup.sh to recreate_db_checkpoint.sh Updated crontabs with new names, and changed to run recreate_db_checkpoint on the 1st Sat. of Jan. and July. For most companies, this is a better practice than recreating weekly per discussion with Anton. Remove solaris crontab since Solaris is pretty much dead, and we don't test on it. Updated docs to reflect name changes, and did a little cleanup of other sections while I was in there. |
||
#27 | 19105 | Russell C. Jackson (Rusty) |
This change uses p4 admin journal command against the master server to rotate the journal. Added a p4d_truncate_journal to use in weekly_back that still rotates via p4d. The purpose of this change is to allow you to run daily_backup.sh on a standby machine where you have a shared depotdata volume. If you want to use daily on the standby machine, you have to put offline_db on the shared depotdata volume which means you will NOT want to run weekly_backup.sh on the master very often, but that is basically what Anton is recommending now. I am currently testing this setup on a production environment, and if it works well, I will change mkdirs.sh to put offline_db on the depotdata volume by default and update the crontabs not to run weekly anymore. #review-19083 |
||
#26 | 18934 | C. Thomas Tyler |
Moved ckp_running.txt to $LOGS (/p4/n/logs) from /p4/n/checkpoints: * Avoids it getting rsync'd by sync_replica.sh or by common human admin rsyncs of the /p4/n/checkpoints dir. * It should be in a volume that's not shared. * Puts it in the logs directory where you go look when things break. |
||
#25 | 18617 | Russell C. Jackson (Rusty) |
#review-18610 Fixed a bug with check_journalnum where it was being called to check the offline journal number, but the function was hard coded to JOURNALNUM. Implemented a function to compare the journal numbers of P4ROOT and OFFLINE_DB before switching the db files as an extra layer of protection to avoid data loss. |
||
#24 | 18595 | Russell C. Jackson (Rusty) |
Fixed a log rotation bug that has been around for a long time. If you rotated the journal more times than KEEPCKPS and KEEPLOGS, the old method would remove all of your logs and checkpoints because it didn't actually look at how many were on disk. Found the bug while reviewing the test harness with Robert. Adjusted the test harness to account for the change. (Stole from Robert's shelf.) |
||
#23 | 18590 | Robert Cowham |
Fix failing tests. Change log filename format to use - instead of : as separator for date/time component |
||
#22 | 18587 | Russell C. Jackson (Rusty) |
Reworked the log rotation stuff in backup_functions.sh to make it cleaner and handle the new log from recreate_offline_db.sh. Modified recreate_offline_db.sh to add comments about a bad checkpoint. Also made it create its own log file since it isn't doing a checkpoint. Removed the log rotation for the same reason. Moved the LOGFILE setting out to all of scripts to make it more obvious for future scripts that you need to set that variable in your script so that it doesn't just default to checkpoint.log. Moved the functions in weekly_backup.sh and recreate_offline_db.sh into backup_functions.sh where they belong for consistency. Modified backup_functions.sh to use a consistent naming convention for all the rotated log files rather than checkpoint.log being unique. Replaced all back ticks with the newer bash $() method. Removed all of the line wrapping since I am pretty sure that none of us are working on an 80 character terminal these days and it is easier to read this way. |
||
#21 | 18533 | Robert Cowham |
Put a date/time suffix onto checkpoint.log.* files in case of any errors to avoid them being overwritten. Make remove_old_logs tidy up appropriately. |
||
#20 | 18532 | Robert Cowham | Correct log message regarding journals replays | ||
#19 | 18484 | Russell C. Jackson (Rusty) | Added comment on WAITCOUNT to explain the value. | ||
#18 | 18450 | Russell C. Jackson (Rusty) |
Added a kill for the p4d_stop function in case p4d doesn't shut down. In the process of testing this, I discovered that using $P4DBIN in this case was a bug that didn't work when running in case insensitive mode because the executable doesn't match what is actually running since we end up calling p4d from /p4/common/bin. Corrected the grep so that it would match in either case. #review-18430 |
||
#17 | 16335 | C. Thomas Tyler |
Routine Merge Down to dev from main using: p4 merge -b perforce_software-sdp-dev |
||
#16 | 16029 | C. Thomas Tyler |
Routine merge to dev from main using: p4 merge -b perforce_software-sdp-dev |
||
#15 | 15797 | C. Thomas Tyler | Routine Merge Down to dev from main for SDP. | ||
#14 | 15778 | C. Thomas Tyler | Routine Merge Down to dev from main. | ||
#13 | 15376 | adrian_waters | formatting only - fix spacing; there's inconsistent use of tabs/spaces throughout the file - needs cleanup at some point. | ||
#12 | 15375 | adrian_waters | Routine merge-down from main->dev | ||
#11 | 15374 | adrian_waters |
- Ensure backup scripts are run as the OSUSER (to prevent accidental running as root); - in scripts where LOGFILE value is changed from the 'checkpoint.log' set by set_vars, ensure the new assignment is before check_dirs is called, otherwise errors could be written to the 'wrong' log - in 'die()' - detect if running from terminal & also send output to stderr |
||
#10 | 13931 | C. Thomas Tyler | Routine merge-down to dev from main. | ||
#9 | 13906 | C. Thomas Tyler |
Normalized P4INSTANCE to SDP_INSTANCE to get Unix/Windows implementations in sync. Reasons: 1. Things that interact with SDP in both Unix and Windows environments shouldn't have to account for this obscure SDP difference between Unix and Windows. (I came across this doing CBD work). 2. The Windows and Unix scripts have different variable names for defining the same concept, the SDP instance. Unix uses P4INSTANCE, while Windows uses SDP_INSTANCE. 3. This instance tag, a data set identifier, is an SDP concept. I prefer the SDP_INSTANCE name over P4INSTANCE, so I propose to normalize to SDP_INSTANCE. 4. The P4INSTANCE name makes it look like a setting that might be recognized by the p4d itself, which it is not. (There are other such things such as P4SERVER that could perhaps be renamed as a separate task; but I'm not sure we want to totally disallow the P4 prefix for variable names. It looks too right to be wrong in same cases, like P4BIN and P4DBIN. That's a discussion for another day, outside the scope of this task). Meanwhile: * Fixed a bug in the Windows 2013.3 upgrade script that was referencing undefined P4INSTANCE, as the Windows environment defined only SDP_INSTANCE. * Had P4INSTANCE been removed completely, this change would likely cause trouble for users doing updates for existing SDP installations. So, though it involves slight technical debt, I opted to keep a redundant definition of P4INSTANCE in p4_vars.template, with comments indicating SDP_INSTANCE should be used in favor of P4INSTANCE, with a warning that P4INSTANCE may go away in a future release. This should avoid unnecessary upgrade pain. * In mkdirs.sh, the variable name was INSTANCE rather than SDP_INSTANCE. I changed that as well. That required manual change rather than sub/replace to avoid corrupting other similar variable names (e.g. MASTERINSTANCE). This is a trivial change technically (a substitute/replace, plus tweaks in p4_vars.template), but impacts many files. |
||
#8 | 12169 | Russell C. Jackson (Rusty) |
Updated copyright date to 2015 Updated shell scripts to require an instance parameter to eliminate the need for calling p4master_run. Python and Perl still need it since you have to set the environment for them to run in. Incorporated comments from reviewers. Left the . instead of source as that seems more common in the field and has the same functionality. |
||
#7 | 12028 | C. Thomas Tyler | Refreshed SDP dev branch, merging down from main. | ||
#6 | 11541 | Russell C. Jackson (Rusty) | Keeping dev up to date. | ||
#5 | 11535 | Russell C. Jackson (Rusty) | Updated dev from main. | ||
#4 | 11509 | Russell C. Jackson (Rusty) |
Added sync_replica.log to backup function log rotations, and added rm on existing gzipped logs with the same name in order to keep the script from hanging waiting for a response to overwrite. Added sync_shared_replica.sh and weekly_sync_shared_replica.sh to support replicas with shared depotdata storage. No rsync is necessary. The logs volume must not be a shared volume with these scripts though. |
||
#3 | 11483 | Russell C. Jackson (Rusty) | Brought over changes from RCJ backup_functions.sh | ||
#2 | 11463 | Russell C. Jackson (Rusty) | Updated dev to prepare for Summit agreed changes. | ||
#1 | 10638 | C. Thomas Tyler | Populate perforce_software-sdp-dev. | ||
//guest/perforce_software/sdp/main/Server/Unix/p4/common/bin/backup_functions.sh | |||||
#1 | 10148 | C. Thomas Tyler | Promoted the Perforce Server Deployment Package to The Workshop. |