# verify_sdp.sh #3

#!/bin/bash
#==============================================================================
# Copyright and license info is available in the LICENSE file included with
# the Server Deployment Package (SDP), and also available online:
# https://swarm.workshop.perforce.com/projects/perforce-software-sdp/view/main/LICENSE
#------------------------------------------------------------------------------

# verify_sdp.sh
# Verifies SDP structure and environment.

#==============================================================================
# Declarations and Environment

# Fixed locations of the SDP common bin directory and of the environment file
# (p4_vars) inside it.
export VS_SDP_P4CBIN=/p4/common/bin
export VS_SDP_ENV="$VS_SDP_P4CBIN/p4_vars"
# Fall back to a sentinel value so that a missing instance can be detected below.
export SDP_INSTANCE="${SDP_INSTANCE:-UnsetSDPInstance}"
# Exported without a value here; do_preflight_checks() assigns it.
export VS_SDP_OWNER

# The first positional parameter, when present, takes precedence over the
# inherited SDP_INSTANCE.  Note that $1 is used verbatim, even if it is an
# option such as -online; the command line processing loop further down
# re-assigns SDP_INSTANCE when it encounters a non-option argument.
export SDP_INSTANCE="${1:-$SDP_INSTANCE}"
# Neither the environment nor the command line provided an instance: give up.
if [[ $SDP_INSTANCE == UnsetSDPInstance ]]; then
   echo "Instance parameter not supplied."
   echo "You must supply the SDP instance as a parameter to this script."
   exit 1
fi

# Script version, reported by usage() and in the start-up message.
declare Version=5.3.1

# Integer state: command line flags (ServerOnline, ShowLog), running tallies
# (ErrorCount, CheckCount) and the final exit status (ExitCode).
declare -i ServerOnline=0 ErrorCount=0 CheckCount=0 ExitCode=0 ShowLog=1

# String scratch variables, filled in later by the main program and by
# do_preflight_checks().
declare BadLog="" ThisUser=""

# Heavy and light horizontal rules used to structure the log output.
declare H1="=============================================================================="
declare H2="------------------------------------------------------------------------------"

# 'Unset' is a sentinel; msg() writes to stdout for as long as LOGFILE has it.
export LOGFILE=Unset P4TMP=Unset

#==============================================================================
# Local Functions

# Note: This script does not use SDP library files, as its purpose is to
# verify the integrity of an SDP installation.  Thus, it has its own
# self-contained versions of some functions for which similar versions
# would normally be sourced from files in /p4/common/lib, like libbcore.sh.

# Micro-functions, small or one-liners used to avoid external dependencies,
# which is critical for this particular verify_sdp.sh script.
# msg: Emit the given text, interpreting backslash escapes.  While LOGFILE
# still holds the 'Unset' sentinel the text goes to stdout; afterwards it is
# appended to the file named by LOGFILE.
function msg () {
   if [[ "$LOGFILE" == Unset ]]; then
      echo -e "$*"
   else
      echo -e "$*" >> "$LOGFILE"
   fi
}
# errmsg: Report an error via msg() and add one to the global ErrorCount.
# $1 - error text; defaults to "Unknown Error".
function errmsg () {
   local text="${1:-Unknown Error}"

   msg "\\nError: ${text}\n"
   ErrorCount+=1
}
# bail: Report a fatal error and terminate the script.
# $1 - error text; defaults to "Unknown Error".
# $2 - exit status; defaults to 1.
# Before exiting, the log is displayed on stdout when ShowLog is 1 and the
# file named by LOGFILE exists and is not empty.
function bail () {
   local reason="${1:-Unknown Error}"
   local exitStatus="${2:-1}"

   errmsg "$reason"

   if [[ "$ShowLog" -eq 1 && -s "$LOGFILE" ]]; then
      cat "$LOGFILE"
   fi

   exit "$exitStatus"
}

#------------------------------------------------------------------------------
# Function: run ($cmd, $desc, $showOutput)
#
# Runs a command, with optional description, logging the command line that is
# executed and optionally also its output, and returning its exit code.
#
# Input:
# $1 - Command and arguments to execute, as a single string. Defaults to
#      'echo'.  The string is word-split by the shell, so arguments that
#      contain whitespace cannot be passed.
# $2 - Optional message to display describing what the command is doing.
# $3 - Numeric flag to show output; '1' (the default) appends the captured
#      output followed by an 'EXIT_CODE: N' line to the log, '0' suppresses it.
#
# Globals:
# P4TMP   - Directory in which the temporary capture file is created.
# LOGFILE - Destination of the captured output.  While it still holds the
#           'Unset' sentinel the output goes to stdout, as msg() does.
#
# Return Codes:
# The exit code of the command, or 1 if the temporary capture file could not
# be created (in which case the command is not executed).
#------------------------------------------------------------------------------
function run () {
   local cmd="${1:-echo}"
   local desc="${2:-}"
   local -i showOutput="${3:-1}"
   local -i exitCode=0
   local log=

   if ! log="$(mktemp "$P4TMP/run.XXXXXXXXXXX")"; then
      msg "Error: Could not create a temporary file in [$P4TMP]; not executing: $cmd"
      return 1
   fi

   [[ -n "$desc" ]] && msg "$desc"
   msg "Executing: $cmd"

   # Deliberately unquoted: $cmd carries the command and its arguments in one
   # string and relies on word splitting.
   # shellcheck disable=SC2086
   $cmd > "$log" 2>&1
   exitCode=$?

   if [[ "$showOutput" -eq 1 ]]; then
      echo "EXIT_CODE: $exitCode" >> "$log"
      # Follow the same convention as msg(): no log file chosen yet means
      # stdout.  LOGFILE is quoted so that paths with whitespace work.
      if [[ "$LOGFILE" != Unset ]]; then
         cat "$log" >> "$LOGFILE"
      else
         cat "$log"
      fi
   fi

   /bin/rm -f "$log"
   return "$exitCode"
}

#------------------------------------------------------------------------------
# Function: usage (required function)
#
# Displays usage information and then exits the script with status 1.
#
# Input:
# $1 - style, either -h (for short form) or -man (for man-page like format).
# The default is -h.
#
# $2 - error message (optional).  Specify this if usage() is called due to
# user error, in which case the given message is displayed first (on stderr),
# followed by the standard usage message on stdout (short or long depending on
# $1).  When displaying an error, $1 should usually be -h so that the longer
# usage message doesn't obscure the error message.
#
# Sample Usage:
# usage
# usage -man
# usage -h "Incorrect command line usage."
#
# This last example generates a usage error message followed by the short
# '-h' usage summary.
#------------------------------------------------------------------------------
function usage
{
   local helpStyle="${1:--h}"
   local usageError="${2:-Unset}"

   # The error, if any, goes to stderr ahead of the usage text.
   [[ "$usageError" == Unset ]] || \
      echo -e "\n\nUsage Error:\n\n$usageError\n\n" >&2

   # Short synopsis, shown for every style.
   echo "USAGE for verify_sdp.sh v$Version:

verify_sdp.sh [<instance>] [-online] [-si] [-L <log>] [-D]

   or

verify_sdp.sh -h|-man
"

   # The man-page style adds the long description.
   case "$helpStyle" in
      -man)
         echo -e "DESCRIPTION:

	This script verifies the current SDP setup for the specified instance.

	Useful if you change anything, particularly after an SDP upgrade.

OPTIONS:
<instance>
	Specify the SDP instances.  If not specified, the SDP_INSTANCE
	environment variable is used instead.  If the instance is not
	defined by a parameter and SDP_INSTANCE is not defined,
	exits immediately with an error message.

 -online
	Online mode.  Does additional checks that require P4D to be online.

 -si	Silent mode.  Does not display the generated log file to stdout
	at the end of processing.

 -L <log>
	Specify the log file to use.  The default is /p4/N/logs/verify_sdp.log

 -D	Set extreme debugging verbosity.

HELP OPTIONS:
 -h	Display short help message
 -man	Display man-style help message

EXAMPLES:
	This script is typically called after SDP update with only the instance
	name or number as an argument, e.g.:

	verify_sdp.sh 1

LOGGING:
	This script generates a log file and also displays it to stdout at the
	end of processing.  By default, the log is:
	/p4/N/logs/verify_sdp.log.

	The exception is usage errors, which result an error being sent to
	stderr followed usage info on stdout, followed by an immediate exit.

	If the '-si' (silent) flag is used, the log is generated, but its
	contents are not displayed to stdout at the end of processing.

EXIT CODES:
	An exit code of 0 indicates no errors were encounted attempting to
	perform verifications, and that all checks verified cleanly.
"
         ;;
   esac

   exit 1
}

#------------------------------------------------------------------------------
# Function: do_preflight_checks ()
#
# Performs the basic sanity checks that everything else depends on:
# 1. The essential utilities are found in the PATH.
# 2. It is possible to cd to $VS_SDP_P4CBIN.  On success the working
#    directory of the script is changed to that directory.
# 3. The current user owns $VS_SDP_P4CBIN.
#
# Globals:
# Reads VS_SDP_P4CBIN, LOGFILE and H2.  Sets VS_SDP_OWNER and ThisUser.
# Increments CheckCount once per tool and once for each of checks 2 and 3;
# ErrorCount is incremented through errmsg().
#
# Return Codes:
# 1 - A check failed; later checks are not attempted.
# 0 - All checks passed.
#
# Sample Usage:
# do_preflight_checks ||\
#    bail "Preflight checks failed. Aborting further checks."
#------------------------------------------------------------------------------
function do_preflight_checks () {

   local toolsList="date ls grep awk id head tail"
   local tool=
   # Remember the tally on entry so that only errors raised by the tool check
   # below decide its outcome, not errors recorded earlier by other checks.
   local -i errorsOnEntry=$ErrorCount

   msg "$H2\nDoing preflight sanity checks."
   msg "Preflight Check 1: Ensuring these utils are in PATH: $toolsList"

   for tool in $toolsList; do
      CheckCount+=1
      command -v "$tool" > /dev/null 2>&1 || \
         errmsg "Tool '$tool' not in PATH."
   done

   [[ $ErrorCount -eq $errorsOnEntry ]] || return 1

   msg "Verified: Essential tools are in the PATH."

   msg "Preflight Check 2: cd $VS_SDP_P4CBIN"

   CheckCount+=1
   # LOGFILE is quoted: unquoted, a log path containing whitespace makes the
   # redirection fail, which would be misreported as a failed cd.
   if ! cd "$VS_SDP_P4CBIN" >> "$LOGFILE" 2>&1; then
      errmsg "Could not cd to: $VS_SDP_P4CBIN"
      return 1
   fi

   msg "Verified: cd works to: $VS_SDP_P4CBIN"

   msg "Preflight Check 3: Checking current user owns $VS_SDP_P4CBIN"

   # The owner is the third field of the long listing of the current directory,
   # which is $VS_SDP_P4CBIN after the cd above.
   # shellcheck disable=SC2012
   VS_SDP_OWNER="$(ls -ld . | awk '{print $3}')"
   ThisUser="$(id -n -u)"

   CheckCount+=1
   if [[ "$ThisUser" == "$VS_SDP_OWNER" ]]; then
      msg "Verified: Current user [$ThisUser] owns $VS_SDP_P4CBIN"
   else
      errmsg "Current user [$ThisUser] does not own $VS_SDP_P4CBIN. This most likely means this script is running as the wrong user.  It could also mean the $VS_SDP_P4CBIN directory is not owned by by the correct owner, which should be the OS account under which the p4d process runs."
      return 1
   fi

   return 0
}


#------------------------------------------------------------------------------
# Function: check_file ($file, $errMsg)
#
# Checks for existence of a regular file, counting the check in CheckCount.
# Allows an optional custom error message describing the file, to be
# displayed if the file is missing.  The reported error has the form
# "<errMsg>: [FILE]."
#
# Inputs:
# $1 - Path of the file to check. Required.
# $2 - Optional error message prefix.  The default is "Missing file".
#
# Return Codes:
# 1 - The file does not exist or is not a regular file.
# 0 - The file exists.
#------------------------------------------------------------------------------
function check_file () {
   local path=$1
   local complaint=${2:-Missing file}

   CheckCount+=1
   msg "Checking existence of file $path"

   if [[ ! -f $path ]]; then
      errmsg "$complaint: [$path]."
      return 1
   fi

   return 0
}

#------------------------------------------------------------------------------
# Function: check_configurable ($instance, $configurable, $scope, $expectedVal, $errMsg1, $errMsg2)
#
# Check that a configurable is set, and optionally check that it is set to
# an expected value.  The settings are read from the output of
# '$P4DBIN -r $P4ROOT -cshow', whose lines are matched in the form
# '<scope>: <configurable> = <value>'.  Scope and configurable name are
# compared literally (a '.' in a name is not a wildcard), and the first
# matching line is used.
#
# Inputs:
# $1 - SDP Instance. Required.  (Accepted for interface consistency; the
#      function itself does not use it.)
# $2 - Configurable name. Required.
# $3 - Configurable scope/ServerID, as per 'p4 help configure'.  The default
#      is "any", meaning what it means with 'p4 configure set', i.e. that the
#      configurable is a global default.  The special value 'ALL' can
#      also be supplied, which has the special meaning of checking
#      if the configurable is defined for any ServerID, including the 'any'
#      value.  The value returned is that of the first setting encountered.
# $4 - Expected value of configurable. Optional. If defined, an additional check is
#      done, checking the current value against the expected value.  Optionally,
#      the special value UNDEF can be used, which reverses the exit code, such
#      that a happy zero is returned only if the value is not set.
# $5 - Optional error message to display if no value is defined.  See code
#      below for the default message.
# $6 - Optional error message to display if a value is defined but does not
#      match the expected value.  See code below for the default message.
#
# Return Codes:
# 1 - Verifications failed.
# 0 - Verifications passed.
#
# Sample Usage:
# check_configurable "$SDP_INSTANCE" journalPrefix
#
# check_configurable "$SDP_INSTANCE" journalPrefix any "$CHECKPOINTS/$P4SERVER"
#
# check_configurable "$SDP_INSTANCE" journalPrefix any "$CHECKPOINTS/$P4SERVER" ||\
#   bail "Yikes, journalPrefix is not set, all bets are off. Aborting."
#------------------------------------------------------------------------------
function check_configurable () {
   # shellcheck disable=SC2034
   local instance="$1"
   local configurable="$2"
   local scope="${3:-any}"
   local expectedValue="${4:-NoExpectedValue}"
   local errMsgMissing="${5:-No value defined}"
   local errMsgBadValue="${6:-Value does not match what is expected}"
   local detectedScope=
   local value=
   local line=
   local -i found=0
   CheckCount+=1

   # Scan the settings for the first matching line.  The quoted parts of the
   # patterns are matched literally by [[ ]] and by the prefix removal, so
   # names such as server.depot.root cannot match a different configurable.
   while IFS= read -r line; do
      if [[ "$scope" != "ALL" ]]; then
         [[ "$line" == "${scope}: ${configurable} = "* ]] || continue
         value="${line#"${scope}: ${configurable} = "}"
      else
         [[ "$line" == *": ${configurable} = "* ]] || continue
         # The ServerID is the text in front of the first colon.
         detectedScope="${line%%:*}"
         value="${line#*": ${configurable} = "}"
      fi
      found=1
      break
   done < <($P4DBIN -r "$P4ROOT" -cshow)

   if [[ "$expectedValue" != "UNDEF" ]]; then
      if [[ $found -eq 1 ]]; then
         if [[ "$scope" != "ALL" ]]; then
            msg "Verified: Configurable ${scope}:${configurable} is defined."
         else
            msg "Verified: Configurable ${configurable} is defined for at least once."
         fi
      else
         errmsg "$errMsgMissing for configurable [${scope}:${configurable}]."
         return 1
      fi
   else
      # UNDEF reverses the logic: finding a setting is the error.
      if [[ $found -eq 1 ]]; then
         if [[ "$scope" != "ALL" ]]; then
            errmsg "Configurable ${configurable} should not be set with 'p4 configure set' but has a value for ServerID ${scope} of: ${value}"
            return 1
         else
            errmsg "Configurable ${configurable} should not be set with 'p4 configure set' but has a value for ServerID ${detectedScope} of: ${value} (and possibly for other ServerIDs)."
            return 1
         fi
      else
         if [[ "$scope" != "ALL" ]]; then
            msg "Verified: Configurable ${scope}:${configurable} is undefined."
         else
            msg "Verified: Configurable ${configurable} is undefined."
         fi
      fi
   fi

   [[ "$expectedValue" == "NoExpectedValue" ]] && return 0

   # A fourth parameter was supplied, which counts as a second check.
   CheckCount+=1

   if [[ "$expectedValue" != "UNDEF" ]]; then
      if [[ "$value" == "$expectedValue" ]]; then
         msg "Verified: Configurable ${scope}:${configurable} has expected value [$value]."
      else
         errmsg "$errMsgBadValue for variable [${scope}:${configurable}]\n\tExpected value: [$expectedValue]\n\tActual value:   [$value]"
         return 1
      fi
   fi

   return 0
}

#------------------------------------------------------------------------------
# Function: check_env_var ($instance, $var, $expectedval, $msg1, $msg2)
#
# Check that a shell environment variable is set when sourcing the SDP
# environment. Optionally checks that variables are set to expected values.
#
# The variable is unset first, then the SDP environment file ($VS_SDP_ENV) is
# sourced for the given instance, so that the value inspected is the one the
# environment file defines.  The value is read by indirect expansion, i.e.
# exactly as stored, without word splitting or pathname expansion.
#
# Inputs:
# $1 - SDP Instance. Required.
# $2 - Variable name. Required.  Must be a valid shell identifier.
# $3 - Expected value of variable. Optional. If defined, an additional check is
#      done, checking the current value against the expected value.
# $4 - Optional error message to display if no value is defined.  See code
#      below for the default message.
# $5 - Optional error message to display if a value is defined but does not match
#      the expected value.  See code below for the default message.
#
# Return Codes:
# 1 - Verifications failed.
# 0 - Verifications passed.
#
# Sample Usage:
# check_env_var $SDP_INSTANCE P4JOURNAL "/p4/$SDP_INSTANCE/logs/journal"
#
# check_env_var $SDP_INSTANCE P4JOURNAL "/p4/$SDP_INSTANCE/logs/journal" ||\
#   bail "Yikes, P4JOURNAL is not set, all bets are off. Aborting."
#------------------------------------------------------------------------------
function check_env_var () {
   local instance="$1"
   local var="$2"
   local expectedValue="${3:-NoExpectedValue}"
   local errMsgMissing="${4:-No value defined}"
   local errMsgBadValue="${5:-Value does not match what is expected}"
   local value=
   CheckCount+=1

   # Indirect expansion needs a valid identifier; anything else is reported
   # as a failed check instead of being handed to the shell.
   if [[ ! "$var" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
      errmsg "Invalid variable name [$var]."
      return 1
   fi

   unset -v "$var"
   # shellcheck source=/p4/common/bin/p4_vars
   source "$VS_SDP_ENV" "$instance"

   # ${!var:-} yields the value of the variable named by $var, or an empty
   # string if it is unset, and is safe under 'set -u'.
   value="${!var:-}"

   # The function has always returned with nounset enabled; keep doing so.
   # NOTE(review): whether the sourced environment file changes shell options
   # cannot be determined from this script.
   set -u

   if [[ -n "$value" ]]; then
      msg "Verified: Variable ${var} is defined."
   else
      errmsg "$errMsgMissing for variable [$var]."
      return 1
   fi

   [[ "$expectedValue" == "NoExpectedValue" ]] && return 0

   # A third parameter was supplied, which counts as a second check.
   CheckCount+=1

   if [[ "$value" == "$expectedValue" ]]; then
      msg "Verified: Variable ${var} has expected value [$value]."
   else
      errmsg "$errMsgBadValue for variable [$var]\n\tExpected value: [$expectedValue]\n\tActual value:   [$value]"
      return 1
   fi

   return 0
}

#==============================================================================
# Command Line Processing

# Number of extra positional parameters consumed by the option just handled
# (i.e. the option's own argument); they are shifted off after the case
# statement.
declare -i shiftArgs=0

# An option argument may be missing while parsing, so nounset is relaxed for
# the duration of the loop and enabled again afterwards.
set +u
while (( $# > 0 )); do
   case "$1" in
      -h) usage -h;;
      -man) usage -man;;
      -online) ServerOnline=1;;
      -si) ShowLog=0;;
      -L) LOGFILE="$2"; shiftArgs=1;;
      -D) set -x;; # Debug; use 'set -x' mode.
      -*) usage -h "Unknown command line option ($1).";;
      *) export SDP_INSTANCE=$1;;
   esac

   # Drop the parameter just handled, then any option arguments it consumed.
   shift
   while (( shiftArgs > 0 )); do
      (( $# == 0 )) && usage -h "Incorrect number of arguments."
      shiftArgs=$(( shiftArgs - 1 ))
      shift
   done
done
set -u

#==============================================================================
# Command Line Verification

# Reject a missing or implausible instance before the environment is loaded.
# 'UnsetSDPInstance' is the sentinel assigned at the top of this script when
# no instance is available ('Unset' is still rejected as before).  An empty
# value can result from an empty command line argument, and a value starting
# with '-' is an option that was taken from $1 verbatim without any instance
# argument following it.
if [[ -z "${SDP_INSTANCE:-}" || "$SDP_INSTANCE" == Unset || \
      "$SDP_INSTANCE" == UnsetSDPInstance || "$SDP_INSTANCE" == -* ]]; then
   bail "The \$SDP_INSTANCE setting is not defined. It must be defined by doing:\n\n\tsource $VS_SDP_ENV <instance>\n\nor by passing in the instance name as a parameter to this script.\n"
fi

#==============================================================================
# Main Program

# Load the SDP environment for the requested instance; nothing below can work
# without it.
# shellcheck source=/p4/common/bin/p4_vars
if ! source "$VS_SDP_ENV" "$SDP_INSTANCE"; then
   bail "Failed to load SDP environment for instance $SDP_INSTANCE."
fi

# Load the SDP backup function library from the common bin directory named by
# P4CBIN.
# shellcheck source=/p4/common/bin/backup_functions.sh
if ! source "$P4CBIN/backup_functions.sh"; then
   bail "Failed to load backup_functions.sh."
fi

# Logs should be defined to /p4/N/logs after sourcing the environment
# file above; default to /tmp for cases of incomplete environment where
# LOGS is not defined.
export LOGS="${LOGS:-/tmp}"

[[ "$LOGFILE" == Unset ]] && export LOGFILE="$LOGS/verify_sdp.log"

# Make sure the log can be written before anything is sent to it.  On failure
# LOGFILE is reset to the 'Unset' sentinel first, so that the error reported
# by bail() goes to stdout rather than to the unusable log.
if [[ -f "$LOGFILE" ]]; then
   if [[ ! -w "$LOGFILE" ]]; then
      BadLog="$LOGFILE"
      export LOGFILE=Unset
      bail "Existing log file [$BadLog] is not writable. Aborting."
   fi
else
   # The log does not exist yet, so the directory that will contain it must
   # exist and be writable.  That directory is derived from LOGFILE itself,
   # since '-L <log>' may point outside of $LOGS.
   LogDir="."
   [[ "$LOGFILE" == */* ]] && LogDir="${LOGFILE%/*}"
   [[ -z "$LogDir" ]] && LogDir="/"
   if [[ ! -d "$LogDir" || ! -w "$LogDir" ]]; then
      export LOGFILE=Unset
      bail "Logs directory [$LogDir] is not writable. Aborting."
   fi
fi

# P4TMP still holds the 'Unset' sentinel if the environment did not define it.
[[ "$P4TMP" != Unset && -d "$P4TMP" && -w "$P4TMP" ]] ||\
   bail "SDP environment must define required P4TMP variable. Value must be a directory that is writable; value is: $P4TMP"

# rotate_log_file is not defined in this script; presumably it is provided by
# the backup_functions.sh library sourced earlier.
rotate_log_file "$LOGFILE" ".gz"

msg "${0##*/} v$Version Starting SDP verification at $(date +'%a %Y-%m-%d %H:%M:%S %Z')."

msg "\nIf you have any questions about the output from this script, contact support@perforce.com."

# Without a sane basic environment the remaining checks are meaningless.
do_preflight_checks ||\
   bail "Preflight checks failed. Aborting further checks."

msg "${H2}\nChecking environment variables."
check_env_var "$SDP_INSTANCE" SDP_INSTANCE
check_env_var "$SDP_INSTANCE" P4ROOT "/p4/$SDP_INSTANCE/root"
check_env_var "$SDP_INSTANCE" P4JOURNAL "/p4/$SDP_INSTANCE/logs/journal"

# Verify the setting of the admin password file variable, then verify that
# the file it names exists.  The file check is done whenever the variable has
# a value, whether or not that value is the expected one; it is skipped only
# if the variable is undefined, because there is no path to check then (and
# referencing the variable would abort the script under 'set -u').
check_env_var "$SDP_INSTANCE" SDP_ADMIN_PASSWORD_FILE "$P4CCFG/.p4passwd.$P4SERVER.admin"
if [[ -n "${SDP_ADMIN_PASSWORD_FILE:-}" ]]; then
   check_file "$SDP_ADMIN_PASSWORD_FILE" "SDP admin password file doesn't exist"
fi

msg "${H2}\nRunning standard checks typically called within SDP scripts."
# These functions are not defined in this script; presumably they are
# provided by the backup_functions.sh library sourced earlier.
check_vars
set_vars
check_dirs

msg "${H2}\nChecking for a few database files."
# Check db files exist, in both the live and the offline database directory.
# The directories are quoted so that each is passed on as exactly one path.
file=db.counters
for dir in "$P4ROOT" "$OFFLINE_DB"; do
   check_file "$dir/$file" "Expected database file doesn't exist"
done

msg "${H2}\nChecking for key files."
check_file "$OFFLINE_DB/offline_db_usable.txt" "Offline database not in a usable state."
check_file "$P4BIN" "The p4 binary (or symlink) doesn't exist"
check_file "$RC" "The p4d init script doesn't exist"
check_file "$P4TICKETS" "The P4TICKETS file doesn't exist"

msg "${H2}\nChecking configurables values."
# P4JOURNAL must not be defined as a configurable for any ServerID.
check_configurable "$SDP_INSTANCE" P4JOURNAL ALL UNDEF
check_configurable "$SDP_INSTANCE" journalPrefix any "$CHECKPOINTS/$P4SERVER"
check_configurable "$SDP_INSTANCE" server.depot.root any "$DEPOTS"

msg "${H2}\nChecking SDP structure."
# Counted like every other check, so that the totals reported at the end
# include it.
CheckCount+=1
if [[ -L "$P4HOME" ]]; then
   errmsg "This is a symlink, should be a local directory: $P4HOME"
fi

# Checks that need a running p4d are done only when -online was given.
if (( ServerOnline == 1 )); then
   msg "${H2}\nDoing online checks."
   CheckCount+=1
   # Output of the login script is not added to the log (third argument 0);
   # only its exit code matters here.
   if ! run "$P4CBIN/p4login" "Login check" 0; then
      errmsg "Login as P4USER $P4USER to P4PORT $P4PORT could not be verified."
   else
      msg "Verified: Login OK."
   fi
fi

### P4="$P4BIN -p $P4PORT -u $P4USER"

# TODO:
# - Ensure links are all present and correct for instance
# - Ensure db files present (and quick verification OK)
# - Ensure db directories named appropriately (e.g. db1/db2)
# - Ensure crontab includes at least daily_checkpoint.sh
# - Ensure checkpoints dir contains a checkpoint or two
# - service password verified for a replica
# - Add flag to check less-critical SDP configurables, and generate
#   warnings (rather than errors) if they are not set as expected,
#   using SDP configure_new_server.sh script as a guide.

# Summarize the outcome; any recorded error makes the script exit with 1.
if (( ErrorCount != 0 )); then
   msg "\n${H1}\n\nVerifications completed, with $ErrorCount errors detected in $CheckCount checks."
   ExitCode=1
else
   msg "\n${H1}\n\nALL CLEAN: $CheckCount verifications completed OK."
fi

# Unless silenced with -si, show the generated log on stdout.
[[ "$ShowLog" -eq 1 && -s "$LOGFILE" ]] && cat "$LOGFILE"

exit $ExitCode

#	Change	User	Description
#19	30297	C. Thomas Tyler	Released SDP 2023.2.30295 (2024/05/08). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#18	30043	C. Thomas Tyler	Released SDP 2023.2.30041 (2023/12/22). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#17	29143	C. Thomas Tyler	Released SDP 2022.1.29141 (2022/10/29). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#16	28858	C. Thomas Tyler	Released SDP 2022.1.28855 (2022/05/27). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#15	28651	C. Thomas Tyler	Released SDP 2021.2.28649 (2022/03/03). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#14	28259	C. Thomas Tyler	Released SDP 2021.1.28253 (2021/11/13). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#13	28240	C. Thomas Tyler	Released SDP 2021.1.28238 (2021/11/12). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#12	27921	C. Thomas Tyler	Released SDP 2020.1.27919 (2021/07/19). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#11	27761	C. Thomas Tyler	Released SDP 2020.1.27759 (2021/05/07). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#10	27416	C. Thomas Tyler	Released SDP 2020.1.27414 (2021/02/07). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#9	27407	C. Thomas Tyler	Released SDP 2020.1.27403 (2021/02/06). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#8	27400	C. Thomas Tyler	Released SDP 2020.1.27398 (2021/02/06). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#7	27354	C. Thomas Tyler	Released SDP 2020.1.27351 (2021/01/31). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#6	27331	C. Thomas Tyler	Released SDP 2020.1.27325 (2021/01/29). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#5	26470	C. Thomas Tyler	Released SDP 2019.3.26468 (2020/04/10). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#4	26403	C. Thomas Tyler	Released SDP 2019.3.26400 (2020/03/28). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#3	26161	C. Thomas Tyler	Released SDP 2019.3.26159 (2019/11/06). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#2	25380	C. Thomas Tyler	Released SDP 2019.1.25374 (2019/03/21). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#1	25245	C. Thomas Tyler	Released SDP 2019.1.25238 (2019/03/02). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
//guest/perforce_software/sdp/dev/Server/Unix/p4/common/bin/verify_sdp.sh
#2	25206	C. Thomas Tyler	Removed logic that uses 'p4d -cset' to force the value for P4JOURNAL, and also automatic journal rotation on server startup. Added related logic to verify_sdp.sh to ensure there is one source of truth for the P4JOURNAL definition. === On Journal Rotation at Server Startup === The goal with journal rotation on server startup is noble, to make it so any potential journal corruption always appears at the end of a numbered journal file, rather than being in the middle of the active journal. This can make it easier and faster to recover from journal corruption caused by sudden power loss, kernel panic, a p4d bug/crash, etc. However, the implementation causes problems (noted below). === On Forcing P4JOURNAL === The goal of forcing the value of P4JOURNAL via db.config is also noble, in that having a value anything other than the SDP standard can really wreak havoc with things. This is generally not an issue in a 'fresh' SDP install, but can be an issue (wreak havoc!) in cases where 'p4 configure' was used to set a value for P4JOURNAL that conflicts with the value defined by the SDP environment mechanism, which is in turn passed to 'p4d' on the command line. Even if the value is defined differently, it should be set to exactly one value, by exactly one mechanism. The current implementation causes problems (noted below). == Problems with setting P4JOURNAL in db.config == 1. Things Break The forced P4JOURNAL set via 'p4d -cset' causes a mild form of journal corruption that breaks 'standby' replicas using journalcopy, as this type of replica is extremely sensitive to the contents of every byte in the journal file, and doesn't allow for use of 'p4d -cset' to modify the P4JOURNAL file. While it does not cause any actual loss of data, it does require manual reset to fix things. In the case of a site-wide topology with a mandatory standby replica, it causes global replication to stall. 2. 
# Not our Place (not the place of SDP scripts) Based on the above and taking a step back, I think this script behavior of forcing a back-door journal rotation is simply too intrusive for what SDP scripts should be allowed to do. They live to have some understanding of p4d workings, but shouldn't pretend to have too much insight into the inner workings of p4d. == Problem with Always-On Journal Rotation on Start == 1. What the wah? This confuses admins by incrementing the journal counter unexpectedly. In Battle School training classes, for example, students (Perforce admins) are confused by seemingly random journal incrementing. While this could be documented and trained for, it violates the principle of least surprise, and is not typical 'p4d' behavior. 2. Always vs. Rare It rotates the journal even when there is no corruption, which is of course 99.99999% or more of the time at any given site. Anyone who has been through a corruption scenario is happy to have the corruption at the end rather than in the middle of a journal file -- as noted, the intent here is noble. But before we do any journal rotations, we should detect whether there is corruption. Turns out we have a means to detect journal corruption at the end of the current/active journal file, and should employ such detection and handle it in some appropriate manner, e.g. by expanding the 'force_start' logic in this p4d_base init script. Journal corruption detection and preliminary handling may be added in a future SDP release. When the journal is truly corrupted, global replication will stall in any case, so measures like journal file rotation may be called for in that scenario. 3. Accelerated Deletion of Backups Increased journal counter rotations result in unexpectedly fast removal of backups. Admins are used to thinking that roughly, "one journal rotation is roughly one day." Settings like KEEPLOGS, KEEPCKPS, and KEEPJNLS trigger off the number of journal rotations, not the number of actual calendar days. 
# Now, I think it's OK that journal rotations and days don't match precisely. In a typical "big deal" maintenance window, for example, there might be an additional 1-3 journal rotations induced by extra checkpoints or journals being created over the course of a maintenance activity. But there may be a dozen more 'p4d' restarts during sanity testing and playing around with things. With the current logic, each restart causes another journal rotation. By the end of the weekend, your next call to daily_checkpoint might remove more of your recent backups than you'd like or expect. (A long standing safety feature always preserves the last few, but still we don't want to delete more than desired.) === Food for Thought: KEEP* = number of days? === Making it so KEEPLOGS/KEEPJNLS/KEEPCKPS mean literally number of days rather than journal rotations is worthy of consideration. That's beyond the scope of this change though. #review @robert_cowham @josh
#1	24804	C. Thomas Tyler	Terminology tweak, 'validate' -> 'verify'. #review @robert_cowham
//guest/perforce_software/sdp/dev/Server/Unix/p4/common/bin/validate_sdp.sh
#3	24534	C. Thomas Tyler	Various enhancements and internal refactoring for validate_sdp.sh. Added important check that /p4/N is a dir, not a symlink. #review-24532 @robert_cowham
#2	24356	C. Thomas Tyler	Enhancements to validate_sdp.sh: * Added simple bold ALL CLEAN message to look for. * Added check_env_var() function to check shell environment variables, with some calls to it. * Added check_configurable() to check for configurables. This is implemented using back-door check methodology (using 'p4d_N -cshow') so values can be checked with P4D offline. This replaced stub function check_var(). * Removed stub function check_configurables(). It's easier to understand if all checks are in the Main section of the code. * Changed so checks requiring p4d to be online are not done by default; added '-online' flag to run those tests. This is because I anticipate typical usage of the validator to be requiring it to report ALL CLEAN before starting P4D after a server upgrade. * Added check for new $SDP_ADMIN_PASSWORD_FILE variable. * Added check admin password file pointed to by $SDP_ADMIN_PASSWORD_FILE. * Added errmsg() function, with corresponding tweak to bail(). * Consolidated Log and LOGFILE to just LOGFILE. * Removed a few items from TODO comments that got done. * Made a few tweaks for style normalization: - Functions are lowercase with underscore separators. - Functions vars are lowercase-initiated camelCase. - Indentation: 3 spaces for functions/loops/etc. * Added run() function replacing cmd() stub function. * Enhanced p4login check. * Added comment noting why this script uses self-contained copies of functions defined in other SDP files in /p4/common/lib. * And other things. Warning: In the short run, this may fail tests as the new SDP_ADMIN_PASSWORD_FILE variable is also pending review. #review @robert_cowham
#1	23640	Robert Cowham	Super basic validation - placeholder for many more tests to come!