#!/bin/bash
#==============================================================================
# Copyright and license info is available in the LICENSE file included with
# the Server Deployment Package (SDP), and also available online:
# https://swarm.workshop.perforce.com/projects/perforce-software-sdp/view/main/LICENSE
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
# sdp_health_check.sh
#
#==============================================================================
# Declarations and Environment
# Prefix global vars with HC_ to avoid name collisions.
# Script identity and run-state globals. Empty ones are filled in later.
declare ThisScript="${0##*/}"
declare Version=1.8.1
declare ThisUser=
declare Log=
declare DirList=

# Fixed SDP locations and the helper scripts this health check calls.
declare HC_SDP_P4CBIN="/p4/common/bin"
declare HC_SDP_P4CCFG="/p4/common/config"
declare HC_SDP_ENV="$HC_SDP_P4CBIN/p4_vars"
declare HC_SDP_MRUN="$HC_SDP_P4CBIN/p4master_run"
declare HC_SDP_VSDP="$HC_SDP_P4CBIN/verify_sdp.sh"
declare HC_SDP_P4LOGIN="${HC_SDP_P4CBIN}/p4login"
declare SDPInstanceList=
declare SDP_341_URL="https://swarm.workshop.perforce.com/jobs/SDP-341"
declare SDPOwner=

# Running totals; errmsg() and warnmsg() increment these.
declare -i ErrorCount=0
declare -i WarningCount=0

# Files whose contents are captured in the report. The main program appends
# one p4_<instance>.vars entry per detected instance, indexed by KeyFileCount.
declare -a KeyFiles=(
  "$HC_SDP_P4CBIN/p4_vars"
  "$HC_SDP_P4CBIN/backup_functions.sh"
)
declare -i KeyFileCount=${#KeyFiles[@]}

# Per-instance log files (under /p4/<instance>/logs) that are captured whole.
declare -a SmallLogFiles=(
  "checkpoint.log"
  "sync_replica.log"
  "replica_cleanup.log"
  "replica_status.log"
)
declare -i SmallLogCount=${#SmallLogFiles[@]}

# Section separators used in the report.
declare H1="=============================================================================="
declare H2="------------------------------------------------------------------------------"

# Name the log with a YYYYMMDD-HHMMSS stamp when 'date' is available.
# Note: %S is seconds (00-60); %s would append seconds since the epoch.
if [[ -n "$(command -v date)" ]]; then
  Log=/tmp/sdp_health_check.$(date +'%Y%m%d-%H%M%S').log
else
  Log=/tmp/sdp_health_check.log
fi
#==============================================================================
# Local Functions
# Note: This script does not use SDP library files, as its purpose is to
# verify the integrity of an SDP installation. Thus, it has its own
# self-contained versions of some functions that would normally be
# sourced in from files like /p4/common/lib/libcore.sh.
# Micro-functions, one-liners used to avoid external dependencies.
# msg: print all arguments as a single line, with backslash escapes such as
# \n interpreted by 'echo -e'.
function msg () {
  echo -e "$*"
}

# errmsg: print "Error: <text>" surrounded by blank lines and add one to
# ErrorCount. The text defaults to "Unknown Error".
function errmsg () {
  local text="${1:-Unknown Error}"
  msg "\\nError: ${text}\\n"
  ErrorCount+=1
}

# warnmsg: print "Warning: <text>" surrounded by blank lines and add one to
# WarningCount. The text defaults to "Unknown Warning".
function warnmsg () {
  local text="${1:-Unknown Warning}"
  msg "\\nWarning: ${text}\\n"
  WarningCount+=1
}

# bail: report an error via errmsg() and exit the script. Meant only for
# critical failures early in processing that prevent gathering any output.
#   $1 - error text (default "Unknown Error")
#   $2 - exit status (default 1)
function bail () {
  local text="${1:-Unknown Error}"
  local status="${2:-1}"
  errmsg "$text"
  exit "$status"
}
# run: execute a command line via eval, capturing stdout and stderr together.
#   $1 - command line to run (default "echo")
#   $2 - optional description, printed before the command
#   $3 - 1 (default) to print the captured output followed by an
#        "EXIT_CODE: <n>" line; any other value discards the output
# Returns the exit status of the command.
function run () {
  local command="${1:-echo}"
  local description="${2:-}"
  local -i verbose="${3:-1}"
  local capture=
  local -i status=

  capture=$(mktemp)

  if [[ -n "$description" ]]; then
    msg "$description"
  fi
  msg "Executing: $command"

  # The command is deliberately left unquoted: it is word-split before eval.
  # shellcheck disable=SC2086
  eval $command > "$capture" 2>&1
  status=$?

  if [[ "$verbose" -eq 1 ]]; then
    printf 'EXIT_CODE: %s\n' "$status" >> "$capture"
    cat "$capture"
  fi

  /bin/rm -f "$capture"
  return $status
}
#------------------------------------------------------------------------------
# Function: usage (required function)
#
# Input:
# $1 - style, either -h (for short form) or -man (for man-page like format).
# The default is -h.
#
# $2 - error message (optional). Specify this if usage() is called due to
# user error, in which case the given message displayed first, followed by the
# standard usage message (short or long depending on $1). If displaying an
# error, usually $1 should be -h so that the longer usage message doesn't
# obscure the error message.
#
# Sample Usage:
# usage
# usage -man
# usage -h "Incorrect command line usage."
#
# This last example generates a usage error message followed by the short
# '-h' usage summary.
#------------------------------------------------------------------------------
function usage
{
  # $1 selects the help style (-h or -man); $2 is an optional usage-error text.
  declare style=${1:--h}
  declare errorMessage=${2:-Unset}

  # A usage error goes to stderr, ahead of the help text on stdout.
  if [[ $errorMessage != Unset ]]; then
    msg "\\n\\nUsage Error:\\n\\n$errorMessage\\n\\n" >&2
  fi

  msg "USAGE for sdp_health_check.sh v$Version:
sdp_health_check.sh [-D]
or
sdp_health_check.sh -h|-man
"

  if [[ $style == -man ]]; then
    msg "DESCRIPTION:
This script does a health check of the SDP. It generates a
report log, which can be emailed to support@perforce.com.
It identifies SDP instances and reports on general SDP health.
It must be run as the OS user who owns the $HC_SDP_P4CBIN
directory. This should be the user account which runs the
p4d process, and which owns the /p4/common/bin directory
(often 'perforce' or 'p4admin').
Characteristics of this script:
* It is always safe to run. It does only analysis and reporting.
* It does only fast checks, and has no interactive prompts.
Some log files are captured such as checkpoint.log, but not
potentially large ones such as the p4d server log.
* It requires no command line arguments.
* It works for any and all UNIX/Linux SDP version since 2007.
Assumptions:
* The SDP has always used $HC_SDP_ENV as the shell
environment file. This is consistent across all SDP versions.
OPTIONS:
-D Set extreme debugging verbosity.
HELP OPTIONS:
-h Display short help message
-man Display man-style help message
EXAMPLES:
This script is typically called with no arguments.
LOGGING:
This script generates a log file and also displays it to stdout at the
end of processing. By default, the log is:
/tmp/sdp_health_check.<datestamp>.log
or
/tmp/sdp_health_check.log
The exception is usage errors, which result in an error being sent to
stderr followed by usage info on stdout, followed by an immediate exit.
EXIT CODES:
An exit code of 0 indicates no errors or warnings were encountered.
An exit code of 1 indicates one or more errors were encountered.
An exit code of 2 indicates warnings were encountered, but no errors.
Usage errors and the help options (-h, -man) also exit with 1.
"
  fi

  # Every path through usage() ends the script with status 1.
  exit 1
}
#------------------------------------------------------------------------------
# Function: do_341_check ()
# Checks whether known-broken script or library versions tied to SDP-341
# are installed in $HC_SDP_P4CBIN.
# Returns 0 if no known-broken version was found, 1 otherwise.
function do_341_check () {
  declare -i returnCode=0
  declare atRiskScripts=
  # 'declare' inside a function makes these local, so the path variables do
  # not leak into the global scope.
  declare Lib1="$HC_SDP_P4CBIN/backup_functions.sh"
  declare Lib2="$HC_SDP_P4CBIN/sdp_functions.sh"
  declare Script1="$HC_SDP_P4CBIN/recreate_db_checkpoint.sh"
  declare Script2="$HC_SDP_P4CBIN/recreate_db_sync_replica.sh"

  msg "$H2\\nChecking for susceptibility to SDP-341.\\n"

  # Each script is classified by a marker string: known-safe, known-broken,
  # or unknown. File arguments are quoted so paths with spaces still work.
  if [[ -r "$Script1" ]]; then
    if grep -q 'SAVEDIR since we know' "$Script1"; then
      msg "Verified: Known-safe version exists for: $Script1"
    elif grep -q 'save directory since we know' "$Script1"; then
      errmsg "At risk due to existing known-broken version of script $Script1\\nSuggested actions and background information are provided here: $SDP_341_URL"
      returnCode=1
    else
      warnmsg "Unknown version of this script exists: $Script1"
    fi
  else
    msg "Verified: This does not exist: $Script1"
  fi

  if [[ -r "$Script2" ]]; then
    if grep -q 'rm -f rdb.lbr' "$Script2"; then
      msg "Verified: Known-safe version exists for: $Script2"
    elif grep -q 'RsyncCmd=' "$Script2"; then
      errmsg "At risk due to existing known-broken version of script $Script2\\nSuggested actions and background information are provided here: $SDP_341_URL"
      returnCode=1
    else
      warnmsg "Unknown version of this script exists: $Script2"
    fi
  else
    msg "Verified: This does not exist: $Script2"
  fi

  if [[ -r "$Lib1" ]]; then
    if grep -q 'OLDBLNK' "$Lib1"; then
      # Look for callers in the current directory. This relies on the cd to
      # $HC_SDP_P4CBIN done earlier by do_preflight_checks().
      atRiskScripts="$(grep -l switch_db_files ./* 2>/dev/null|grep -v backup_functions.sh)"
      if [[ -n "$atRiskScripts" ]]; then
        errmsg "A library file contains a known-broken version of switch_db_files(), that may be called by other scripts.\\nLibrary file is: $Lib1\\nCalling scripts are:\\n$(echo "$atRiskScripts"|tr ' ' '\n')\\n"
        returnCode=1
      else
        warnmsg "A library file contains a known-broken function, switch_db_files(). However, that function is not called by any scripts in $HC_SDP_P4CBIN, and thus is not an issue. An upgrade of the SDP will replace the library entirely, but it should not be removed now.\\n"
      fi
    else
      msg "Verified: Known-safe version exists for: $Lib1"
    fi
  elif [[ -r "$Lib2" ]]; then
    msg "Verified: Known-safe version exists for: $Lib2"
  else
    errmsg "Missing files. One of these two files should exist:\\n\\t$Lib1\\nor\\n\\t$Lib2\\n\\nSuggested action: Contact Perforce Support <support@perforce.com> and request an SDP health check.\\n"
  fi

  if [[ "$returnCode" -eq 0 ]]; then
    msg "Verified: There is no susceptibility to SDP-341."
  else
    errmsg "Known-broken version of one or more key scripts detected."
  fi

  return $returnCode
}
#------------------------------------------------------------------------------
# Function: do_preflight_checks ()
# Runs the sanity checks that must pass before any report data is gathered:
#   1. required tools are in the PATH,
#   2. $HC_SDP_P4CBIN can be entered (the working directory is left there),
#   3. the current user ($ThisUser) owns $HC_SDP_P4CBIN.
# Sets the global SDPOwner. Returns 0 if all checks pass, 1 otherwise.
function do_preflight_checks () {
  local exitCode=0
  local tool=
  # awk is listed because the ownership check below pipes 'ls' through it.
  declare ToolsList="date grep id ls awk"

  msg "$H2\\nDoing preflight sanity checks."

  msg "Preflight Check 1: Ensuring basic tools are in the PATH."
  for tool in $ToolsList; do
    if [[ -z "$(command -v "$tool")" ]]; then
      errmsg "Required tool '$tool' not found in PATH."
      exitCode=1
    fi
  done
  # Report every missing tool before giving up.
  [[ $exitCode -ne 0 ]] && return 1
  msg "Verified: Essential tools are in the PATH."

  msg "Preflight Check 2: cd $HC_SDP_P4CBIN"
  if ! cd "$HC_SDP_P4CBIN"; then
    errmsg "Could not cd to: $HC_SDP_P4CBIN"
    return 1
  fi
  msg "Verified: cd works to: $HC_SDP_P4CBIN"

  msg "Preflight Check 3: Checking current user owns $HC_SDP_P4CBIN"
  # The owner is the third field of the long listing of the current directory.
  # shellcheck disable=SC2012
  SDPOwner=$(ls -ld . | awk '{print $3}')
  if [[ "$ThisUser" == "$SDPOwner" ]]; then
    msg "Verified: Current user [$ThisUser] owns $HC_SDP_P4CBIN"
  else
    errmsg "Current user [$ThisUser] does not own $HC_SDP_P4CBIN."
    return 1
  fi

  return 0
}
#------------------------------------------------------------------------------
# Function: get_sdp_instances ()
#
# Get the list of SDP instances after doing some preliminary sanity
# checks.
# Sets the global SDPInstanceList to a space-separated list of the entries
# under /p4 that have a readable root/db.counters file. Leaves the working
# directory at /p4, and bails out if /p4 cannot be entered.
function get_sdp_instances () {
  local e=
  SDPInstanceList=

  cd /p4 || bail "Could not cd to /p4."

  for e in *; do
    if [[ -r "/p4/$e/root/db.counters" ]]; then
      SDPInstanceList+=" $e"
    fi
  done

  # Remove the separator that precedes the first entry. The earlier
  # 'echo "$SDPInstanceList"' did not trim because its argument was quoted.
  SDPInstanceList="${SDPInstanceList# }"
}
#------------------------------------------------------------------------------
# Function: check_sdp_instance ()
# This checks various things about a given SDP instance.
# Gathers and checks information for one SDP instance: login, service
# definition, server info and configuration, journals, server specs, shell
# environment, version counters, directory listings, small log files, and
# structure verification.
#   $1 - SDP instance name (required)
# Returns 1 if called without an instance. Problems found are counted
# through errmsg() and warnmsg().
function check_sdp_instance () {
  local instance="${1:-UnsetSDPInstance}"
  local -i startErrorCount="$ErrorCount"
  # Scratch variables are local so calls leave no stray globals behind.
  local svcName=
  local server=
  local d=
  local log=
  local logPath=

  if [[ "$instance" == "UnsetSDPInstance" ]]; then
    errmsg "Invalid call to check_sdp_instance(), no instance parameter. Skipping further checks for this instance."
    return 1
  fi

  if [[ -x "${HC_SDP_P4LOGIN}" ]]; then
    run "$HC_SDP_MRUN $instance $HC_SDP_P4LOGIN $instance -v" \
      "$H2\\nDoing 'p4login' for instance $instance." ||\
      errmsg "p4 login reported an error."
  else
    warnmsg "No '/p4/common/bin/p4login' script found."
  fi

  # Show the systemd unit if systemctl exists, otherwise list SysV init files.
  if [[ -n "$(command -v systemctl)" ]]; then
    svcName="p4d_${instance}"
    run "systemctl cat $svcName" "Showing systemd service file." ||\
      warnmsg "Could not cat Systemd unit file for service: $svcName."
  else
    run "ls -lArt /etc/init.d/p4*" "Listing SysV p4 init files." ||\
      errmsg "Could not list SysV init files."
  fi

  run "$HC_SDP_MRUN $instance p4 -ztag info" \
    "$H2\\nChecking p4 -ztag info for instance $instance." ||\
    errmsg "p4 info did not respond."

  run "$HC_SDP_MRUN $instance p4 configure show allservers" \
    "$H2\\nChecking p4 configure show allservers." ||\
    errmsg "p4 configure show allservers reported an error."

  run "$HC_SDP_MRUN $instance p4 servers -J" \
    "$H2\\nChecking p4 servers -J" ||\
    errmsg "p4 servers -J reported an error."

  if [[ -e "/p4/$instance/root/server.id" ]]; then
    run "cat /p4/$instance/root/server.id" \
      "Contents of /p4/$instance/root/server.id:" ||\
      errmsg "Could not display contents of server.id file."
  else
    errmsg "Expected ServerID file is missing: /p4/$instance/root/server.id"
  fi

  if [[ -e "/p4/$instance/bin/p4d_$instance" ]]; then
    run "/p4/$instance/bin/p4d_$instance -r /p4/$instance/root/ -cshow" \
      "Contents of /p4/$instance/root -cshow:" ||\
      errmsg "Could not display contents of -cshow."
  else
    errmsg "Expected p4d_$instance or /p4/$instance/root is missing: /p4/$instance/bin/p4d_$instance or /p4/$instance/root/"
  fi

  run "$HC_SDP_MRUN $instance p4 journals -m 100" \
    "$H2\\nChecking journal data:" ||\
    errmsg "p4 journals -m 100 reported an error."

  # Show the server spec for every ServerID reported by 'p4 servers'.
  for server in $("$HC_SDP_MRUN" "$instance" p4 -ztag -F %ServerID% servers); do
    run "$HC_SDP_MRUN $instance p4 server -o $server" \
      "$H2\\nChecking p4 server spec for server $server" ||\
      errmsg "p4 server -o $server reported an error."
  done

  run "$HC_SDP_MRUN $instance env" \
    "$H2\\nChecking shell environment for instance $instance." ||\
    errmsg "Shell environment did not load for instance $instance"

  run "$HC_SDP_MRUN $instance p4 counters | grep '^SDP_'" \
    "$H2\\nChecking SDP version counters for instance $instance." ||\
    errmsg "Error checking SDP version counters for instance $instance."

  # In the directory values, always include the trailing '/', or the 'ls'
  # may not give desired results.
  msg "${H1}\\nListing key directories."
  DirList="/p4/ /p4/$instance/ /p4/$instance/bin/ /p4/$instance/logs/ /p4/$instance/checkpoints/"

  # For edges and some replicas, a checkpoints.* directory may exist; if so
  # add it to the list of directories checked. A glob is used rather than
  # parsing 'ls' output; an unmatched pattern stays literal and is skipped.
  for d in /p4/"$instance"/checkpoints.*; do
    [[ -e "$d" || -L "$d" ]] || continue
    DirList+=" $d/"
  done

  for d in $DirList; do
    run "ls -lArt $d" "Listing: $d" ||\
      errmsg "Failed to list dir: $d"
  done

  # Good to list P4ROOT and offline_db in db size order
  DirList="/p4/$instance/root/ /p4/$instance/offline_db/"
  for d in $DirList; do
    run "ls -lAhS $d" "Listing: $d" ||\
      errmsg "Failed to list dir: $d"
  done

  msg "${H1}\\nListing small log files."
  for log in "${SmallLogFiles[@]}"; do
    logPath="/p4/$instance/logs/$log"
    if [[ -e "$logPath" ]]; then
      if run "cat $logPath" "$H2\\nCapturing contents of log file $logPath:"; then
        msg "\\n=== END contents of log $logPath ===\\n"
        # After catting small log files, check to see if they have error
        # messages in known formats. The die() function in the SDP
        # backup_functions.sh library, used in several SDP scripts,
        # writes critical errors with ': ERROR!!!'. Other scripts
        # report '^Error:'. This regex avoids false positives with scripts
        # that have the word error, e.g. "NO ERRORS".
        if grep -q -E '(^Error:|: ERROR\!\!\!)' "$logPath"; then
          errmsg "Found one or more errors in: $logPath"
        fi
      else
        errmsg "Error showing contents of log: $logPath"
      fi
    else
      msg "Log $log does not exist here."
    fi
  done

  msg "$H2\\nChecking structure."
  if [[ -L "/p4/$instance" ]]; then
    errmsg "Instance $instance uses old-style symlink structure and should be upgraded."
  fi

  if [[ -x "$HC_SDP_VSDP" ]]; then
    run "$HC_SDP_VSDP $instance -L off" "${H1}\\nRunning verify_sdp.sh" ||\
      errmsg "SDP Verify failed for instance $instance."
  else
    msg "Note: $HC_SDP_VSDP is not available to execute."
  fi

  # The instance is OK only if no error was counted during this call.
  if [[ "$ErrorCount" -eq "$startErrorCount" ]]; then
    msg "SDP instance $instance seems OK."
  else
    errmsg "SDP instance $instance has issues - see above."
  fi
}
#------------------------------------------------------------------------------
# Function: terminate
# Exit handler: prints the error and warning totals, the resulting exit
# code and the log file location, then exits with that code.
function terminate
{
  # Remove the handlers so the exit below does not trigger the trap again.
  trap - EXIT SIGINT SIGTERM

  declare -i overallExitCode=0

  msg "$H1\\nErrors detected: $ErrorCount"
  msg "Warnings detected: $WarningCount"

  # Exit code mapping:
  #   0 - all clean
  #   1 - errors and maybe warnings
  #   2 - warnings, but no errors.
  if [[ "$ErrorCount" -ne 0 ]]; then
    overallExitCode=1
  elif [[ "$WarningCount" -ne 0 ]]; then
    overallExitCode=2
  fi

  msg "$ThisScript: EXITCODE: $overallExitCode"
  msg "$H2\\nLog file: $Log"

  exit "$overallExitCode"
}
#==============================================================================
# Command Line Processing
# shiftArgs is the number of extra arguments an option consumes. None of the
# options handled below sets it.
declare -i shiftArgs=0

# nounset is off while the positional parameters are examined, and is turned
# on for the rest of the script once parsing is done.
set +u
while [[ $# -gt 0 ]]; do
  case $1 in
    -h) usage -h ;;
    -man) usage -man ;;
    -D) set -x ;; # Debug; use 'set -x' mode.
    -*) usage -h "Unknown command line flag ($1)." ;;
    *) usage -h "Unknown command line fragment ($1)." ;;
  esac

  # Drop the argument just handled, then any extra arguments it claimed.
  shift
  while [[ $shiftArgs -gt 0 ]]; do
    if [[ $# -eq 0 ]]; then
      usage -h "Incorrect number of arguments."
    fi
    shiftArgs=$((shiftArgs - 1))
    shift
  done
done
set -u
#==============================================================================
# Main Program
# Route every exit path, including interrupts, through terminate().
trap terminate EXIT SIGINT SIGTERM

# From here on, stdout is copied to the log file via tee, and stderr follows
# stdout.
exec > >(tee "${Log}")
exec 2>&1

# Identify the current user, preferring 'id' over the USER variable.
if command -v id > /dev/null 2>&1; then
  ThisUser="$(id -n -u)"
else
  ThisUser="$USER"
fi

msg "$ThisScript v$Version as $ThisUser@${HOSTNAME%%.*}\\nStarting verification at $(date +'%a %Y-%m-%d %H:%M:%S %Z')."
msg "This log file is: $Log"

# A preflight failure is fatal; an SDP-341 finding is only counted.
if ! do_preflight_checks; then
  bail "Preflight checks failed. Aborting."
fi
if ! do_341_check; then
  errmsg "Failed check for SDP-341."
fi

get_sdp_instances
if [[ -z "$SDPInstanceList" ]]; then
  errmsg "No SDP instances detected."
else
  msg "$H2\\nList of SDP Instances to verify: $SDPInstanceList"
fi

# Check each instance, and queue its p4_<instance>.vars file for the key
# file checks.
msg "$H1\\nChecking each SDP instance."
for i in $SDPInstanceList; do
  check_sdp_instance "$i"
  KeyFiles[KeyFileCount]="$HC_SDP_P4CCFG/p4_${i}.vars"
  KeyFileCount+=1
done
# General host-level checks: OS identity, crontab, p4 processes, the SDP
# bin directory listing, and disk usage.
msg "$H1\\nGeneral Checks."
msg "OS Info:"
run "uname -a" "OS uname info for UNIX/Linux"

if [[ -r /etc/os-release ]]; then
  run "cat /etc/os-release" "$H2\\nList Linux distribution/version:" ||\
    errmsg "Could not read /etc/os-release file."
fi

# USER may be unset; fall back to ThisUser so that 'set -u' does not abort
# the script here.
run "crontab -l" "$H2\\nCrontab for ${USER:-$ThisUser}:" ||\
  errmsg "Failed to gather crontab for ${USER:-$ThisUser}."

run "ps -ef | grep p4" "$H2\\nChecking currently running p4 processes:" ||\
  errmsg "Error checking processes."

# Report a failed listing through errmsg(). Previously the bare message
# string followed '||' and would have been executed as a command.
run "ls -lArt $HC_SDP_P4CBIN/" "$H2\\nListing files in $HC_SDP_P4CBIN:" ||\
  errmsg "Error listing files in: $HC_SDP_P4CBIN"

run "df -h" "$H2\\nChecking volumes and storage available."
# Capture the contents of each key file, counting an error for any file
# that is unreadable or cannot be displayed.
msg "$H1\\nKey File Checks."
for file in ${KeyFiles[*]}; do
  if [[ ! -r "$file" ]]; then
    errmsg "Expected file is missing: $file"
    continue
  fi

  if ! run "cat $file" "$H2\\nCapturing contents of file $file:"; then
    errmsg "Error showing contents of file: $file"
    continue
  fi

  msg "\\n=== END contents of $file ===\\n"
done
msg "$H1\\nSDP Version Checks."

# /p4/sdp/Version may or may not exist, depending on how the SDP was
# installed on this machine, so its absence is only noted.
# NOTE(review): the earlier comment said the file is "always correctly
# copied to replicas"; that may have been meant as "not always" - confirm.
if [[ ! -r /p4/sdp/Version ]]; then
  msg "File /p4/sdp/Version did not exist."
elif ! run "cat /p4/sdp/Version" "Version from /p4/sdp/Version:"; then
  errmsg "Could not cat /p4/sdp/Version file."
fi

# A missing SDP_VERSION setting in p4_vars is considered an error.
if ! grep -q 'SDP_VERSION=' "$HC_SDP_ENV"; then
  errmsg "No SDP_VERSION defined in $HC_SDP_ENV."
elif ! run "grep 'SDP_VERSION=' $HC_SDP_ENV" "Version from $HC_SDP_ENV:"; then
  errmsg "Failed to run: grep 'SDP_VERSION=' $HC_SDP_ENV"
fi
# Final summary line, chosen from the error and warning totals.
msg "$H1\\nSummary:"
if [[ "$ErrorCount" -ne 0 ]]; then
  msg "Encountered $ErrorCount errors and $WarningCount warnings."
elif [[ "$WarningCount" -ne 0 ]]; then
  msg "Encountered no errors and $WarningCount warnings."
else
  msg "No errors or warnings detected."
fi

msg "\\nIf you have any questions about the output from this script, contact support@perforce.com"

# This exit fires the EXIT trap; terminate() sets the real exit code.
exit 0
#------------------------------------------------------------------------------
# Change history (kept as comments so the file remains valid shell):
#
# | # | Change | User | Description | Committed |
# |---|---|---|---|---|
# | #8 | 32135 | C. Thomas Tyler | Released SDP 2025.1.32133 (2025/10/29). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'. | |
# | #7 | 31566 | C. Thomas Tyler | Released SDP 2024.2.31564 (2025/05/14). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'. | |
# | #6 | 31077 | C. Thomas Tyler | Released SDP 2024.2.31075 (2024/12/20). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'. | |
# | #5 | 30915 | C. Thomas Tyler | Released SDP 2024.1.30913 (2024/11/20). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'. | |
# | #4 | 30388 | C. Thomas Tyler | Released SDP 2024.1.30385 (2024/06/11). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'. | |
# | #3 | 30297 | C. Thomas Tyler | Released SDP 2023.2.30295 (2024/05/08). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'. | |
# | #2 | 30043 | C. Thomas Tyler | Released SDP 2023.2.30041 (2023/12/22). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'. | |
# | #1 | 29891 | C. Thomas Tyler | Released SDP 2023.1.29699 (2023/07/11). Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'. | |
#
# //guest/perforce_software/sdp/dev/Server/Unix/p4/common/bin/sdp_health_check.sh
#
# | # | Change | User | Description | Committed |
# |---|---|---|---|---|
# | #1 | 29844 | C. Thomas Tyler | Added sdp_health_check to SDP package. Updated docs in Guide and Release Notes to reflect this change. Added more docs for this in the SDP Guide. #review-29845 @vkanczes | |