#!/bin/bash
#==============================================================================
# Copyright and license info is available in the LICENSE file included with
# the Server Deployment Package (SDP), and also available online:
# https://swarm.workshop.perforce.com/projects/perforce-software-sdp/view/main/LICENSE
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
# sdp_health_check.sh
#
#==============================================================================
# Declarations and Environment
# Prefix global vars with HC_ to avoid name collisions.
# Name of this script as invoked, with any leading directory stripped.
declare ThisScript="${0##*/}"
declare Version=1.4.1
# ThisUser is filled in by the main program; Log is set at the end of this
# declarations section.
declare ThisUser=
declare Log=
# Fixed SDP locations. These are deliberately hard-coded rather than sourced
# from the SDP environment, since this script checks that environment.
declare HC_SDP_P4CBIN="/p4/common/bin"
declare HC_SDP_P4CCFG="/p4/common/config"
declare HC_SDP_ENV="$HC_SDP_P4CBIN/p4_vars"
declare HC_SDP_MRUN="$HC_SDP_P4CBIN/p4master_run"
declare HC_SDP_VSDP="$HC_SDP_P4CBIN/verify_sdp.sh"
declare SDPInstanceList=
declare SDP_341_URL="https://swarm.workshop.perforce.com/jobs/SDP-341"
declare SDPOwner=
# Running totals, incremented by errmsg() and warnmsg(); the integer
# attribute is what makes 'ErrorCount+=1' an arithmetic add.
declare -i ErrorCount=0
declare -i WarningCount=0
# Files whose contents are captured in the report. KeyFileCount is always the
# next free index in KeyFiles.
declare -a KeyFiles
declare -i KeyFileCount=0
KeyFiles[$KeyFileCount]="$HC_SDP_P4CBIN/p4_vars"
KeyFileCount+=1
KeyFiles[$KeyFileCount]="$HC_SDP_P4CBIN/backup_functions.sh"
KeyFileCount+=1
# Section separators used in the report output.
declare H1="=============================================================================="
declare H2="------------------------------------------------------------------------------"
# Name the log with a YYYYMMDD-HHMMSS stamp when 'date' is available.
# '%S' (two-digit seconds) is required here; '%s' is the epoch-seconds
# conversion and would append a ten-digit number instead.
if [[ -n "$(command -v date)" ]]; then
   Log=/tmp/sdp_health_check.$(date +'%Y%m%d-%H%M%S').log
else
   Log=/tmp/sdp_health_check.log
fi
#==============================================================================
# Local Functions
# Note: This script does not use SDP library files, as its purpose is to
# verify the integrity of an SDP installation. Thus, it has its own
# self-contained versions of some functions that would normally be
# sourced in from files like /p4/common/lib/libcore.sh.
# Micro-functions, one-liners used to avoid external dependencies.
# msg: print all arguments joined into one line, with backslash escapes
# (such as \n) interpreted.
function msg () {
   echo -e "$*"
}

# errmsg: print "Error: <text>" surrounded by blank lines and add one to the
# global ErrorCount. $1 is the text (default: "Unknown Error").
function errmsg () {
   local text="${1:-Unknown Error}"
   msg "\\nError: ${text}\\n"
   ErrorCount+=1
}

# warnmsg: print "Warning: <text>" surrounded by blank lines and add one to
# the global WarningCount. $1 is the text (default: "Unknown Warning").
function warnmsg () {
   local text="${1:-Unknown Warning}"
   msg "\\nWarning: ${text}\\n"
   WarningCount+=1
}

# bail: report an error via errmsg, then exit the script.
# $1 is the error text (default: "Unknown Error"); $2 is the exit code
# (default: 1).
function bail () {
   local reason="${1:-Unknown Error}"
   errmsg "$reason"
   exit "${2:-1}"
}
# run: execute a command line through 'eval', capturing stdout and stderr in
# a temporary file.
#
# Input:
# $1 - command line to execute (default: echo). It is passed to eval
#      unquoted, so pipelines and redirections inside it are honored.
# $2 - optional description, displayed before the command runs.
# $3 - set to 1 (the default) to display the captured output followed by an
#      "EXIT_CODE: <n>" line; any other value discards the output.
#
# Returns the exit code of the evaluated command.
function run () {
   local commandLine="${1:-echo}"
   local description="${2:-}"
   local -i display="${3:-1}"
   local -i status=0
   local capture=

   capture=$(mktemp)

   if [[ -n "$description" ]]; then
      msg "$description"
   fi
   msg "Executing: $commandLine"

   # shellcheck disable=SC2086
   eval $commandLine > "$capture" 2>&1
   status=$?

   if (( display == 1 )); then
      echo "EXIT_CODE: $status" >> "$capture"
      cat "$capture"
   fi

   /bin/rm -f "$capture"
   return "$status"
}
#------------------------------------------------------------------------------
# Function: usage (required function)
#
# Displays usage information and then exits the script with exit code 1.
#
# Input:
# $1 - style, either -h (for short form) or -man (for man-page like format).
# The default is -h.
#
# $2 - error message (optional). Specify this if usage() is called due to
# user error, in which case the given message is displayed first (on stderr),
# followed by the standard usage message (short or long depending on $1). If
# displaying an error, usually $1 should be -h so that the longer usage
# message doesn't obscure the error message.
#
# Sample Usage:
# usage
# usage -man
# usage -h "Incorrect command line usage."
#
# This last example generates a usage error message followed by the short
# '-h' usage summary.
#------------------------------------------------------------------------------
function usage
{
   declare style=${1:--h}
   # "Unset" is a sentinel meaning that no error message was supplied.
   declare errorMessage=${2:-Unset}

   if [[ $errorMessage != Unset ]]; then
      echo -e "\n\nUsage Error:\n\n$errorMessage\n\n" >&2
   fi

   echo "USAGE for sdp_health_check.sh v$Version:
sdp_health_check.sh [-D]
or
sdp_health_check.sh -h|-man
"
   if [[ $style == -man ]]; then
      echo -e "DESCRIPTION:
This script does a health check of the SDP, and generates a
short report log, which should be emailed to support@perforce.com
unless it indicates an exit code of 0. It identifies SDP instances
and reports on whether the installation is at risk due to a bug,
SDP-341.
For details, see: $SDP_341_URL
It must be run as the OS user who owns the $HC_SDP_P4CBIN
directory. This should be the user account which runs the
p4d process.
Characteristics of this script:
* It is always safe to run. It does only analysis and reporting.
* It does only fast checks, and has no interactive prompts.
* It does only checks that can be done regardless of whether p4d is
running.
* It requires no command line arguments.
* It works for any SDP version since 2007.
* Exit codes are as follows:
0 - indicates no issues or risks are detected.
1 - indicates errors are detected, and maybe also warnings.
2 - indicates warnings are detected, but no errors.
Assumptions:
* The SDP has always used $HC_SDP_ENV as the shell
environment file. This is consistent across all SDP versions.
OPTIONS:
-D Set extreme debugging verbosity.
HELP OPTIONS:
-h Display short help message
-man Display man-style help message
EXAMPLES:
This script is typically called with no arguments.
LOGGING:
This script generates a log file and also displays it to stdout at the
end of processing. By default, the log is:
/tmp/sdp_health_check.<datestamp>.log
or
/tmp/sdp_health_check.log
The exception is usage errors, which result in an error being sent to
stderr followed by usage info on stdout, followed by an immediate exit.
EXIT CODES:
An exit code of 0 indicates no errors or warnings were encountered.
"
   fi

   exit 1
}
#------------------------------------------------------------------------------
# Function: do_341_check ()
#
# Checks whether this installation is susceptible to SDP-341 by looking for
# known marker strings in specific scripts and libraries under $HC_SDP_P4CBIN.
#
# Each of the two recreate_db_* scripts is reported as known-safe,
# known-broken (error), unknown version (warning), or absent. If
# backup_functions.sh contains the 'OLDBLNK' marker, it is an error when any
# other file in $HC_SDP_P4CBIN references switch_db_files, and a warning
# otherwise.
#
# Globals read: HC_SDP_P4CBIN, SDP_341_URL, H2.
# Globals changed: ErrorCount and WarningCount (via errmsg/warnmsg).
#
# Returns 0 if no known-broken script or library usage was found, 1 otherwise.
# Note that a missing library pair is reported as an error but does not change
# the return code.
function do_341_check () {
   local -i returnCode=0
   local atRiskScripts=
   local Lib1="$HC_SDP_P4CBIN/backup_functions.sh"
   local Lib2="$HC_SDP_P4CBIN/sdp_functions.sh"
   local Script1="$HC_SDP_P4CBIN/recreate_db_checkpoint.sh"
   local Script2="$HC_SDP_P4CBIN/recreate_db_sync_replica.sh"

   msg "$H2\\nChecking for susceptibility to SDP-341.\\n"

   if [[ -r "$Script1" ]]; then
      if grep -q 'SAVEDIR since we know' "$Script1"; then
         msg "Verified: Known-safe version exists for: $Script1"
      elif grep -q 'save directory since we know' "$Script1"; then
         errmsg "At risk due to existing known-broken version of script $Script1\\nSuggested actions and background information are provided here: $SDP_341_URL"
         returnCode=1
      else
         warnmsg "Unknown version of this script exists: $Script1"
      fi
   else
      msg "Verified: This does not exist: $Script1"
   fi

   if [[ -r "$Script2" ]]; then
      if grep -q 'rm -f rdb.lbr' "$Script2"; then
         msg "Verified: Known-safe version exists for: $Script2"
      elif grep -q 'RsyncCmd=' "$Script2"; then
         errmsg "At risk due to existing known-broken version of script $Script2\\nSuggested actions and background information are provided here: $SDP_341_URL"
         returnCode=1
      else
         warnmsg "Unknown version of this script exists: $Script2"
      fi
   else
      msg "Verified: This does not exist: $Script2"
   fi

   if [[ -r "$Lib1" ]]; then
      if grep -q 'OLDBLNK' "$Lib1"; then
         # Scan $HC_SDP_P4CBIN explicitly rather than './*', so the result
         # does not depend on the caller's current directory. Errors from
         # grep (e.g. on subdirectories) are intentionally discarded.
         atRiskScripts="$(grep -l switch_db_files "$HC_SDP_P4CBIN"/* 2>/dev/null | grep -v backup_functions.sh)"
         if [[ -n "$atRiskScripts" ]]; then
            errmsg "A library file contains a known-broken function, switch_db_files(), that is called by scripts.\\nLibrary file is: $Lib1\\nCalling scripts are:\\n$atRiskScripts\\n"
            returnCode=1
         else
            warnmsg "A library file contains a known-broken function, switch_db_files(). However, that function is not called by any scripts in $HC_SDP_P4CBIN, and thus is not an issue. An upgrade of the SDP will replace the library entirely, but it should not be removed now.\\n"
         fi
      else
         msg "Verified: Known-safe version exists for: $Lib1"
      fi
   elif [[ -r "$Lib2" ]]; then
      msg "Verified: Known-safe version exists for: $Lib2"
   else
      errmsg "Missing files. One of these two files should exist:\\n\\t$Lib1\\nor\\n\\t$Lib2\\n\\nSuggested action: Contact Perforce Support <support@perforce.com> and request an SDP health check.\\n"
   fi

   if [[ "$returnCode" -eq 0 ]]; then
      msg "Verified: There is no susceptibility to SDP-341."
   else
      errmsg "Known-broken version of one or more key scripts detected."
   fi

   return $returnCode
}
#------------------------------------------------------------------------------
# Function: do_preflight_checks ()
#
# Runs three sanity checks, stopping at the first one that fails:
#   1. The basic tools this script relies on are found via 'command -v'.
#   2. It is possible to cd to $HC_SDP_P4CBIN (the shell stays there).
#   3. The owner of that directory, stored in the global SDPOwner, matches
#      $ThisUser.
#
# Returns 0 if all checks pass, 1 otherwise.
function do_preflight_checks () {
   local missingTools=0
   local toolPath=
   declare requiredTools="date ls grep awk id"

   msg "$H2\\nDoing preflight sanity checks."

   msg "Preflight Check 1: Ensuring basic tools are in the PATH."
   for tool in $requiredTools; do
      toolPath="$(command -v "$tool")"
      [[ -n "$toolPath" ]] && continue
      errmsg "Required tool '$tool' not found in PATH."
      missingTools=1
   done
   if [[ "$missingTools" -ne 0 ]]; then
      return 1
   fi
   msg "Verified: Essential tools are in the PATH."

   msg "Preflight Check 2: cd $HC_SDP_P4CBIN"
   cd "$HC_SDP_P4CBIN" || {
      errmsg "Could not cd to: $HC_SDP_P4CBIN"
      return 1
   }
   msg "Verified: cd works to: $HC_SDP_P4CBIN"

   msg "Preflight Check 3: Checking current user owns $HC_SDP_P4CBIN"
   # The owner is taken from the third column of the long listing of the
   # current directory.
   # shellcheck disable=SC2012
   SDPOwner=$(ls -ld . | awk '{print $3}')
   if [[ "$ThisUser" != "$SDPOwner" ]]; then
      errmsg "Current user [$ThisUser] does not own $HC_SDP_P4CBIN."
      return 1
   fi
   msg "Verified: Current user [$ThisUser] owns $HC_SDP_P4CBIN"

   return 0
}
#------------------------------------------------------------------------------
# Function: get_sdp_instances ()
#
# Builds the list of SDP instances. An entry in the SDP root directory counts
# as an instance when it contains a readable root/db.counters file.
#
# Input:
# $1 - SDP root directory to scan (optional). The default is /p4.
#
# Output:
# Sets the global SDPInstanceList to a space-delimited list of instance
# names with no leading or trailing space, or to an empty string if no
# instances are found. The shell's current directory is left at the SDP root.
#
# Calls bail() and so exits the script if the SDP root cannot be entered.
function get_sdp_instances () {
   local p4Root="${1:-/p4}"
   local e=

   SDPInstanceList=
   cd "$p4Root" || bail "Could not cd to $p4Root."

   # The current directory is now the SDP root, so entries are tested with
   # relative paths.
   for e in *; do
      if [[ -r "$e/root/db.counters" ]]; then
         SDPInstanceList+=" $e"
      fi
   done

   # Trim the leading space left by the append above. A quoted
   # 'echo "$SDPInstanceList"' would preserve it, so use parameter expansion.
   SDPInstanceList="${SDPInstanceList# }"
}
#------------------------------------------------------------------------------
# Function: check_sdp_instance ()
#
# Gathers information about a given SDP instance and reports problems. It
# captures 'p4 info', configurables, server specs, the shell environment, SDP
# version counters and directory listings, flags an instance path that is a
# symlink, and runs verify_sdp.sh when that script is executable.
#
# Input:
# $1 - SDP instance name (required).
#
# Globals read: HC_SDP_MRUN, HC_SDP_VSDP, H2.
# Globals changed: ErrorCount (via errmsg).
#
# Returns 1 if called without an instance name. The closing verdict for the
# instance counts only errors raised during this call, so errors recorded
# before the call do not mark this instance as bad.
function check_sdp_instance () {
   local instance="${1:-UnsetSDPInstance}"
   # Error count on entry, used to detect errors specific to this instance.
   local -i errorsAtStart="$ErrorCount"
   local server=
   local d=

   if [[ "$instance" == "UnsetSDPInstance" ]]; then
      errmsg "Invalid call to check_sdp_instance(), no instance parameter. Skipping further checks for this instance."
      return 1
   fi

   run "$HC_SDP_MRUN $instance p4 -ztag info" \
      "$H2\\nChecking p4 -ztag info for instance $instance." ||\
      errmsg "p4 info did not respond."

   run "$HC_SDP_MRUN $instance p4 configure show allservers" \
      "$H2\\nChecking p4 configure show allservers." ||\
      errmsg "p4 configure show allservers reported an error."

   run "$HC_SDP_MRUN $instance p4 servers -J" \
      "$H2\\nChecking p4 servers -J" ||\
      errmsg "p4 servers -J reported an error."

   # Word splitting of the command output is intended: one word per ServerID.
   for server in $("$HC_SDP_MRUN" "$instance" p4 -ztag -F %ServerID% servers); do
      run "$HC_SDP_MRUN $instance p4 server -o $server" \
         "$H2\\nChecking p4 server spec for server $server" ||\
         errmsg "p4 server -o $server reported an error."
   done

   run "$HC_SDP_MRUN $instance env" \
      "$H2\\nChecking shell environment for instance $instance." ||\
      errmsg "Shell environment did not load for instance $instance"

   run "$HC_SDP_MRUN $instance p4 counters | grep '^SDP_'" \
      "$H2\\nChecking SDP version counters for instance $instance." ||\
      errmsg "Error checking SDP version counters for instance $instance."

   for d in "/p4/" "/p4/$instance/" "/p4/$instance/root/" "/p4/$instance/offline_db/"; do
      run "ls -lart $d" "Listing: $d" ||\
         errmsg "Failed to list dir: $d"
   done

   msg "$H2\\nChecking structure."
   if [[ -L "/p4/$instance" ]]; then
      errmsg "Instance $instance uses old-style symlink structure and should be upgraded."
   fi

   if [[ -x "$HC_SDP_VSDP" ]]; then
      run "$HC_SDP_VSDP $instance -L off" ||\
         errmsg "SDP Verify failed for instance $instance."
   else
      msg "Note: $HC_SDP_VSDP is not available to execute."
   fi

   if [[ "$ErrorCount" -eq "$errorsAtStart" ]]; then
      msg "SDP instance $instance seems OK."
   else
      errmsg "SDP instance $instance has issues - see above."
   fi
}
#------------------------------------------------------------------------------
# Function: terminate
#
# Installed as the handler for EXIT, SIGINT and SIGTERM. Prints the error and
# warning totals and the log file name, then exits with the overall status:
#   0 - no errors and no warnings
#   1 - one or more errors (warnings may also exist)
#   2 - warnings only
function terminate
{
   # Clear the traps first so that the 'exit' below does not re-enter here.
   trap - EXIT SIGINT SIGTERM

   local -i finalStatus=0

   msg "$H1\\nErrors detected: $ErrorCount"
   msg "Warnings detected: $WarningCount"

   # Errors take precedence over warnings.
   if (( ErrorCount != 0 )); then
      finalStatus=1
   elif (( WarningCount != 0 )); then
      finalStatus=2
   fi

   msg "$ThisScript: EXITCODE: $finalStatus"
   msg "$H2\\nLog file: $Log"

   exit "$finalStatus"
}
#==============================================================================
# Command Line Processing
# Number of extra arguments to consume after the current flag. No flag
# handled below takes a value, so this stays at 0.
declare -i shiftArgs=0

# Unset-variable checking is off while the argument list is examined.
set +u
while (( $# > 0 )); do
   case "$1" in
      -h)   usage -h ;;
      -man) usage -man ;;
      -D)   set -x ;; # Debug; use 'set -x' mode.
      -*)   usage -h "Unknown command line flag ($1)." ;;
      *)    usage -h "Unknown command line fragment ($1)." ;;
   esac

   # Drop the flag just handled, then any values that belong to it.
   shift
   while (( shiftArgs > 0 )); do
      if (( $# == 0 )); then
         usage -h "Incorrect number of arguments."
      fi
      shiftArgs=$shiftArgs-1
      shift
   done
done
set -u
#==============================================================================
# Main Program
# From here on, every exit path goes through terminate(), which prints the
# totals and sets the real exit code.
trap terminate EXIT SIGINT SIGTERM

# Start Logging: send stdout through tee into the log file, and fold stderr
# into the same stream.
exec > >(tee "${Log}") 2>&1

# Determine the current user name, preferring 'id' over $USER.
if command -v id > /dev/null; then
   ThisUser="$(id -n -u)"
else
   ThisUser="$USER"
fi

msg "$ThisScript v$Version as $ThisUser@${HOSTNAME%%.*}\\nStarting verification at $(date +'%a %Y-%m-%d %H:%M:%S %Z')."
msg "This log file is: $Log"

# A preflight failure is fatal; an SDP-341 finding is recorded and the run
# continues.
do_preflight_checks || bail "Preflight checks failed. Aborting."
do_341_check || errmsg "Failed check for SDP-341."

get_sdp_instances
if [[ -z "$SDPInstanceList" ]]; then
   errmsg "No SDP instances detected."
else
   msg "$H2\\nList of SDP Instances to verify: $SDPInstanceList"
fi

msg "$H1\\nChecking each SDP instance."
# Word splitting of the space-delimited instance list is intended.
for i in $SDPInstanceList; do
   check_sdp_instance "$i"
   # Queue this instance's config file for the key file capture later on.
   KeyFiles[KeyFileCount]="$HC_SDP_P4CCFG/p4_${i}.vars"
   KeyFileCount=$((KeyFileCount + 1))
done
# General, instance-independent information useful to Support.
msg "$H1\\nGeneral Checks."

# Use $ThisUser (set earlier in the main program) rather than $USER, which
# may be unset in some environments and would abort the script under 'set -u'.
run "crontab -l" "$H2\\nCrontab for $ThisUser:" ||\
   errmsg "Failed to gather crontab for $ThisUser."

run "ps -ef | grep p4" "$H2\\nChecking currently running p4 processes:" ||\
   errmsg "Error checking processes."

# The failure branch must call errmsg; a bare string here would be executed
# as a command name.
run "ls -lrt $HC_SDP_P4CBIN/" "$H2\\nListing files in $HC_SDP_P4CBIN:" ||\
   errmsg "Error listing files."

# The next two captures are informational only; their exit codes are not
# counted as errors.
run "p4 journals -m 100" "$H2\\nChecking journal data:"
run "df -h" "$H2\\nChecking volumes and storage available."
# Capture the contents of each key file: the fixed entries added when
# KeyFiles was declared, plus one config file per detected instance.
msg "$H1\\nKey File Checks."
for file in "${KeyFiles[@]}"; do
   if [[ -r "$file" ]]; then
      if run "cat $file" "$H2\\nCapturing contents of key file $file:"; then
         msg "\\n=== END contents of $file ===\\n"
      else
         # Name the file that actually failed, not a fixed file name.
         errmsg "Error showing contents of $file."
      fi
   else
      errmsg "Expected file is missing: $file"
   fi
done
msg "$H1\\nSDP Version Checks."

# Two optional Version files are displayed when readable; a missing one is
# only noted, not counted as an error:
#   /p4/common/Version - may exist from SDP 2020.1 onward.
#   /p4/sdp/Version    - may exist, depending on how the SDP was installed
#                        on a given machine.
for versionFile in /p4/common/Version /p4/sdp/Version; do
   if [[ ! -r "$versionFile" ]]; then
      msg "File $versionFile did not exist."
      continue
   fi
   run "cat $versionFile" "Version from $versionFile:" ||\
      errmsg "Could not cat $versionFile file."
done

# The SDP_VERSION setting in p4_vars should be there; if not that is
# considered an error.
if ! grep -q 'SDP_VERSION=' "$HC_SDP_ENV"; then
   errmsg "No SDP_VERSION defined in $HC_SDP_ENV."
elif ! run "grep 'SDP_VERSION=' $HC_SDP_ENV" "Version from $HC_SDP_ENV:"; then
   errmsg "Failed to run: grep 'SDP_VERSION=' $HC_SDP_ENV"
fi
# Final summary of the error and warning totals gathered during the run.
msg "$H1\\nSummary:"
if [[ "$ErrorCount" -ne 0 ]]; then
   msg "Encountered $ErrorCount errors and $WarningCount warnings."
elif [[ "$WarningCount" -ne 0 ]]; then
   msg "Encountered no errors and $WarningCount warnings."
else
   msg "No errors or warnings detected."
fi

msg "\\nIf you have any questions about the output from this script, contact support@perforce.com"

# This exit fires the EXIT trap; the 'terminate()' function is where the
# script actually exits and where the final exit code is chosen.
exit 0
| # | Change | User | Description | Committed | |
|---|---|---|---|---|---|
| #34 | 32107 | C. Thomas Tyler | Updated to 1.16.0. | ||
| #33 | 32104 | C. Thomas Tyler | Updated to 1.15.0. | ||
| #32 | 31502 | C. Thomas Tyler |
Modified sdp_health_check.sh to call verify_sdp.sh with new '-csec' option if the SDP is new enough to support that option. |
||
| #31 | 31478 | C. Thomas Tyler | Enhanced health check to capture triggers. | ||
| #30 | 31423 | C. Thomas Tyler | Added latest version from /p4/common/bin. | ||
| #29 | 30827 | C. Thomas Tyler | Added check for LastSDPCheckpoint* counters | ||
| #28 | 30825 | C. Thomas Tyler | Updated to 1.12.0 from SDP package. | ||
| #27 | 30290 | C. Thomas Tyler | Updated to match SDP dev branch version. | ||
| #26 | 29982 | C. Thomas Tyler |
Added support for SDP installations that define P4SUPER. Some SDP environments define the P4SUPER variable that, if set, is distinct from the P4USER (the default super user). If P4SUPER is defined, use it instead of P4USER to run commands that require super access, such as 'p4 configure' or 'p4 journals'. #review-29983 |
||
| #25 | 29546 | C. Thomas Tyler | Just bumped Version for last change. | ||
| #24 | 29545 | Robert Cowham | Check for -cshow output in case user can't login | ||
| #23 | 29490 | Robert Cowham |
List linux distribution List contents of /p4/$instance/bin/ List root files in reverse size order |
||
| #22 | 28616 | C. Thomas Tyler | Corrected issue with a call to older versions of the 'p4login' script. | ||
| #21 | 27841 | C. Thomas Tyler |
Cosmetic/grammar and content corrections to sdp_health_check.sh docs. Non-functional change. |
||
| #20 | 27839 | C. Thomas Tyler |
sdp_health_check.sh v1.7.1: * Moved 'p4 journals -m 100' call to be per-instance, fixing an error for environments where the SDP shell environment is not set and 'p4' cannot be found. This also makes it so the command is called for each instance in multi-instance environments. |
||
| #19 | 27798 | C. Thomas Tyler |
sdp_health_check.sh v1.7.0: Scan the small log files that are captured, such as checkpoint.log, and detect if they report errors. Any such errors are now included in the Summary displayed at the end of processing. This will prevent the summary from reporting that no errors were detected when errors appear in the scanned SDP log files. Note that calling 'verify_sdp.sh -online' directly by this sdp_health_check.sh script is not required, as that is done by the daily_checkpoint.sh (or live_checkpoint.sh) and captured in checkpoint.log. With this change, that output (which was already being captured) is now scanned, with errors now referred to in the summary. #review-27799 @clouie rwillyoung |
||
| #18 | 27778 | C. Thomas Tyler |
sdp_health_check.sh v1.6.0: * Added listing of checkpoints* directories. * Added clarity to error message re: switch_db_files(). * Fixed shellcheck compliance issues. * Removed bogus check for /p4/Version (that was never actually used). |
||
| #17 | 27722 | C. Thomas Tyler |
Refinements to @27712: * Resolved one out-of-date file (verify_sdp.sh). * Added missing adoc file for which HTML file had a change (WorkflowEnforcementTriggers.adoc). * Updated revdate/revnumber in *.adoc files. * Additional content updates in Server/Unix/p4/common/etc/cron.d/ReadMe.md. * Bumped version numbers on scripts with Version= def'n. * Generated HTML, PDF, and doc/gen files: - Most HTML and all PDF are generated using Makefiles that call an AsciiDoc utility. - HTML for Perl scripts is generated with pod2html. - doc/gen/*.man.txt files are generated with .../tools/gen_script_man_pages.sh. #review-27712 |
||
| #16 | 27182 | C. Thomas Tyler |
Fixed typo in output message. No functional impact. |
||
| #15 | 27180 | C. Thomas Tyler | Captured basic systemd or SysV init info. | ||
| #14 | 27069 | C. Thomas Tyler | Removed 'aws' as a required command line utility. | ||
| #13 | 27047 | C. Thomas Tyler |
sdp_health_check.sh v1.4.4: * Adds capture of certain small log files, e.g. checkpoint.log. * Fixed error in error message about missing file. * General enhancements to output format. |
||
| #12 | 27046 | C. Thomas Tyler | sdp_health_check.sh v1.4.3: Added ServerID (server.id) file check. | ||
| #11 | 27045 | C. Thomas Tyler |
sdp_health_check.sh v1.4.2: * Minor doc refinements. * Added 'p4login' for each instance. * Updated location of "Version" file for coming-soon SDP r20.1. * Fixed "carry over" error where one instance reporting bad would make all instances checked after report as bad. |
||
| #10 | 26813 | C. Thomas Tyler | Added a few more checks per coordination with Support. | ||
| #9 | 26812 | C. Thomas Tyler | Added a few more bits to capture. | ||
| #8 | 26804 | C. Thomas Tyler | Added 'crontab -l' and 'df -h' capture. | ||
| #7 | 26802 | C. Thomas Tyler |
Major upgrades to sdp_health_check.sh: * SDP version checks incorporated. * Various bits of info that might be useful for Support to have are gathered, including contents of key SDP files. * If verify_sdp.sh exists, it is called. #review-26803 @robert_cowham @clouie @amo @amoriss @vkanczes @josh |
||
| #6 | 25373 | C. Thomas Tyler |
Added check for broken version of backup_functions.sh, with additional info to check. |
||
| #5 | 24526 | C. Thomas Tyler |
Incorporated review comments. Thanks, Robert! To Do: Add test script. Submitting so I can send the next batch of updates. |
||
| #4 | 24516 | C. Thomas Tyler | Fixed doc typo. | ||
| #3 | 24514 | C. Thomas Tyler | Enhanced health check. | ||
| #2 | 24513 | C. Thomas Tyler | Removed excess code cruft. | ||
| #1 | 24512 | C. Thomas Tyler | Added sdp_health_check.sh script. |