sdp_health_check.sh #1

  • //
  • guest/
  • perforce_software/
  • sdp/
  • main/
  • Server/
  • Unix/
  • p4/
  • common/
  • bin/
  • sdp_health_check.sh
  • View
  • Commits
  • Open Download .zip Download (20 KB)
#!/bin/bash
#==============================================================================
# Copyright and license info is available in the LICENSE file included with
# the Server Deployment Package (SDP), and also available online:
# https://swarm.workshop.perforce.com/projects/perforce-software-sdp/view/main/LICENSE
#------------------------------------------------------------------------------

#------------------------------------------------------------------------------
# sdp_health_check.sh
#
#==============================================================================
# Declarations and Environment

# Script-wide state.  Globals that name SDP paths carry an HC_ prefix to
# avoid name collisions.
declare ThisScript="${0##*/}"
declare Version=1.8.1
declare ThisUser= Log= DirList=

# Fixed SDP locations; everything else is derived from the common bin dir.
declare HC_SDP_P4CBIN="/p4/common/bin"
declare HC_SDP_P4CCFG="/p4/common/config"
declare HC_SDP_ENV="${HC_SDP_P4CBIN}/p4_vars"
declare HC_SDP_MRUN="${HC_SDP_P4CBIN}/p4master_run"
declare HC_SDP_VSDP="${HC_SDP_P4CBIN}/verify_sdp.sh"
declare HC_SDP_P4LOGIN="${HC_SDP_P4CBIN}/p4login"

declare SDPInstanceList=
declare SDP_341_URL="https://swarm.workshop.perforce.com/jobs/SDP-341"

declare SDPOwner=

# Running totals, bumped by errmsg() and warnmsg().
declare -i ErrorCount=0 WarningCount=0

# Files whose contents are captured in the report.  KeyFileCount always holds
# the index of the next free slot, i.e. the number of entries so far.
declare -a KeyFiles=(
   "${HC_SDP_P4CBIN}/p4_vars"
   "${HC_SDP_P4CBIN}/backup_functions.sh"
)
declare -i KeyFileCount="${#KeyFiles[@]}"

# Per-instance log files that are captured in the report.
declare -a SmallLogFiles=(
   "checkpoint.log"
   "sync_replica.log"
   "replica_cleanup.log"
   "replica_status.log"
)
declare -i SmallLogCount="${#SmallLogFiles[@]}"

# Heavy and light horizontal rules used to separate report sections.
declare H1="=============================================================================="
declare H2="------------------------------------------------------------------------------"

# Choose the log file name.  When 'date' is available the name carries a
# YYYYmmdd-HHMMSS stamp; otherwise a fixed name is used.
# The seconds field must be the upper-case '%S'.  Lower-case '%s' means
# seconds since the epoch, which produced a 14-digit time field rather than
# the intended HHMMSS.
if [[ -n "$(command -v date)" ]]; then
   Log=/tmp/sdp_health_check.$(date +'%Y%m%d-%H%M%S').log
else
   Log=/tmp/sdp_health_check.log
fi

#==============================================================================
# Local Functions

# Note: This script does not use SDP library files, as its purpose is to
# verify the integrity of an SDP installation.  Thus, it has its own
# self-contained versions of some functions that would normally be
# sourced in from files like /p4/common/lib/libcore.sh.

# Micro-functions, one-liners used to avoid external dependencies.
# Print all arguments, joined into one string, with backslash escapes such as
# \n and \t interpreted by 'echo -e'.
function msg () {
   echo -e "$*"
}

# Report an error through msg() and add one to the global ErrorCount.
# $1 - error text; "Unknown Error" is shown when it is omitted or empty.
function errmsg () {
   msg "\\nError: ${1:-Unknown Error}\\n"
   ErrorCount+=1
}

# Report a warning through msg() and add one to the global WarningCount.
# $1 - warning text; "Unknown Warning" is shown when it is omitted or empty.
function warnmsg () {
   msg "\\nWarning: ${1:-Unknown Warning}\\n"
   WarningCount+=1
}

# Report a critical error and leave the script.  Meant for failures early in
# processing that prevent this script from gathering any output.
# $1 - error text (default: "Unknown Error").
# $2 - exit code (default: 1).
function bail () {
   errmsg "${1:-Unknown Error}"
   exit "${2:-1}"
}

# Run a command line, capturing stdout and stderr together in a temp file.
#
# $1 - command line, evaluated with 'eval' (default: echo).
# $2 - description, printed before the command when non-empty.
# $3 - 1 (default) prints the captured output followed by an EXIT_CODE line;
#      any other number discards the captured output.
#
# Returns the exit code of the evaluated command.
function run () {
   local commandLine="${1:-echo}"
   local description="${2:-}"
   local -i display="${3:-1}"
   local capture=
   local -i rc=

   capture=$(mktemp)

   if [[ -n "$description" ]]; then
      msg "$description"
   fi
   msg "Executing: $commandLine"

   # The expansion is left unquoted on purpose, as in the callers' contract.
   # shellcheck disable=SC2086
   eval $commandLine > "$capture" 2>&1
   rc="$?"

   if [[ "$display" -eq 1 ]]; then
      echo "EXIT_CODE: $rc" >> "$capture"
      cat "$capture"
   fi

   /bin/rm -f "$capture"
   return $rc
}

#------------------------------------------------------------------------------
# Function: usage (required function)
#
# Input:
# $1 - style, either -h (for short form) or -man (for man-page like format).
# The default is -h.
#
# $2 - error message (optional).  Specify this if usage() is called due to
# user error, in which case the given message is displayed first, followed by
# the standard usage message (short or long depending on $1).  If displaying
# an error, usually $1 should be -h so that the longer usage message doesn't
# obscure the error message.
#
# Sample Usage:
# usage 
# usage -man
# usage -h "Incorrect command line usage."
#
# This last example generates a usage error message followed by the short
# '-h' usage summary.
#------------------------------------------------------------------------------
function usage
{
   local style="${1:--h}"
   local errorMessage="${2:-Unset}"

   # A usage error, when one is supplied, goes to stderr ahead of the usage
   # text.
   [[ "$errorMessage" == Unset ]] ||
      msg "\\n\\nUsage Error:\\n\\n$errorMessage\\n\\n" >&2

   msg "USAGE for sdp_health_check.sh v$Version:

sdp_health_check.sh

   or

sdp_health_check.sh -h|-man
"

   # The short form ends here; only the -man style prints the long text.
   [[ "$style" == -man ]] || exit 1

   echo -e "DESCRIPTION:

	This script does a health check of the SDP. It generates a
	report log, which can be emailed to support@perforce.com.
	It identifies SDP instances and reports on general SDP health.

	It must be run as the OS user who owns the $HC_SDP_P4CBIN
	directory.  This should be the user account which runs the
	p4d process, and which owns the /p4/common/bin directory
	(often 'perforce' or 'p4admin').

	Characteristics of this script:
	* It is always safe to run.  It does only analysis and reporting.
	* It does only fast checks, and has no interactive prompts.
	  Some log files are captured such as checkpoint.log, but not
	  potentially large ones such as the p4d server log.
	* It requires no command line arguments.
	* It works for any and all UNIX/Linux SDP version since 2007.
 
	Assumptions:
	* The SDP has always used $HC_SDP_ENV as the shell
	  environment file.  This is consistent across all SDP versions.

OPTIONS:
 -D     Set extreme debugging verbosity.

HELP OPTIONS:
 -h	Display short help message
 -man	Display man-style help message

EXAMPLES:
	This script is typically called with no arguments.

LOGGING:
	This script generates a log file and also displays it to stdout at the
	end of processing.  By default, the log is:
	
	/tmp/sdp_health_check.<datestamp>.log

	or

	/tmp/sdp_health_check.log

	The exception is usage errors, which result an error being sent to
	stderr followed usage info on stdout, followed by an immediate exit.

EXIT CODES:
	An exit code of 0 indicates no errors or warnings were encountered.
"

   exit 1
}

#------------------------------------------------------------------------------
# Function: do_341_check ()
#
# Looks for the script and library versions associated with SDP-341 (see
# $SDP_341_URL) by searching for marker strings in files under
# $HC_SDP_P4CBIN.
#
# Globals read: HC_SDP_P4CBIN, SDP_341_URL, H2
# Returns: 0 if nothing known-broken was found, 1 otherwise.  Findings are
# reported with msg/warnmsg/errmsg.
function do_341_check () {

   local -i returnCode=0
   local atRiskScripts=
   local Lib1="$HC_SDP_P4CBIN/backup_functions.sh"
   local Lib2="$HC_SDP_P4CBIN/sdp_functions.sh"
   local Script1="$HC_SDP_P4CBIN/recreate_db_checkpoint.sh"
   local Script2="$HC_SDP_P4CBIN/recreate_db_sync_replica.sh"

   msg "$H2\\nChecking for susceptibility to SDP-341.\\n"

   if [[ -r "$Script1" ]]; then
      if grep -q 'SAVEDIR since we know' "$Script1"; then
         msg "Verified: Known-safe version exists for: $Script1"
      elif grep -q 'save directory since we know' "$Script1"; then
         errmsg "At risk due to existing known-broken version of script $Script1\\nSuggested actions and background information are provided here: $SDP_341_URL"
         returnCode=1
      else
         warnmsg "Unknown version of this script exists: $Script1"
      fi
   else
      msg "Verified: This does not exist: $Script1"
   fi

   if [[ -r "$Script2" ]]; then
      if grep -q 'rm -f rdb.lbr' "$Script2"; then
         msg "Verified: Known-safe version exists for: $Script2"
      elif grep -q 'RsyncCmd=' "$Script2"; then
         errmsg "At risk due to existing known-broken version of script $Script2\\nSuggested actions and background information are provided here: $SDP_341_URL"
         returnCode=1
      else
         warnmsg "Unknown version of this script exists: $Script2"
      fi
   else
      msg "Verified: This does not exist: $Script2"
   fi

   if [[ -r "$Lib1" ]]; then
      if grep -q 'OLDBLNK' "$Lib1"; then
         # Search for callers inside $HC_SDP_P4CBIN itself instead of relying
         # on the caller's working directory.  The cd happens inside the
         # command substitution's subshell, so the working directory of the
         # script is not changed, and matches keep their './name' form.
         atRiskScripts="$(cd "$HC_SDP_P4CBIN" 2>/dev/null && grep -l switch_db_files ./* 2>/dev/null|grep -v backup_functions.sh)"
         if [[ -n "$atRiskScripts" ]]; then
            errmsg "A library file contains a known-broken version of switch_db_files(), that may be called by other scripts.\\nLibrary file is: $Lib1\\nCalling scripts are:\\n$(echo "$atRiskScripts"|tr ' ' '\n')\\n"
            returnCode=1
         else
            warnmsg "A library file contains a known-broken function, switch_db_files(). However, that function is not called by any scripts in $HC_SDP_P4CBIN, and thus is not an issue.  An upgrade of the SDP will replace the library entirely, but it should not be removed now.\\n"
         fi
      else
         msg "Verified: Known-safe version exists for: $Lib1"
      fi
   elif [[ -r "$Lib2" ]]; then
      msg "Verified: Known-safe version exists for: $Lib2"
   else
      # NOTE(review): returnCode is left unchanged here, so the "Verified"
      # summary below is still printed after this error -- confirm intended.
      errmsg "Missing files.  One of these two files should exist:\\n\\t$Lib1\\nor\\n\\t$Lib2\\n\\nSuggested action: Contact Perforce Support <support@perforce.com> and request an SDP health check.\\n"
   fi

   if [[ "$returnCode" -eq 0 ]]; then
      msg "Verified: There is no susceptibility to SDP-341."
   else
      errmsg "Known-broken version of one or more key scripts detected."
   fi

   return $returnCode
}

#------------------------------------------------------------------------------
# Function: do_preflight_checks ()
#
# Runs three checks in order and stops at the first one that fails:
#   1. Every tool in the tool list can be found with 'command -v'.
#   2. A cd to $HC_SDP_P4CBIN works.  The cd is done in the current shell, so
#      on success the working directory is left at $HC_SDP_P4CBIN.
#   3. $ThisUser matches the owner of $HC_SDP_P4CBIN as shown by 'ls -ld'.
#
# Globals read:    HC_SDP_P4CBIN, ThisUser, H2
# Globals written: SDPOwner
# Returns: 0 if all checks pass, 1 otherwise (the failure is reported with
# errmsg).
function do_preflight_checks () {

   local -i exitCode=0
   local tool=
   # awk is on the list because Preflight Check 3 pipes 'ls -ld' through it.
   local ToolsList="date grep id ls awk"

   msg "$H2\\nDoing preflight sanity checks."
   msg "Preflight Check 1: Ensuring basic tools are in the PATH."

   # Report every missing tool before giving up, not just the first one.
   for tool in $ToolsList; do
      if [[ -z "$(command -v "$tool")" ]]; then
         errmsg "Required tool '$tool' not found in PATH."
         exitCode=1
      fi
   done

   [[ $exitCode -ne 0 ]] && return 1

   msg "Verified: Essential tools are in the PATH."

   msg "Preflight Check 2: cd $HC_SDP_P4CBIN"

   if ! cd "$HC_SDP_P4CBIN"; then
      errmsg "Could not cd to: $HC_SDP_P4CBIN"
      return 1
   fi

   msg "Verified: cd works to: $HC_SDP_P4CBIN"

   msg "Preflight Check 3: Checking current user owns $HC_SDP_P4CBIN"
   # The third field of 'ls -ld' output is the owner of the directory.
   # shellcheck disable=SC2012
   SDPOwner=$(ls -ld . | awk '{print $3}')

   if [[ "$ThisUser" == "$SDPOwner" ]]; then
      msg "Verified: Current user [$ThisUser] owns $HC_SDP_P4CBIN"
   else
      errmsg "Current user [$ThisUser] does not own $HC_SDP_P4CBIN."
      return 1
   fi

   return 0
}

#------------------------------------------------------------------------------
# Function: get_sdp_instances ()
#
# Builds the space-separated list of SDP instances.  An entry of the SDP root
# directory counts as an instance when it has a readable root/db.counters
# file.
#
# $1 - SDP root directory to scan (optional, default: /p4).
#
# Globals written: SDPInstanceList (set to empty when nothing is found).
# Side effects: changes the working directory to the SDP root; calls bail()
# if that cd fails.
function get_sdp_instances () {
   local sdpRoot="${1:-/p4}"
   local entry=

   SDPInstanceList=
   cd "$sdpRoot" || bail "Could not cd to $sdpRoot."
   for entry in *; do
      # The cd above succeeded, so ./ is the SDP root.
      if [[ -r "./$entry/root/db.counters" ]]; then
         SDPInstanceList+=" $entry"
      fi
   done

   # Trim the leading space left by the first append.  A quoted
   # "$(echo "$var")" would keep that space, so strip it explicitly.
   SDPInstanceList="${SDPInstanceList# }"
}

#------------------------------------------------------------------------------
# Function: check_sdp_instance ()
#
# Gathers diagnostics for one SDP instance: p4login, the service definition,
# several p4 commands run through $HC_SDP_MRUN, the server.id file, p4d
# -cshow, directory listings, the small log files, and verify_sdp.sh when it
# is executable.
#
# $1 - SDP instance name (required).
#
# Globals read:    HC_SDP_P4LOGIN, HC_SDP_MRUN, HC_SDP_VSDP, SmallLogFiles,
#                  H1, H2, ErrorCount
# Globals written: DirList; ErrorCount and WarningCount through errmsg() and
#                  warnmsg().
# Returns: 1 when no instance is given.  Other problems are reported through
# errmsg()/warnmsg() rather than through the return status.
function check_sdp_instance () {
   local instance="${1:-UnsetSDPInstance}"
   local -i startErrorCount="$ErrorCount"
   # Scratch variables are kept local so they cannot clobber globals.
   local svcName= server= d= log= logPath=

   if [[ "$instance" == "UnsetSDPInstance" ]]; then
      errmsg "Invalid call to check_sdp_instance(), no instance parameter. Skipping further checks for this instance."
      return 1
   fi

   if [[ -x "${HC_SDP_P4LOGIN}" ]]; then
      run "$HC_SDP_MRUN $instance $HC_SDP_P4LOGIN $instance -v" \
         "$H2\\nDoing 'p4login' for instance $instance." ||\
         errmsg "p4 login reported an error."
   else
      warnmsg "No '/p4/common/bin/p4login' script found."
   fi

   # Show the systemd unit when systemctl exists, else list SysV init files.
   if [[ -n "$(command -v systemctl)" ]]; then
      svcName="p4d_${instance}"
      run "systemctl cat $svcName" "Showing systemd service file." ||\
         warnmsg "Could not cat Systemd unit file for service: $svcName."
   else
      run "ls -lArt /etc/init.d/p4*" "Listing SysV p4 init files." ||\
         errmsg "Could not list SysV init files."
   fi

   run "$HC_SDP_MRUN $instance p4 -ztag info" \
      "$H2\\nChecking p4 -ztag info for instance $instance." ||\
      errmsg "p4 info did not respond."

   run "$HC_SDP_MRUN $instance p4 configure show allservers" \
      "$H2\\nChecking p4 configure show allservers." ||\
      errmsg "p4 configure show allservers reported an error."

   run "$HC_SDP_MRUN $instance p4 servers -J" \
      "$H2\\nChecking p4 servers -J" ||\
      errmsg "p4 servers -J reported an error."

   if [[ -e "/p4/$instance/root/server.id" ]]; then
      run "cat /p4/$instance/root/server.id" \
         "Contents of /p4/$instance/root/server.id:" ||\
         errmsg "Could not display contents of server.id file."
   else
      errmsg "Expected ServerID file is missing: /p4/$instance/root/server.id"
   fi

   if [[ -e "/p4/$instance/bin/p4d_$instance" ]]; then
      run "/p4/$instance/bin/p4d_$instance -r /p4/$instance/root/ -cshow" \
         "Contents of /p4/$instance/root -cshow:" ||\
         errmsg "Could not display contents of -cshow."
   else
      errmsg "Expected p4d_$instance or /p4/$instance/root is missing: /p4/$instance/bin/p4d_$instance or /p4/$instance/root/"
   fi

   run "$HC_SDP_MRUN $instance p4 journals -m 100" \
      "$H2\\nChecking journal data:" ||\
      errmsg "p4 journals -m 100 reported an error."

   # One server spec per ServerID reported by 'p4 servers'.
   for server in $("$HC_SDP_MRUN" "$instance" p4 -ztag -F %ServerID% servers); do
      run "$HC_SDP_MRUN $instance p4 server -o $server" \
         "$H2\\nChecking p4 server spec for server $server" ||\
         errmsg "p4 server -o $server reported an error."
   done

   run "$HC_SDP_MRUN $instance env" \
      "$H2\\nChecking shell environment for instance $instance." ||\
      errmsg "Shell environment did not load for instance $instance"

   run "$HC_SDP_MRUN $instance p4 counters | grep '^SDP_'" \
      "$H2\\nChecking SDP version counters for instance $instance." ||\
      errmsg "Error checking SDP version counters for instance $instance."

   # In the directory values, always include the trailing '/', or the 'ls'
   # may not give desired results.
   msg "${H1}\\nListing key directories."
   DirList="/p4/ /p4/$instance/ /p4/$instance/bin/ /p4/$instance/logs/ /p4/$instance/checkpoints/"

   # For edges and some replicas, a checkpoints.* directory may exist; if so
   # add it to the list of directories checked.  A glob is used rather than
   # parsing 'ls' output; when nothing matches, the pattern stays literal and
   # is skipped by the existence test.
   for d in /p4/"$instance"/checkpoints.*; do
      [[ -e "$d" || -L "$d" ]] || continue
      DirList+=" $d/"
   done

   for d in $DirList; do
      run "ls -lArt $d" "Listing: $d" ||\
         errmsg "Failed to list dir: $d"
   done

   # Good to list P4ROOT and offline_db in db size order
   DirList="/p4/$instance/root/ /p4/$instance/offline_db/"
   for d in $DirList; do
      run "ls -lAhS $d" "Listing: $d" ||\
         errmsg "Failed to list dir: $d"
   done

   msg "${H1}\\nListing small log files."
   for log in "${SmallLogFiles[@]}"; do
      logPath="/p4/$instance/logs/$log"
      if [[ -e "$logPath" ]]; then
         if run "cat $logPath" "$H2\\nCapturing contents of log file $logPath:"; then
            msg "\\n=== END contents of log $logPath ===\\n"

            # After catting small log files, check to see if they have error
            # messages in known formats.  The die() function in the SDP
            # backup_functions.sh library, used in several SDP scripts,
            # writes critical errors with ': ERROR!!!'. Other scripts
            # report '^Error:'. This regex avoids false-positives with
            # scripts that have the word error, e.g. "NO ERRORS".
            if grep -q -E '(^Error:|: ERROR\!\!\!)' "$logPath"; then
               errmsg "Found one or more errors in: $logPath"
            fi
         else
            errmsg "Error showing contents of log: $logPath"
         fi
      else
         msg "Log $log does not exist here."
      fi
   done

   msg "$H2\\nChecking structure."
   if [[ -L "/p4/$instance" ]]; then
      errmsg "Instance $instance uses old-style symlink structure and should be upgraded."
   fi

   if [[ -x "$HC_SDP_VSDP" ]]; then
      run "$HC_SDP_VSDP $instance -L off" "${H1}\\nRunning verify_sdp.sh" ||\
         errmsg "SDP Verify failed for instance $instance."
   else
      msg "Note: $HC_SDP_VSDP is not available to execute."
   fi

   # Any error counted since entry means this instance has a problem.
   if [[ "$ErrorCount" -eq "$startErrorCount" ]]; then
      msg "SDP instance $instance seems OK."
   else
      errmsg "SDP instance $instance has issues - see above."
   fi
}

#------------------------------------------------------------------------------
# Function: terminate
#
# Exit handler.  Prints the error and warning totals, the overall exit code,
# and the log file location, then exits with that code:
#   0 - all clean
#   1 - errors and maybe warnings
#   2 - warnings, but no errors.
function terminate
{
   local -i finalCode=0

   # Clear the handlers first so the 'exit' below does not run this again.
   trap - EXIT SIGINT SIGTERM

   msg "$H1\\nErrors detected: $ErrorCount"
   msg "Warnings detected: $WarningCount"

   # Errors take precedence over warnings.
   if [[ "$ErrorCount" -ne 0 ]]; then
      finalCode=1
   elif [[ "$WarningCount" -ne 0 ]]; then
      finalCode=2
   fi

   msg "$ThisScript: EXITCODE: $finalCode"
   msg "$H2\\nLog file: $Log"

   exit "$finalCode"
}
#==============================================================================
# Command Line Processing

# Number of extra positional parameters to discard after the current one.
declare -i shiftArgs=0

# Unset-variable checking is switched off while the arguments are examined
# and switched on for the rest of the script.
set +u
while (( $# > 0 )); do
   case "$1" in
      -h) usage -h ;;
      -man) usage -man ;;
      -D) set -x ;; # Debug; use 'set -x' mode.
      -*) usage -h "Unknown command line flag ($1)." ;;
      *) usage -h "Unknown command line fragment ($1)." ;;
   esac

   # Drop the argument just handled, then any extra ones it claimed.
   shift
   while (( shiftArgs > 0 )); do
      if (( $# == 0 )); then
         usage -h "Incorrect number of arguments."
      fi
      shiftArgs=$((shiftArgs - 1))
      shift
   done
done
set -u

#==============================================================================
# Main Program

# From here on, exiting or receiving SIGINT/SIGTERM runs terminate().
trap terminate EXIT SIGINT SIGTERM

# Start Logging: stdout is copied to the log through tee, and stderr is then
# pointed at the same stream.
exec > >(tee "${Log}")
exec 2>&1

# Take the account name from 'id' when it is available, else from USER.
if command -v id > /dev/null; then
   ThisUser="$(id -un)"
else
   ThisUser="$USER"
fi

msg "$ThisScript v$Version as $ThisUser@${HOSTNAME%%.*}\\nStarting verification at $(date +'%a %Y-%m-%d %H:%M:%S %Z')."

msg "This log file is: $Log"

# Nothing useful can be gathered if the preflight checks fail.
if ! do_preflight_checks; then
   bail "Preflight checks failed. Aborting."
fi

if ! do_341_check; then
   errmsg "Failed check for SDP-341."
fi

get_sdp_instances

if [[ -z "$SDPInstanceList" ]]; then
   errmsg "No SDP instances detected."
else
   msg "$H2\\nList of SDP Instances to verify: $SDPInstanceList"
fi

msg "$H1\\nChecking each SDP instance."

# Check each instance and queue its p4_<instance>.vars file for the key file
# capture that follows later.
for sdpInstance in $SDPInstanceList; do
   check_sdp_instance "$sdpInstance"
   KeyFiles[$KeyFileCount]="${HC_SDP_P4CCFG}/p4_${sdpInstance}.vars"
   KeyFileCount=$((KeyFileCount + 1))
done

msg "$H1\\nGeneral Checks."

msg "OS Info:"

run "uname -a" "OS uname info for UNIX/Linux"

# /etc/os-release is only reported when it is readable.
if [[ -r /etc/os-release ]]; then
   run "cat /etc/os-release" "$H2\\nList Linux distribution/version:" ||\
      errmsg "Could not read /etc/os-release file."
fi

# Label the crontab with ThisUser, which this script has already determined.
# USER is not guaranteed to be set, and expanding it while 'set -u' is in
# effect would abort the script.
run "crontab -l" "$H2\\nCrontab for $ThisUser:" ||\
   errmsg "Failed to gather crontab for $ThisUser."

run "ps -ef | grep p4" "$H2\\nChecking currently running p4 processes:" ||\
   errmsg "Error checking processes."

# A failed listing is reported through errmsg so that it is counted; a bare
# string here would be executed as a command name.
run "ls -lArt $HC_SDP_P4CBIN/" "$H2\\nListing files in $HC_SDP_P4CBIN:" ||\
   errmsg "Error listing files in: $HC_SDP_P4CBIN"

run "df -h" "$H2\\nChecking volumes and storage available."

msg "$H1\\nKey File Checks."

# Capture the contents of every key file.  Each array element is one path,
# so the array is expanded quoted and element by element; that keeps a path
# from being re-split or glob-expanded.
for file in "${KeyFiles[@]}"; do
   if [[ -r "$file" ]]; then
      if run "cat $file" "$H2\\nCapturing contents of file $file:"; then
         msg "\\n=== END contents of $file ===\\n"
      else
         errmsg "Error showing contents of file: $file"
      fi
   else
      errmsg "Expected file is missing: $file"
   fi
done

msg "$H1\\nSDP Version Checks."

# /p4/sdp/Version is optional: whether it exists depends on how the SDP was
# installed on this machine, so its absence is only noted, not counted as an
# error.
if [[ ! -r /p4/sdp/Version ]]; then
   msg "File /p4/sdp/Version did not exist."
else
   run "cat /p4/sdp/Version" "Version from /p4/sdp/Version:" ||
      errmsg "Could not cat /p4/sdp/Version file."
fi

# SDP_VERSION is expected in the p4_vars file; a missing setting counts as an
# error.
if ! grep -q 'SDP_VERSION=' "$HC_SDP_ENV"; then
   errmsg "No SDP_VERSION defined in $HC_SDP_ENV."
else
   run "grep 'SDP_VERSION=' $HC_SDP_ENV" "Version from $HC_SDP_ENV:" ||
      errmsg "Failed to run: grep 'SDP_VERSION=' $HC_SDP_ENV"
fi

msg "$H1\\nSummary:"

# One summary line, chosen by whether any errors or warnings were counted.
if [[ "$ErrorCount" -ne 0 ]]; then
   msg "Encountered $ErrorCount errors and $WarningCount warnings."
elif [[ "$WarningCount" -ne 0 ]]; then
   msg "Encountered no errors and $WarningCount warnings."
else
   msg "No errors or warnings detected."
fi

msg "\\nIf you have any questions about the output from this script, contact support@perforce.com"

# This exit fires the EXIT trap; terminate() prints the totals and sets the
# script's real exit code.
exit 0
# Change User Description Committed
#8 32135 C. Thomas Tyler Released SDP 2025.1.32133 (2025/10/29).
Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#7 31566 C. Thomas Tyler Released SDP 2024.2.31564 (2025/05/14).
Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#6 31077 C. Thomas Tyler Released SDP 2024.2.31075 (2024/12/20).
Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#5 30915 C. Thomas Tyler Released SDP 2024.1.30913 (2024/11/20).
Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#4 30388 C. Thomas Tyler Released SDP 2024.1.30385 (2024/06/11).
Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#3 30297 C. Thomas Tyler Released SDP 2023.2.30295 (2024/05/08).
Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#2 30043 C. Thomas Tyler Released SDP 2023.2.30041 (2023/12/22).
Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
#1 29891 C. Thomas Tyler Released SDP 2023.1.29699 (2023/07/11).
Copy Up using 'p4 copy -r -b perforce_software-sdp-dev'.
//guest/perforce_software/sdp/dev/Server/Unix/p4/common/bin/sdp_health_check.sh
#1 29844 C. Thomas Tyler Added sdp_health_check to SDP package.

Updated docs in Guide and Release Notes to reflect this change.

Added more docs for this in the SDP Guide.

#review-29845 @vkanczes