#!/bin/bash
#==============================================================================
# Copyright and license info is available in the LICENSE file included with
# the Server Deployment Package (SDP), and also available online:
# https://swarm.workshop.perforce.com/projects/perforce-software-sdp/view/main/LICENSE
#------------------------------------------------------------------------------

# Helix Server base init script
#
# Usage:
#    p4d_base SDP_INSTANCE {start|stop|status|restart|force_start|admin_stop}
#
# Environment read by this script:
#    OS_INIT_MECHANISM  Compared against "systemd" to decide whether a
#                       start/stop request arrived via the systemd unit.
#    SDP_START_DELAY    Seconds to sleep before starting p4d (default 2).
#
# The exit code is that of the selected operation; 'restart' reports the
# exit code of its 'start' step.

# Do nothing unless $SDP_INSTANCE is defined.
export SDP_INSTANCE="${1:-Unset}"
export OS_INIT_MECHANISM="${OS_INIT_MECHANISM:-Unset}"

declare -i StartDelay="${SDP_START_DELAY:-2}"
declare -i TailCmdUsable=0
declare -i UseSystemd=0
declare -i ExitCode=0
declare ThisScript="${0##*/}"
declare Version=5.0.1
declare CmdLine="$0 $*"
declare OpMode="${2:-usage}"
declare Log=
declare TmpDir=
declare TmpLog=
declare TmpJnl=
declare MustUseSystemdMsg=
declare ServiceName=
declare FirstLine=
declare -a MailSenderOpt=()

if [[ "$SDP_INSTANCE" == "Unset" ]]; then
   echo -e "\\nError: The SDP_INSTANCE is not defined.\\n."
   exit 1
fi

# Load SDP controlled shell environment.
# shellcheck disable=SC1091
source /p4/common/bin/p4_vars "$SDP_INSTANCE"
ExitCode=$?

if [[ "$ExitCode" -ne 0 ]]; then
   echo -e "\\nError: Failed to load SDP environment for instance $SDP_INSTANCE.\\n"
   exit 1
fi

# shellcheck disable=SC1090 disable=SC1091
source "$P4CBIN/backup_functions.sh"
ExitCode=$?

if [[ "$ExitCode" -ne 0 ]]; then
   echo -e "\\nError: Failed to load SDP lib $P4CBIN/backup_functions.sh.\\n"
   exit 1
fi

if [[ "${P4D_FLAGS:-Unset}" == "Unset" ]]; then
   echo -e "\\nError: P4D_FLAGS is unset after loading SDP environment for instance $SDP_INSTANCE.\\n"
   exit 1
fi

# When run as root, re-execute this same command line as $OSUSER.
if [[ $(id -u) -eq 0 ]]; then
   exec su - "$OSUSER" -c "$0 $*"
elif [[ $(id -u -n) != "$OSUSER" ]]; then
   echo "$0 can only be run by root or $OSUSER"
   exit 1
fi

if [[ ! -x "$P4DBIN" ]]; then
   echo -e "\\nError: $P4DBIN is not executable."
   exit 2
fi

#------------------------------------------------------------------------------
# Function: p4d_base_cleanup
#
# Remove the temporary working directory. Installed as the EXIT trap so the
# directory is removed on every exit path, including the early 'exit 1'
# error paths in the operation handlers below.
#
# Globals: TmpDir (read)
#------------------------------------------------------------------------------
p4d_base_cleanup() {
   if [[ -n "$TmpDir" && -d "$TmpDir" ]]; then
      rm -rf -- "$TmpDir"
   fi
}

#------------------------------------------------------------------------------
# Function: p4d_base_launch_p4d
#
# Common launch sequence shared by the 'start' and 'force_start' operations:
# optional start delay, cd to P4ROOT, enable core dumps where permitted, then
# run p4d with $P4D_FLAGS, echoing its output to the terminal and to $Log.
#
# Globals: P4DBIN, P4D_FLAGS, P4ROOT, StartDelay, Log, TmpLog (read);
#          ExitCode (written: exit status of the p4d command).
#------------------------------------------------------------------------------
p4d_base_launch_p4d() {
   echo "Starting $P4DBIN $P4D_FLAGS" | tee -a "$Log"

   # Delay start $StartDelay seconds, unless P4ROOT is empty.
   [[ -r "$P4ROOT/db.domain" ]] && sleep "$StartDelay"

   # The 'cd' must run in this shell, not as a stage of a pipeline (where it
   # would run in a subshell and have no effect on the p4d started below).
   # Any error text is captured and then shown on the terminal and in the
   # log. As before, a failed 'cd' does not prevent the start attempt.
   if ! cd "$P4ROOT" 2> "$TmpLog"; then
      tee -a "$Log" < "$TmpLog"
   fi

   # Only raise the core file size limit when the current limit is non-zero.
   if [[ -n "$(command -v ulimit)" && "$(ulimit -c)" != "0" ]]; then
      ulimit -c unlimited
   fi

   # P4D_FLAGS is deliberately unquoted so it splits into separate arguments.
   # shellcheck disable=SC2086
   "$P4DBIN" $P4D_FLAGS > "$TmpLog" 2>&1
   ExitCode=$?
   cat "$TmpLog" >> "$Log"
   cat "$TmpLog"
}

# Ensure that the '--pid-file' argument is provided, and add it if it is not.
# This is intended to prevent problems upgrading the SDP in case the
# /p4/common/config/p4_N.vars file isn't updated to use the new template.
if [[ "$P4D_FLAGS" != *"--pid-file"* ]]; then
   export P4D_FLAGS="$P4D_FLAGS --pid-file"
fi

# Determine if systemd is configured for this instance.
if [[ -n "$(command -v systemctl)" ]]; then
   ServiceName="${P4DBIN##*/}"
   if [[ -n "$(systemctl is-enabled "$ServiceName" 2>/dev/null)" ]]; then
      UseSystemd=1
      MustUseSystemdMsg="\\nError: It appears an attempt was made to start or stop the $ServiceName\\nservice without using systemd. When a systemd unit file is configured, only the\\nsystemd init mechanism can be used to start and stop the service, so that\\nsystemd maintains a more reliable indication of the status of the service. Run\\nas root or as $OSUSER with sudo. For example:\\n\\n\\tsudo systemctl start $ServiceName\\n\\tsudo systemctl stop $ServiceName\\n\\tsystemctl status $ServiceName\\n\\nNote that the 'status' check does not require sudo.\\n\\nThe systemd unit file is:\\n\\n<<<<<<<<\\n$(sudo systemctl cat "$ServiceName")\\n>>>>>>>\\n\\nIf you used 'systemctl' and still got this message, then please check if OS_INIT_MECHANISM is defined\\nin the above file. See /p4/sdp/Server/Unix/setup/systemd/p4d_1.service for an example."
   fi
fi

# For P4D 2017.1+, automatically replace '-d' with '--daemonsafe', unless
# we are running with systemd. If running with systemd, we drop the
# '-d'/'--daemonsafe' flags and let systemd handle the process forking.
# Disable shellcheck as we intend a string compare even though it looks like
# we're trying to compare a decimal value.
# shellcheck disable=SC2072
if [[ "$P4D_VERSION" > "2017.1" ]]; then
   if [[ "$UseSystemd" -eq 1 ]]; then
      export P4D_FLAGS=${P4D_FLAGS/ -d / }
      export P4D_FLAGS=${P4D_FLAGS/ --daemonsafe / }
   else
      export P4D_FLAGS=${P4D_FLAGS/ -d / --daemonsafe }
   fi
fi

Log="$LOGS/p4d_init.log"
export LOGFILE="$Log"

TmpDir=$(mktemp -d 2>/dev/null)
if [[ ! -d "$TmpDir" ]]; then
   TmpDir=$(mktemp -d -p "$P4TMP" -t 'tmp_p4d_base.XXXXXXXX')
fi

if [[ ! -d "$TmpDir" ]]; then
   echo -e "\\nError: Could not initialize TmpDir [$TmpDir]\\n" | tee -a "$Log"
   exit 1
fi

# Remove the temp dir on every exit path from here on.
# NOTE(review): this replaces any EXIT trap installed by the sourced SDP
# files; none is visible from this script -- confirm.
trap p4d_base_cleanup EXIT

TmpLog="${TmpDir}/tmp.p4d_base.log"

echo -e "\\n$(date) Called $ThisScript v$Version with command line:\\n$CmdLine" >> "$Log"

set_vars

# If the 'tail' command responds to the '--version' flag, it is useful for our
# purposes in this script. If it does not recognize '--version' (e.g. on OSX),
# then it likely will not work with 'tail -<number>', and so not useful in this
# script.
if [[ -n "$(command -v tail)" ]]; then
   if tail --version > /dev/null 2>&1; then
      TailCmdUsable=1
   fi
fi

# See how we were called.
case "$OpMode" in
   (force_start)
      if [[ -e "$P4ROOT/P4ROOT_not_usable.txt" ]]; then
         echo -e "\\nWarning: The $P4ROOT/P4ROOT_not_usable.txt file exists, indicating\\ndatabases in P4ROOT are not safe to use. Contents:\\n$(cat "$P4ROOT/P4ROOT_not_usable.txt")\\nIgnoring this due to use of force_start option. Review the bottom of this log:\\n$Log\\n" | tee -a "$Log"
      fi

      if [[ "$UseSystemd" -eq 1 && "$OS_INIT_MECHANISM" != "systemd" ]]; then
         echo -e "\\nWarning: An attempt was made to start the $ServiceName service\\nwithout using systemd on a system configured to use systemd. Due to use of\\nforce_start, this will be allowed.\\n" | tee -a "$Log"
      fi

      p4d_base_launch_p4d
   ;;

   (start)
      if [[ -e "$P4ROOT/P4ROOT_not_usable.txt" ]]; then
         echo -e "\\nError: The $P4ROOT/P4ROOT_not_usable.txt file exists, indicating\\ndatabases in P4ROOT are not safe to use. Contents:\\n$(cat "$P4ROOT/P4ROOT_not_usable.txt")\\nRefusing to start p4d. Review the bottom of this log:\\n$Log\\n" | tee -a "$Log"
         exit 1
      fi

      if [[ "$UseSystemd" -eq 1 && "$OS_INIT_MECHANISM" != "systemd" ]]; then
         echo -e "$MustUseSystemdMsg"
         exit 1
      fi

      # Preflight checks are skipped when P4ROOT has no readable db.domain.
      if [[ -r "$P4ROOT/db.domain" ]]; then
         echo "Preflight check: $P4DBIN -r $P4ROOT -xvU" > "$TmpLog"
         "$P4DBIN" -r "$P4ROOT" -xvU >> "$TmpLog" 2>&1
         ExitCode=$?
         echo "EXIT_CODE=$ExitCode" >> "$TmpLog"
         cat "$TmpLog" >> "$Log"

         if [[ "$ExitCode" -ne 0 ]]; then
            cat "$TmpLog"
            echo -e "\\nError: DB check with 'p4d -xvU' failed. Database integrity is in question.\\nPlease Contact Perforce Support (support-helix-core@perforce.com). The force_start option\\nis available, but not recommended. Review the bottom of this log:\\n$Log\\n" | tee -a "$Log"
            exit 1
         fi

         if [[ -e "$P4JOURNAL" && "$TailCmdUsable" -eq 1 ]]; then
            echo "Preflight journal health check - validating journal in a temp directory" > "$TmpLog"
            TmpJnl="$TmpDir/jnl.test"
            tail -10000 "$P4JOURNAL" 2>/dev/null | grep -av "@vv@" > "$TmpJnl" 2>> "$TmpLog"

            # Check for first full record. Some records are multiline and we
            # may be part way through.
            FirstLine=$(grep -a -En "^@(ex|nx|pv|rv|dv)@" "$TmpJnl" 2>&1 | head -1 | cut -d: -f1)
            if [[ "$FirstLine" -gt "1" ]]; then
               # Skip lines if necessary to start with a full record.
               mv "$TmpJnl" "${TmpJnl}.1"
               tail -n "+$FirstLine" "${TmpJnl}.1" > "$TmpJnl" 2>> "$Log"
            fi

            # Replay the journal tail with TmpDir as the server root rather
            # than P4ROOT; a non-zero exit is treated as possible corruption.
            "$P4DBIN" -r "$TmpDir" -f -jr "$TmpJnl" >> "$TmpLog" 2>&1
            ExitCode=$?
            echo "EXIT_CODE=$ExitCode" >> "$TmpLog"

            if [[ "$ExitCode" -ne 0 ]]; then
               cat "$TmpLog" >> "$Log"
               cat "$TmpLog"
               echo -e "\\nError: possible corruption at end of journal detected. Journal is being rotated so any corruption is at end of file. You may wish to contact Perforce Support (support-helix-core@perforce.com). Server is still being started as normal. NOTE If there is any corruption then replicas will likely stop replicating until this is fixed!! If there are no problems with replication then this error can be ignored. Review this file:\\n$Log\\n" | tee -a "$Log"
               get_journalnum
               p4d_truncate_journal
               subject="ERROR!!! - $HOSTNAME $P4SERVER Possible journal corruption detected."
               mail_sender_opt=$(get_mail_sender_opt)
               # Split the sender option into words so that an empty value
               # adds no argument at all, rather than a bogus empty one.
               # NOTE(review): assumes get_mail_sender_opt returns either an
               # empty string or a whitespace-separated option -- confirm.
               read -r -a MailSenderOpt <<< "$mail_sender_opt"
               echo "Sending mail: $SDPMAIL -s $subject $mail_sender_opt $MAILTO" | tee -a "$Log"
               "$SDPMAIL" -s "$subject" "${MailSenderOpt[@]}" "$MAILTO" < "$Log"
            fi
         fi
      fi

      p4d_base_launch_p4d
   ;;

   (status)
      if [[ -r "$P4ROOT/server.pid" ]]; then
         pid=$(cat "$P4ROOT/server.pid")
         echo -e "\\nThe $P4ROOT/server.pid file contains pid $pid. Pid info:"
         "$PS" -f -p "$pid" > "$TmpLog"
         ExitCode=$?
         cat "$TmpLog"
         if [[ "$ExitCode" -ne 0 ]]; then
            echo -e "\\nError: A server.pid file exists, but that process id is not running. This could indicate abnormal process termination.\\n"
         fi
      fi

      # The final status is decided by whether 'p4 info -s' gets a response;
      # it overrides the exit code of the pid check above.
      if "$P4BIN" -p "$P4PORT" info -s > "$TmpLog" 2>&1; then
         ExitCode=0
      else
         # If we get an SSL trust error, then the server is online.
         if grep -lq 'The authenticity of' "$TmpLog"; then
            # Do a fire & forget attempt to fix the 'p4 trust' issue.
            "$P4BIN" -p "$P4PORT" trust -f -y > /dev/null 2>&1
            ExitCode=0
         else
            ExitCode=1
         fi
      fi
      cat "$TmpLog"
   ;;

   (admin_stop)
      if [[ "$UseSystemd" -eq 1 && "$OS_INIT_MECHANISM" != "systemd" ]]; then
         echo -e "$MustUseSystemdMsg"
         exit 1
      fi

      # If there is no server.pid file, shut down the old fashioned way.
      echo -n "Shutting down $P4DBIN: " | tee -a "$Log"
      if [[ "${P4REPLICA}" == "FALSE" ]]; then
         "$P4CBIN/p4login"
      fi
      echo "$P4BIN -p $P4PORT -u $P4USER admin stop" | tee -a "$Log"
      "$P4BIN" -p "$P4PORT" -u "$P4USER" admin stop 2>&1 | tee -a "$Log"
      sleep 5

      # If 'p4 info' still gets a response, the shutdown did not work.
      "$P4BIN" -p "$P4PORT" info > /dev/null 2>&1
      ExitCode=$?
      if [[ "$ExitCode" -eq 0 ]]; then
         echo -e "\\nError: Server shutdown failed." | tee -a "$Log"
         exit 1
      else
         exit 0
      fi
   ;;

   (stop)
      if [[ -r "$P4ROOT/server.pid" ]]; then
         pid=$(cat "$P4ROOT/server.pid")
         if "$PS" -p "$pid" > /dev/null 2>&1; then
            if [[ "$UseSystemd" -eq 1 && "$OS_INIT_MECHANISM" != "systemd" ]]; then
               echo -e "$MustUseSystemdMsg"
               exit 1
            fi

            echo -e "\\nSending SIGTERM signal to pid $pid in $P4ROOT/server.pid." | tee -a "$Log"
            kill "$pid" 2>&1 | tee -a "$Log"
            sleep 1

            # Do not return until the process is gone.
            "$PS" -p "$pid" > /dev/null 2>&1
            status=$?
            if [[ $status -eq 0 ]]; then
               echo -n "Waiting for p4d to shutdown ..." | tee -a "$Log"
               while [[ $status -eq 0 ]]; do
                  echo -n "."
                  sleep 5
                  "$PS" -p "$pid" > /dev/null 2>&1
                  status=$?
               done
            fi
            echo -e "\\nConfirmed shutdown of $P4DBIN." | tee -a "$Log"
         else
            echo -e "\\nError: A server.pid file exists, but that process id is not running. This could indicate abnormal process termination.\\n" | tee -a "$Log"
            exit 1
         fi
      else
         # If there is no server.pid file, but we can still get a response from
         # 'p4 info', try to shut down with the front-door method of calling
         # 'p4 admin stop'. If there is no server.pid file and no response from
         # 'p4 info', then p4d is down.
         if "$P4BIN" info -s > /dev/null 2>&1; then
            echo -e "\\nWarning: Missing $P4ROOT/server.pid. Attempting shutdown with 'p4 admin stop'.\\n" | tee -a "$Log"
            echo "$0" "$SDP_INSTANCE" admin_stop | tee -a "$Log"
            "$0" "$SDP_INSTANCE" admin_stop
            # Report a failed fallback shutdown instead of always exiting 0.
            ExitCode=$?
         fi
      fi
   ;;

   (restart)
      "$0" "$SDP_INSTANCE" stop
      "$0" "$SDP_INSTANCE" start
      # Report the result of the start step instead of always exiting 0.
      ExitCode=$?
   ;;

   (*)
      echo -e "\\nUsage: $0 SDP_INSTANCE {start|stop|status|restart|force_start|admin_stop}\\n"
      exit 1
   ;;
esac

# The temp dir is removed by the EXIT trap (p4d_base_cleanup).
exit "$ExitCode"
# | Change | User | Description | Committed | |
---|---|---|---|---|---|
#44 | 30266 | C. Thomas Tyler |
Added logic surrounding 'ulimit -c unlimited' to avoid calling 'ulimit' on systems on which it is doomed to fail with an 'Operation not permitted' error. |
||
#43 | 30127 | C. Thomas Tyler |
Enhanced p4d_base to start p4d from the P4ROOT directory and to set 'ulimit -c unlimited' to enable core dumps. Similar was done for p4p_base and p4broker_base. For best results, combine this with doing the following as root on p4d server machines: bash -c "echo 'core.%e.%p.%s' > /proc/sys/kernel/core_pattern" This will generate core files for p4d in $P4ROOT, something like this: /p4/1/root/core.p4d_1.2351279.11 For p4p, core dumps appear in the $P4PCACHE dir. For p4broker, core dumps appear in the $LOGS dir. Also added version identifier that is captured in systemctl status output. #review-30128 |
||
#42 | 29100 | C. Thomas Tyler |
Removed requirement for systemd *.service files to have open perms in *_base scripts and templates. Removed checks for open perms on *.service files in verify_sdp.sh. Fixed minor ShellCheck compliance issue. See also: HI-101: https://swarm.workshop.perforce.com/jobs/HI-101 [Submitting, then re-opening for post-commit final review]. |
||
#41 | 28976 | Domenic |
Add back the info from $(date) to p4d_base so the p4d_init.log file captures the date and time that a command was ran. It looks like this was (accidentally?) lost in CL 27064 when converting from "log" to "echo -e". |
||
#40 | 28771 | C. Thomas Tyler |
Changed email address for Perforce Support. #review-28772 @amo @robert_cowham |
||
#39 | 28674 | C. Thomas Tyler |
Enhanced call to 'mktemp -d' to be more portable. The call was only made when /var/log is 100% full did a call to 'mktemp -d' that works on some platforms but not Ubuntu. Without this change, if /var/log was 100% full, on some platforms the error would show as: mktemp: too few X's in template ‘tmp_p4d_base’ This logic was only exercised if /var/log was 100% full. #review-28675 |
||
#38 | 28369 | Robert Cowham |
Add explanation message for p4d_base when systemd file not correct. Move warning about p4d journal validation and only output on error. Avoids unnecessary alarms. |
||
#37 | 27955 | C. Thomas Tyler |
Fixed issue that can result in false-positives in the pre-flight journal corruption check. #review-27948 @pascal_soccard @robert_cowham |
||
#36 | 27206 | C. Thomas Tyler | More minor message refinements. | ||
#35 | 27205 | C. Thomas Tyler | Cosmetic refinement to error message. | ||
#34 | 27199 | C. Thomas Tyler |
Made is_server_running() function in backup_functions.sh aware of the nuances of checking whether p4broker, p4d, and p4p are up. For the broker in particular, only the default broker configuration is checked. This will avoid process compatibility issues with customers using DFM (Down for Maintenance) brokers as part of the upgrade procedure, either manually or with the Helix Management System (HMS). This was done by deferring to the *_base scripts for each service type, as these scripts now return reliable exit codes for status. Made p4d_base not write anything to p4d_init.log if only a 'status' check is done. Fixed bug in 'p4d base' status check where it would return a wrong exit code if p4d was up but SSL trust was not established. Removed redundant logic in start_p4[d,p,broker] functions for service status checking. Enhanced p4d_base to clean up temp dir when done. #review-27200 |
||
#33 | 27109 | C. Thomas Tyler |
Refined logic so that if a shutdown is attempted and there is no server.pid file, the 'fallback/2nd pass/assured shutdown' logic to do a 'p4 admin stop' fires only if a 'p4 info' command can reach the server. Otherwise, the stop command is silently ignored. Refined exit code handling and fixed log interaction issues. Fixed issue with systemd status not indicating successful startup. The systemd unit files now specify the Type as simple rather than forking, and defer to systemd to handle the forking. When systemd is used, the -d/--daemonsafe flags are removed from service start commands for p4p/p4broker/p4d, and for p4dtg the & is removed so the process is not started in the background. For compatibility with non-systemd systems, the -d/--daemonsafe flags are still applied when systemd is not in use. With this change, systemd's concept of whether the service is running should now be reliable. If you try to start without using systemd on a system for which a systemd unit file exists, an error is displayed -- a big, hopefully useful and informative error message. #review |
||
#32 | 27075 | C. Thomas Tyler |
Minor changes to formatting. Added warning if P4ROOT_not_usable.txt is found when starting in 'force_start' mode, but the server still attempts to start. Note: Unlike the journal corruption check, the 'force_start' option is not mentioned in the error message to the user if the server refuses to start due to having a P4ROOT_not_usable.txt file. This is because potential journal corruption is something that can sometimes be ignored, and in any case you may need to start the server to work on it. But for the scenarios in which P4ROOT_not_usable.txt is there, e.g. a checkpoint wasn't done replaying when 'load_checkpoint.sh' got interrupted, force_start won't likely be helpful. |
||
#31 | 27070 | C. Thomas Tyler |
Added backup_functions.sh back into p4d_base. Fixed failing regression tests. |
||
#30 | 27064 | C. Thomas Tyler |
Fixed issue where 'source p4_vars' hangs if load_checkpoint.sh is running. Added new semaphore file, $P4ROOT/P4ROOT_not_usable.txt. This is used in a way similar to 'offline_db_usable.txt' in the offline_db, except that this file only exists when the databases in P4ROOT are not usable. This is the opposite of how offline_db_usable.txt works, because P4ROOT is expected to be usable 99.9% of the time. p4d_base will refuse to start p4d if this file exists, protecting against possible operator errors (like trying to start p4d when a checkpoint is still loading). Added check_file_dne() function to verify_sdp.sh to confirm a named file does not exist. Added checks in verify_sdp.sh that P4ROOT_not_usable.txt does not exist in P4ROOT or offline_db. Modified switch_db_files() (called by refresh_P4ROOT_from_offline_db.sh) to properly use the new P4ROOT_not_usable.txt safety file. Fixed bugs in p4d_base that could cause p4d_init.log to be overwritten if error output was generated. Removed call to 'backup_functions.sh' in p4d_base, as on balance it added more complexity than needed. #review-27065 |
||
#29 | 26928 | Robert Cowham |
Fix problem with line breaks when testing for journal corruption. Also softened the error message to avoid unnecessary alarm for users! Observed "cd -" not working on AWS Linux box. Changed to pushd/popd. |
||
#28 | 26881 | C. Thomas Tyler |
Skip journal corruption check if no P4JOURNAL exists. If the P4JOURNAL file doesn't exist for whatever reason, the code does a 'tail' on a non-existent file, resulting in a harmless error in the p4d_init.log file. Changed wording from "Preflight journal corruption check" to "Preflight journal health check", as it just feels wrong to have the words "journal corruption" appear every time p4d starts. And in at least one case I saw a customer have terminal auto-coloring react, highlighting in red and bolding the word "corruption," making it even scarier, even though the test passed. Fixed an unrelated shellcheck issue (no functional effect) to maintain shellcheck compliance. |
||
#27 | 26399 | C. Thomas Tyler |
Shellcheck compliance tweaks. Fixed quoting bug in unreleased dev branch version. Added startDelay to p4broker_base and p4p_base. |
||
#26 | 26398 | C. Thomas Tyler |
Added delay on start, to work around common issues where a fast cycle of p4d process fails because the TCP port (e.g. 1666) is still in use briefly after a clean shutdown, a system resource isn't available immediately on boot, or similar timing issue. The delay on start is skipped if P4ROOT is empty (tested for by checking for existence of db.domain). Added optional new SDP_START_DELAY setting in the p4_vars file (generated from p4_vars.template) to configure the delay. Files in this changelist now pass shellcheck (v0.61) (driving various minor non-functional edits). |
||
#25 | 25970 | Robert Cowham | Detect and warn on journal corruption | ||
#24 | 25453 | C. Thomas Tyler | Added missing 'exit 1' after failed 'p4d -xvU' check. | ||
#23 | 25206 | C. Thomas Tyler |
Removed logic that uses 'p4d -cset' to force the value for P4JOURNAL, and also automatic journal rotation on server startup. Added related logic to verify_sdp.sh to ensure there is one source of truth for the P4JOURNAL definition. === On Journal Rotation at Server Startup === The goal with journal rotation on server stratup is noble, to make it so any potential journal corruption *always* appears at the end of a numbered journal file, rather than being in the middle of the active journal. This can make it easier and faster to recover from journal corruption caused by sudden power loss, kernel panic, a p4d bug/crash, etc. However, the implementation causes problems (noted below). === On Forcing P4JOURNAL === The goal of forcing the value of P4JOURNAL via db.config is also noble, in that having a value anything other than the SDP standard can really wreak havoc with things. This is generally not an issue in a 'fresh' SDP install, but can be an issue (wreak havoc!) in cases where 'p4 configure' was used to set a value for P4JOURNAL that conflicts with the value defined by the SDP environment mechanism, which is in turn passed to 'p4d' on the command line. Even if the value defined differently, it should be set in to exactly one value, and exactly one mechanism. The current implementation causes problems (noted below). == Problems with setting P4JOURNAL in db.config == 1. Things Break The forced P4JOURNAL set via 'p4d -cset' causes a mild form of journal corruption that breaks 'standby' replicas using journalcopy, as this type of replica is extremely sensitive to the contents of every byte in the journal file, and doesn't allow for use of 'p4d -cset' to modify the P4JOURNAL file. While it does not cause any actual loss of data, it does require manual reset to fix things. In the case of a site-wide topology with a mandatory standby replica, it causes global replication to stall. 2. 
Not our Place (not the place of SDP scripts) Based on the above and taking a step back, I think this script behavior of forcing a back-door journal rotation is simply too intrusive for what SDP scripts should be allowed to do. They live to have some understanding of p4d workings, but shouldn't pretend to have too much insight into the inner workings of p4d. == Problem with Always-On Journal Rotation on Start == 1. What the wah? This confuses admins by incrementing the journal counter unexpectedly. In Battle School training classes, for example, students (Perforce admins) are confused by seemingly random journal incrementing. While this could be documented and trained for, it violates the principle of least surprise, and is not typical 'p4d' behavior. 2. Always vs. Rare It rotates the journal even when there is no corruption, which of course is 99.99999% or more of the time at any given site. Anyone who has been through a corruption scenario is happy to have the corruption at the end rather than in the middle of a journal file -- as noted, the intent here is noble. But before we do any journal rotations, we should detect whether there is corruption. Turns out we have a means to detect journal corruption at the end of the current/active journal file, and should employ such detection and handle it in some appropriate manner, e.g. by expanding the 'force_start' logic in this p4d_base init script. Journal corruption detection and preliminary handling may be added in a future SDP release. When the journal is truly corrupted, global replication will stall in any case, so measures like journal file rotation may be called for in that scenario. 3. Accelerated Deletion of Backups Increased journal counter rotations result in unexpectedly fast removal of backups. Admins are used to thinking that roughly, "one journal rotation is roughly one day." Settings like KEEPLOGS, KEEPCKPS, and KEEPJNLS trigger off the number of journal rotations, not the number of actual calendar days. 
Now, I think it's OK that journal rotations and days don't match precisely. In a typical "big deal" maintenance window, for example, there might be an additional 1-3 journal rotations induced by extra checkpoints or journals being created over the course of a maintenance activity. But there may be a dozen more 'p4d' restarts during sanity testing and playing around with things. With the current logic, each restart causes another journal rotation. By the end of the weekend, your next call to daily_checkpoint might remove more of your recent backups than you'd like or expect. (A long standing safety feature always preserves the last few, but still we don't want to delete more than desired.) === Food for Thought: KEEP* = number of days? === Making it so KEEPLOGS/KEEPJNLS/KEEPCKPS mean literally number of days rather than journal rotations is worthy of consideration. That's beyond the scope of this change though. #review @robert_cowham @josh |
||
#22 | 23755 | Russell C. Jackson (Rusty) | Moved cset of P4JOURNAL so that it is only set on a master or edge server. | ||
#21 | 23428 | Robert Cowham | Fix hang on older p4d when starting | ||
#20 | 23136 | Russell C. Jackson (Rusty) |
Corrected a typo. Changed Shtudown to Shutdown |
||
#19 | 23040 | C. Thomas Tyler |
Enhancements to p4d_base: * Added $LOGS/p4d_init.log to track calls to init. * Increased verbosity somewhat, though the 'p4d -xvu' output still goes only to the p4d_init.log, with an error message referencing that log in event of error. * Replaced old preflight log with standard LOGFILE for all tracking. * Fixed issue starting server due to LOGFILE not being defined; not documented as a bug since it is a fix to unreleased behavior in the last change in the dev branch. * Enhanced to automatically replace '-d' with '--daemonsafe' if p4d 2017.1+ * Enhanced to call p4login rather than duplicating logic in p4login. No rotation is provided for p4d_init.log; it is continuously appended and is expected to grow only a few M per year with typical usage (the bigger the server, the less the growth). Enhancements to all *_base scripts: * Enhanced so default behavior, with no args, is to show a usage message (rather than doing 'status'). |
||
#18 | 22250 | C. Thomas Tyler |
Further refinements to the new 'rotate journal on p4d start' change: * Fixed p4d_truncate_journal so it has less environment dependencies (e.g. doesn't depend on LOGFILE, etc.) and doesn't try sending email. * Introduced msg() and bail(), counterparts to log() and die() which don't try to write to LOGFILE and don't try to send email. * Added call to get_journalnum() before call to p4d_truncate_journal(). * Fixed logic in get_journalnum() so it gets the journal number w/o needing p4d to be up. * I think I fixed the syntax error in bitwise operator check when setting EDGE_SERVER. It works on a non-edge server (sets EDGESERVER=0). For now I have it doing an 'echo EDGESERVER=$EDGESERVER', but need to test that it correctly sets EDGESERVER=1 on an edge server. TO DO: Remove that 'echo EDGESERVER=$EDGESERVER' once we verify it correctly sets the value for $EDGESERVER. (Or not?) |
||
#17 | 22239 | Russell C. Jackson (Rusty) |
Change set_vars to look up the edge server directly in the database so the server does not have to be on-line to check. Fix for Job: SDP-223 |
||
#16 | 22215 | Russell C. Jackson (Rusty) |
First step towards solving corruption problem with journals and replicas. Rotate the journal on the master when starting. That makes it easier to remove any corruption since it will be the last thing in the file. |
||
#15 | 20559 | C. Thomas Tyler |
Fixed recent regression with 'restart' option in init scripts; they now need to pass in the SDP_INSTANCE parameter. Also corrected usage messages. |
||
#14 | 20554 | C. Thomas Tyler | Removed harmless but unnecessary debug message from p4d_base. | ||
#13 | 20491 | C. Thomas Tyler |
Fixed an environment insulation bug in init scripts. More testing (and thinking) revealed that the only way to truly provide a guarantee of insulation from user-set defaults in ~perforce/.bashrc (etc.) is to pass SDP_INSTANCE as a parameter to the *_base scripts, so that the instance name is explicitly passed into the su/exec call (when run as root). This change also includes minor cleanup in init scripts and *_base scripts. |
||
#12 | 20448 | C. Thomas Tyler |
Fixed env bug seen when running init scripts as root. Updated *_base scripts and *_init script templates to a new standard. Goals: 1. Init scripts that use corresponding *_base scripts are minimized so that they do nothing more than set SDP_INSTANCE and then call the corresponding *_base script. 2. The 'su' commands always pass $* instead of $1, deferring all processing to the *_base script. 3. The shell environment is now guaranteed to have the same results regardless of whether it is called as 'root' or as the defined OSUSER. 4. The p4_vars file is always sourced exactly once. Two calls to 'source p4_vars' appear in some cases, one immediately before the su/exec call, and another after the 'su/exec'. Only one or the other is sourced. 5. All init scripts have a reasonably consistent usage message. 6. All init scripts accept a 'status' argument. This change fixes a bug where 'p4broker_N_init status' run as the 'perforce' user would report many pids unrelated to Perforce if run as root, e.g. doing 'service p4broker_N_init status'. This also eliminates a potential issue where the 'perforce' user might source a p4_vars with a default instance in ~/.profile or ~/.bashrc, thus invalidating the instance specified when the user ran the init script as root. |
||
#11 | 20348 | C. Thomas Tyler |
Use pid to shutdown the p4d process. Goals: 1. Make shutdown more standard using kill/SIGTERM. 2. Make shutdown more reliable; no need to be logged in, insulated from P4AUTH, etc. 3. Use the now-built-in pid mechanism. 4. On shutdown, make it so the script doesn't return until the p4d process is well and truly down. This was implemented with some backward-compatibility features to simplify SDP upgrades: 1. Though the p4_N.vars file should add the required '--pid-file' to P4D_FLAGS, this new p4d_base checks and adds it if it is missing. 2. The old 'p4 admin stop' logic is retained as a new 'admin_stop' function, and is used if the server.pid file does not exist when stop is called. |
||
#10 | 20170 | Russell C. Jackson (Rusty) |
Moved password and users into the config directory to allow for instance specific users and passwords. Ran into a case where two different teams were sharing the same server hardware and needed this type of differentiation. Surprised that we haven't hit this sooner. Also defaulted mkdirs to use the numeric ports since this is the most common installation. |
||
#9 | 18686 | Russell C. Jackson (Rusty) |
#REVIEW-18670 Added -cset of P4JOURNAL to the start section so that it is always set to the correct location. |
||
#8 | 17281 | Robert Cowham |
When stopping, send error to /dev/null Remove tabs #review @rjackson @ttyler ttyler: Looks good! |
||
#7 | 16335 | C. Thomas Tyler |
Routine Merge Down to dev from main using: p4 merge -b perforce_software-sdp-dev |
||
#6 | 16029 | C. Thomas Tyler |
Routine merge to dev from main using: p4 merge -b perforce_software-sdp-dev |
||
#5 | 15778 | C. Thomas Tyler | Routine Merge Down to dev from main. | ||
#4 | 13906 | C. Thomas Tyler |
Normalized P4INSTANCE to SDP_INSTANCE to get Unix/Windows implementations in sync. Reasons: 1. Things that interact with SDP in both Unix and Windows environments shouldn't have to account for this obscure SDP difference between Unix and Windows. (I came across this doing CBD work). 2. The Windows and Unix scripts have different variable names for defining the same concept, the SDP instance. Unix uses P4INSTANCE, while Windows uses SDP_INSTANCE. 3. This instance tag, a data set identifier, is an SDP concept. I prefer the SDP_INSTANCE name over P4INSTANCE, so I propose to normalize to SDP_INSTANCE. 4. The P4INSTANCE name makes it look like a setting that might be recognized by the p4d itself, which it is not. (There are other such things such as P4SERVER that could perhaps be renamed as a separate task; but I'm not sure we want to totally disallow the P4 prefix for variable names. It looks too right to be wrong in some cases, like P4BIN and P4DBIN. That's a discussion for another day, outside the scope of this task). Meanwhile: * Fixed a bug in the Windows 2013.3 upgrade script that was referencing undefined P4INSTANCE, as the Windows environment defined only SDP_INSTANCE. * Had P4INSTANCE been removed completely, this change would likely cause trouble for users doing updates for existing SDP installations. So, though it involves slight technical debt, I opted to keep a redundant definition of P4INSTANCE in p4_vars.template, with comments indicating SDP_INSTANCE should be used in favor of P4INSTANCE, with a warning that P4INSTANCE may go away in a future release. This should avoid unnecessary upgrade pain. * In mkdirs.sh, the variable name was INSTANCE rather than SDP_INSTANCE. I changed that as well. That required manual change rather than sub/replace to avoid corrupting other similar variable names (e.g. MASTERINSTANCE). This is a trivial change technically (a substitute/replace, plus tweaks in p4_vars.template), but impacts many files. |
||
#3 | 12169 | Russell C. Jackson (Rusty) |
Updated copyright date to 2015 Updated shell scripts to require an instance parameter to eliminate the need for calling p4master_run. Python and Perl still need it since you have to set the environment for them to run in. Incorporated comments from reviewers. Left the . instead of source as that seems more common in the field and has the same functionality. |
||
#2 | 11493 | Russell C. Jackson (Rusty) | Removed echo for Preflight Check. | ||
#1 | 10638 | C. Thomas Tyler | Populate perforce_software-sdp-dev. | ||
//guest/perforce_software/sdp/main/Server/Unix/p4/common/bin/p4d_base | |||||
#1 | 10148 | C. Thomas Tyler | Promoted the Perforce Server Deployment Package to The Workshop. |