#!/usr/local/bin/perl5
# -*-Fundamental-*-
# $Id: //guest/richard_geiger/utils/snap_checkpoint/snap_checkpoint#1 $
#
# Original Author: Richard Geiger, Network Appliance, Inc.
#

use Carp;
use strict;

$| = 1;

# Configuration Settings
#
# REVIEW THESE SETTINGS, AND ADJUST THEM AS NECESSARY FOR USE IN YOUR
# ENVIRONMENT:
#

# $P4PORT for the server you wish to checkpoint;
#
my $P4PORT = "p4netapp:1678";

# $P4ROOT for the server you wish to checkpoint
#
my $P4ROOT = "/u/p4/root.$P4PORT";

# The path to the "p4" client to be used
#
my $P4 = "/u/p4/VERS/bin.osf/p4";

# The path to the "p4d" server to be used
#
my $P4D = "/u/p4/VERS/bin.osf/p4d";

# The path to the directory where the checkpoint should be written
#
my $P4CHECKPOINT = "/u/p4/checkpoint.$P4PORT";

# The path to the journal file
#
my $P4JOURNAL = "/u/p4/checkpoint.$P4PORT/journal";

# The name of the NetApp filer that holds the volume where
# $P4ROOT is stored
#
my $FILER = "maglite";

# The volume name of the volume where $P4ROOT is stored
#
my $VOLUME = "perforce";

# Path to the host's "rsh" command
#
my $RSH = "/bin/rsh";

# Path to the host's "gzip" command
#
my $GZIP = "/usr/local/bin/gzip";

# The locking order of the db.* files, as of r99.1, per information
# supplied by Perforce Software.
#
# ***** You should confirm the correct order for any other version
# ***** of the Perforce server; if the locking order is not correct,
# ***** it is possible to get into a deadlock situation!
#
my $dbfiles = <<EOL;
db.counters
db.user
db.group
db.depot
db.domain
db.view
db.review
db.have
db.integ
db.locks
db.rev
db.revcx
db.working
db.change
db.desc
db.job
db.jobpend
db.jobdesc
db.fix
db.fixrev
db.boddate
db.bodtext
db.ixdate
db.ixtext
db.protect
db.trigger
EOL

my @dbfiles = split(/\n/, $dbfiles);

use Fcntl ':flock'; # import LOCK_* constants

# Take an exclusive flock() on every file listed in @dbfiles, in list
# order. Each file is opened on a package-global filehandle named after
# the file ("db.user" -> USER) so that p4d_unlock can find it again.
# Dies on the first file that cannot be opened or locked.
#
sub p4d_lock {
    # The handle names are plain strings, i.e. symbolic references.
    no strict 'refs';

    foreach my $file (@dbfiles) {
        my $filepath = "$P4ROOT/$file";

        my $handle = $file;
        $handle =~ s/^db\.//;
        $handle =~ tr/a-z/A-Z/;

        # Open read/write ("+<"): where flock() is emulated with fcntl()
        # (e.g. Solaris), LOCK_EX is refused on a handle that was opened
        # read-only.
        if (! open($handle, "+<$filepath")) {
            die "can't open \"$filepath\": $!";
        }
        if (! flock($handle, LOCK_EX)) {
            die "can't lock \"$filepath\": $!";
        }
    }

    print "$P4ROOT locked.\n";
}

# Release the locks taken by p4d_lock by closing each handle, walking
# @dbfiles in reverse order.
#
sub p4d_unlock {
    no strict 'refs';

    foreach my $file (reverse(@dbfiles)) {
        my $handle = $file;
        $handle =~ s/^db\.//;
        $handle =~ tr/a-z/A-Z/;
        close $handle;
    }

    print "$P4ROOT unlocked.\n";
}

# Run a command, returning status and output; terminate
# on any error.
#
#   $cmd    - shell command line; stderr is merged into stdout.
#   returns - (0, $output); the status is always 0 because any
#             non-zero wait status dies instead of returning.
#
# The command and each line of its output are echoed to stdout.
# (The sub is named "s", so it must be called as &s(...).)
#
sub s {
    my ($cmd) = @_;
    my $output = "";

    print("> $cmd\n");
    if (! open(CMD, "$cmd 2>&1 |")) {
        die "can't open \"$cmd 2>&1 |\": $!";
    }
    while (<CMD>) {
        print(": $_");
        $output .= $_;
    }
    close CMD;    # sets $? to the child's wait status

    my $sts = $?;
    if ($sts) {
        # Low 7 bits of the wait status are the signal number, the
        # high byte is the exit code.
        my $sig = $sts & 0x7f;
        $sts = $sts >> 8;
        die "\"$cmd\" exited with signal $sig status $sts";
    }
    return ($sts, $output);
}

# OK, here's the drill...

# First, look up the journal sequence number counter... Logically,
# we'd prefer to do this with the database locked, but we can't,
# because we have to run "p4 counters", and the danger of a rogue
# checkpoint -jc happening seems tolerable. (After all, this process
# should be the only one intending to do a checkpoint!)
#
my ($sts, $output) = &s("$P4 -p $P4PORT counters");

my $journal_counter = "";
foreach my $line (split(/\n/, $output)) {
    if ($line =~ /^journal = (\d+)/) {
        $journal_counter = $1;
    }
}

if ($journal_counter eq "") {
    die "can't get journal counter, nothing done.";
}

# Now increment the counter
#
$journal_counter++;
($sts, $output) = &s("$P4 -p $P4PORT counter journal $journal_counter");
# chomp, not chop: only strip a trailing newline, never a real character.
chomp $output;
if ($output !~ /^Counter journal set\.$/) {
    die "couldn't increment journal counter:\n$output";
}

# Next, we lock down the entire database
#
&p4d_lock;

# Then: Copy & truncate the journal (since the server is locked, we
# know that nobody will be trying to write it while we do this.)
#
my $journaln = "$P4JOURNAL.$journal_counter";
($sts, $output) = &s("/bin/cp -p $P4JOURNAL $journaln");
if ($sts) {
    die "couldn't copy the journal file.";
}

# Opening for write truncates the live journal to zero length.
if (! open(J, ">$P4JOURNAL")) {
    die "couldn't truncate \"$P4JOURNAL\": $!";
}
close J;

# Now, we snapshot the database filesystem...
#
# First, delete any leftover snapshot named "checkpoint":
#
($sts, $output) = &s("$RSH $FILER snap delete $VOLUME checkpoint 2>&1");
# chomp, not chop: only strip a trailing newline, never a real character.
chomp $output;
if ($sts || ($output !~ /^(deleting snapshot\.+|No such snapshot.)$/)) {
    die "couldn't delete snapshot.";
}

# Now take the snapshot:
#
($sts, $output) = &s("$RSH $FILER snap create $VOLUME checkpoint 2>&1");
chomp $output;
if ($sts || $output !~ /^creating snapshot\.+$/) {
    die "couldn't create snapshot.";
}

# Now we can release the lock...
#
&p4d_unlock;

# At this point, the "live" Perforce server is up and available to
# users. So, now we do the time consuming step, checkpointing from the
# snapshot...
#

# Compress the saved journal segment...
# (Ignore errors - they can be dealt with later)
#
# s() dies when the command fails, so the call is wrapped in eval;
# otherwise a gzip failure would abort before the checkpoint is taken.
eval { &s("$GZIP $journaln"); };
if ($@) {
    print STDERR "WARNING: couldn't compress \"$journaln\": $@";
}

# Diddle $P4ROOT so that this checkpoint is done from the snapshot
# we just took...
#
my $P4ROOT_sav = $P4ROOT;
$P4ROOT .= "/.snapshot/checkpoint";
$ENV{"P4ROOT"} = $P4ROOT;

# Timestamp (YYYYMMDDhhmmss, local time) used to name the checkpoint file.
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
my $tstamp = sprintf("%04d%02d%02d%02d%02d%02d",
                     1900+$year, $mon+1, $mday, $hour, $min, $sec);

($sts, $output) = &s("$P4D -r $P4ROOT -p $P4PORT -z -jd $P4CHECKPOINT/checkpoint.$tstamp.gz");
chomp $output;
# \Q...\E: match the configured path literally rather than as a pattern.
if ($sts || $output !~ /^Dumping to \Q$P4CHECKPOINT\E\/checkpoint\.\Q$tstamp\E\.gz\.\.\.$/) {
    die "checkpoint failed.";
}

# OK, the checkpoint finished; now we can delete the snapshot
# (Or, you might want to keep it around; your call!)
#
# s() dies when the command exits non-zero, so it is called inside eval:
# a failed delete is only worth a warning, since the checkpoint itself
# has already been written.
($sts, $output) = eval { &s("$RSH $FILER snap delete $VOLUME checkpoint 2>&1") };
my $delete_error = $@;
if ($delete_error) {
    # Keep a non-zero exit status so the failure is still visible to
    # whatever invoked this script.
    $sts = 1;
    print STDERR $delete_error;
}
$output = "" unless defined $output;
# chomp, not chop: only strip a trailing newline, never a real character.
chomp $output;
if ($sts || $output !~ /^(deleting snapshot\.+|No such snapshot.)$/) {
    print STDERR "WARNING: couldn't delete snapshot \"checkpoint\".\n";
    # no exit here - next checkpoint will attempt to delete it again.
}

exit $sts;
# | Change | User | Description | Committed | |
---|---|---|---|---|---|
#9 | 1544 | Richard Geiger |
Update to reflect changes in p4d 2002.1: a) The change in the locking order, due to db.changex b) The fix for job006497 |
||
#8 | 942 | Richard Geiger | Use $VOLUME, too! | ||
#7 | 941 | Richard Geiger | Use the $GZIP variable instead of the literal path. | ||
#6 | 920 | Richard Geiger |
add 2001.1 locking order; correct open mode ("+<") for Solaris; some notes; and the "lockcheck" option. |
||
#5 | 437 | Richard Geiger |
Hack to handle r2000.1's newfound reluctance to do "p4 counter journal NNNN". |
||
#4 | 248 | Richard Geiger |
The main change here is to move the copying of the journal file to be done from the checkpoint, outside of the region where the server is locked. This can make the whole thing go much faster when the journal is sizable enough that the copy takes a significant amount of time to happen. |
||
#3 | 246 | Richard Geiger |
Update the script such that we use, verbatim, the p4d_snap_checkpoint function from "p4d_admin", which is the version we're finally really deploying. This should make it much easier to maintain in the future. Also update the html doc to match. |
||
#2 | 239 | Richard Geiger |
- Use LOCK_SH when locking the database - Use ALL CAPS when shunning all responsibility for the thing (Warranty disclaimer) |
||
#1 | 238 | Richard Geiger |
Sample script illustrating how to use Data ONTAP snapshots for a "fast checkpoint", plus accompanying notes |