gentrevml #1

#!/usr/local/bin/perl -w

=head2 NAME

gentrevml - Generate a .revml file used by the t/ scripts

=head2 SYNOPSIS

   perl bin/gentrevml --(revml|p4|cvs) [--bootstrap] [--batch=1]

=head2 DESCRIPTION

The test suite uses a bas RevML file to check to see vcp it can copy in to
and out of a repository correctly.  This is done for each repository class.

Note that going through a repository may lose some information, so the
test suite can't always compare the input RevML to the output RevML.

Only the revml->revml case is known to be idempotent.

I chose to do this over using some base repository because not every user
is going to happen to have that repository, and (2) not every repository
will pass through all information correctly.

=head2 COPYRIGHT

Copyright 2000, Perforce Software, Inc.  All Rights Reserved.

This will be licensed under a suitable license at a future date.  Until
then, you may only use this for evaluation purposes.  Besides which, it's
in an early alpha state, so you shouldn't depend on it anyway.

=head2 AUTHOR

Barrie Slaymaker <barries@slaysys.com>

=cut

use Text::Diff ;
use VCP::DiffFormat ;
use Getopt::Long ;
use MIME::Base64 ;

use strict ;

my $which ;
my $debug ;

sub which {
   die "Only one mode allowed\n" if $which ;
   $which = shift ;
}

my $batch ;
my $bootstrap ;

BEGIN {
   ## Need to know how to name the output file before we can
   ## "use RevML::Writer".
   $batch = 0 ;
   Getopt::Long::Configure( qw( no_auto_abbrev no_getopt_compat ) ) ;
   unless (
      GetOptions(
	 'p4'           => \&which,
	 'cvs'          => \&which,
	 'revml'        => \&which,
	 'b|bootstrap'  => \$bootstrap,
	 'batch=i'      => \$batch,
	 'd|debug'      => \$debug,
      )
      && $which
   ) {
      require Pod::Usage ;
      Pod::Usage::pod2usage( exitval => 2, verbose => 3 ) ;
   }
}

if ( $debug ) {
   print STDERR "for $which\n" ;
   print STDERR "bootstrap mode ", $bootstrap ? "on" : "off", "\n" ;
   print STDERR "batch $batch\n" ;
   print STDERR "\n" ;
}

##
## BUILD  THE MANIFEST
##
my @file_names ;
## Put @files in alpha order.  p4 likes to output in alpha order, and this
## makes comparing p4->revml output to revml->p4 input easier.
for my $main (
   qw( a/deeply/buried/file a_big_file add binary del readd ),
#   "spacey file name"
) {
   if ( $main eq "add" || $main eq "del" ) {
      for my $file    ( qw( f1 f2 f3 f4 ) ) {
	 my $fn = "$main/$file" ;
	 next if $fn eq "del/f1" ; # Can't delete in change 1
	 push @file_names, $fn ;
      }
   }
   else {
      push @file_names, $main ;
   }
}


##
## BUILD REVISIONS IN MEMORY
##
my @changes ;

my $binary_counter = 0 ;

{
   my $user_id = "${which}_t_user" ;
   $user_id .= '@p4_t_client' if $which eq 'p4' ;

   my %rev_num ;

   ## We never get around to changes 7..9
   my %deleted_change_num = (
      ## Delete the 'del/f<x>' in change <x>
      ## except you can't delete anything in change 1, eh?
      map( ( "del/f$_" => $_ ), (2..9) ),
   ) ;

   my %created_change_num = (
      ## Add the 'add/f<x>' in change <x>
      map( ( "add/f$_" => $_ ), (1..9) ),
   ) ;

   my $counter = "00" ;

   for my $change_num ( 1..6 ) {
      print STDERR "concocting \@$change_num:\n" if $debug ;
      ## We do the file names in sorted order because going in and out of
      ## some repositories like CVS folds all timestamps in a change to
      ## all be the same time (the cvs commit sets the timestamp), and
      ## we want the revml that comes out to be in the same order
      ## as the revml that went in.
      for my $name ( sort @file_names ) {
	 next
	    if (
	       defined $created_change_num{$name}
	       && $change_num < $created_change_num{$name} 
	    )
	    || (
	       defined $deleted_change_num{$name}
	    && $change_num > $deleted_change_num{$name}
	    ) ;

	 ++$rev_num{$name} ;

	 print STDERR "   $name#$rev_num{$name}:" if $debug ;

	 die "counter too big" if $counter > 254 ;

	 my $content =  $name eq "binary" 
	   ? chr( $binary_counter++ & 0x07 ) x 100
	   : sprintf(
	       qq{%s, revision %d, char 0x%02x="%s"\n},
	       $name,
	       $rev_num{$name},
	       $counter + 1,
	       chr( $counter + 1 ),
	    ) ;

	 $content = $content x 200 if $name eq "a_big_file" ;

	 my $r = {
	    name     => $name,
	    type     => $name eq "binary" ? "binary" : "text",
	    encoding => $name eq "binary" ? "base64" : "none",
	    user_id  => $user_id,
	    content  => $content,
	    time     => "2000-01-01 12:00:${counter}Z",
	    ## In p4, all files in a change number have an identical comment.
	    ## We impose this on the other solutions to test foo->p4 change
	    ## number aggregation.
	    comment  => "comment $change_num\n",
	 } ;

	 ## p4 doesn't handle modtime until very recently, and then it
	 ## doesn't expose it easily.
	 $r->{mod_time} = "2000-01-01 12:01:${counter}Z" unless $which eq 'p4' ;

	 if ( $which eq 'p4' ) {
	    $r->{p4_info}   = "Some info $which might emit about this file" ;
	    $r->{rev_id}    = $rev_num{$name} ;
	    ## In p4, you may have skipped some change numbers
	    $r->{change_id} = ( $r->{rev_id} - 1 ) * 2 + 1 ;
	    ## TODO: Delete this next line when we get VCP::Dest::p4 to sync
	    ## change numbers
	    $r->{change_id} = $change_num ;
	 }
	 elsif ( $which eq 'cvs' ) {
	    $r->{cvs_info}  = "Some info $which might emit about this file" ;
	    $r->{rev_id} = "1.$rev_num{$name}" ;
	    # We provide a change ID to see if the label makes it in and
	    # so that the label can be used to test incremental exports from
	    # cvs.
	    $r->{change_id} = $change_num ;
	 }
	 elsif ( $which eq 'revml' ) {
	    $r->{cvs_info}  ="Some info about this file" ;
	    $r->{rev_id}    = $rev_num{$name} ;
	    $r->{change_id} = $change_num ;
	 }
	 else {
	    die "$which unhandled" ;
	 }

	 if (  defined $deleted_change_num{$name} 
	    && $change_num == $deleted_change_num{$name}
	 ) {
	    $r->{action} = 'delete' ;
	 }
	 elsif ( $rev_num{$name} eq 1 ) {
	    $r->{action} = 'add' ;
	 }
	 elsif ( $name eq "readd" ) {
	     if ( $change_num % 2 ) {
	         ## Add it on the odd numbers
		 $r->{action} = 'add' ;
	     }
	     else {
	         $r->{action} = 'delete' ;
	     }
	 }
	 else {
	    $r->{action} = 'edit' ;
	 }

	 unless ( $r->{action} eq 'delete' || $counter % 2 ) {
	    $r->{labels} = [
		"achoo$counter",
		"blessyou$counter",
	    ] ;
	 }

	 $counter = sprintf "%02d", $counter + 1 ;

	 push @{$changes[$change_num]}, $r ;
	 if ( $debug ) {
	    print STDERR " #$r->{rev_id}" ;
	    print STDERR " \@$r->{change_id})" if defined $r->{change_id} ;
	    print STDERR " ($r->{action})\n" ;
	 }
      }
      print STDERR "\n" if $debug ;
   }
}

## Emit the document

use Digest::MD5 qw( md5_base64 ) ;
use File::Basename ;
use RevML::Doctype 'DEFAULT' ;
use RevML::Writer qw( :all :dtd_tags ) ;

sub _emit_characters {
   my ( $buf ) = @_ ;

   setDataMode( 0 ) ;
   ## note that we don't let XML munge \r to be \n!
   while ( $buf =~ m{\G(?:
      (  [ \x00-\x08\x0b-\x1f\x7f-\xff])
      | ([^\x00-\x08\x0b-\x1f\x7f-\xff]*)
      )}gx
   ) {
      if ( defined $1 ) {
	 char( "", code => sprintf( "0x%02x", ord $1 ) ) ;
      }
      else {
	 characters( $2 ) ;
      }
   }

}


my $prog = basename $0 ;
my $f0 = "$prog.0" ;
my $f1 = "$prog.1" ;

binmode STDOUT ;

setDataMode 1 ;

xmlDecl ;
time '2000-01-01 00:00:00Z' ;
rep_type $which ;
rep_desc 'random text, for now' ;

my %prev ;

## TODO: Branching, moving, and binary files

# This is a bogus rev_root, we set it to see if it gets ignored on
# transfers with a destination rev_root specified on the command line.
# TODO: We should also see what happens when dest no rev_root is specified.
rev_root "foo/bar/bah" ;

## Note the overlapping range here.  Batch 1 (0 or 1) needs to have a digest
## of the rev _before_ the start of the batch unless it's in bootstrap mode.
my @change_nums = (
   ( ! $batch )
      ? (1..3)
   : $bootstrap
      ? (4..6)
      : (3..6)
) ;

## Build @files from @changes.  An older version built revml in change number
## order, but we now built in filename, change number order to make sorting
## of the output of vcp tests in to a predictable order possible.  This is
## because cvs->revml does not result in predictable revml order, so
## all the tests generate revml in name,rev order.
my %revs_by_name ;

for my $change_num ( @change_nums ) {
   for my $rev ( @{$changes[$change_num]} ) {
      push @{$revs_by_name{$rev->{name}}}, $rev ;
   }
}

my @sorted_rev_names = sort {
   my @a = split "/", $a ;
   my @b = split "/", $b ;
   while ( @a && @b ) {
      my $result = shift( @a ) cmp shift( @b ) ;
      return $result if $result ;
   }
   return @a <=> @b ;
} keys %revs_by_name ;

for my $rev_name ( @sorted_rev_names ) {
   print STDERR "emitting $rev_name:\n" if $debug ;

   for my $r ( @{$revs_by_name{$rev_name}} ) {
      my $change_num = $r->{change_id} ;
      my $is_first = $change_num eq $change_nums[0] ;
      my $digest_mode = $is_first && $batch && ! $bootstrap ;

      next if ( $is_first
	 && ( ! $batch || $digest_mode )
	 && $r->{action} eq 'delete'
      ) ;

      print STDERR "   $r->{name}#$r->{rev_id}:" if $debug ;

      my $pr = $prev{$r->{name}} ;

      start_rev ;
      name        $r->{name} ;
      type        $r->{type} ;

      if ( ! $digest_mode ) {
	 p4_info     $r->{p4_info}  if defined $r->{p4_info} ;
	 cvs_info    $r->{cvs_info} if defined $r->{cvs_info} ;
      }

      rev_id      $r->{rev_id} ;
      change_id   $r->{change_id}   if defined $r->{change_id} ;

      my $digestion = 1 ;

      if ( $digest_mode ) {
	 print STDERR " digest" if $debug ;
      }
      else {
	 time      $r->{time} ;
	 mod_time  $r->{mod_time} if defined $r->{mod_time} ;

	 user_id   $r->{user_id} ;

         if ( $r->{labels} ) {
	    label $_ for @{$r->{labels}} ;
	 }

	 ## In p4, all files in a change number have an identical comment.
	 comment $r->{comment} ;

	 if ( $r->{action} eq 'delete' ) {
	    print STDERR " delete" if $debug ;
	    defaultWriter->delete() ;
	    $digestion = 0 ;
	 }
	 else {
	    if ( ! $pr || $r->{encoding} ne "none" ) {
	       print STDERR " content" if $debug ;
	       start_content encoding => $r->{encoding} ;
	       if ( $r->{encoding} eq "none" ) {
		  _emit_characters $r->{content} ;
	       }
	       else {
	          setDataMode( 0 ) ;
	          characters encode_base64 $r->{content} ;
	       }
	       end_content ;
	       setDataMode( 1 ) ;
	    }
	    else {
	       print STDERR " delta" if $debug ;
	       base_rev_id $pr->{rev_id} ;
	       start_delta type => 'diff-u', encoding => 'none' ;
	       _emit_characters(
                  diff \$pr->{content}, \$r->{content}, {
                     STYLE => "VCP::DiffFormat",
                  }
               );
	       end_delta ;
	       setDataMode( 1 ) ;
	    }
	 }
      }
      digest md5_base64( $r->{content} ), type => 'MD5', encoding => 'base64'
         if $digestion ;

      $prev{$r->{name}} = $r->{action} eq "delete" ? undef : $r ;
      if ( $debug ) {
	 print STDERR " #$r->{rev_id}" ;
	 print STDERR " \@$r->{change_id})" if defined $r->{change_id} ;
	 print STDERR " ($r->{action})\n" ;
      }
   }
   print STDERR "\n" if $debug ;
}

END {
   if ( -f $f0 ) {
      unlink $f0 or warn "$!: $f0" ;
   }
   if ( -f $f1 ) {
      unlink $f1 or warn "$!: $f1" ;
   }
}

endAllTags ;

#	Change	User	Description
#1	1375	Sean McCune	Creating my own branch for work on vcp.
//guest/perforce_software/revml/bin/gentrevml
#11	1358	Barrie Slaymaker	Win32 changes
#10	1055	Barrie Slaymaker	add sorting, revamp test suite, misc cleanup. Dest/revml is not portable off my system yet (need to release ...::Diff)
#9	701	Barrie Slaymaker	Fixed VCP::Dest::p4 re-rooting problem, further t/* cleanup
#8	695	Barrie Slaymaker	Cleaned up support for binary files in VCP::Dest::revml and altered test suite to deal with it better. Added some thoughts to the TODO file.
#7	619	Barrie Slaymaker	Avoid using p4 print -s, it puts linebreaks in every 4098 characters or so.
#6	609	Barrie Slaymaker	Add a file to the test procedure that it alternately added and deleted (file is named "readd"). Fixed all destinations to handle that.
#5	608	Barrie Slaymaker	Lots of changes to get vcp to install better, now up to 0.066. Many thanks to Matthew Attaway for testing & suggestions.
#4	480	Barrie Slaymaker	0.06 Wed Dec 20 23:19:15 EST 2000 - bin/vcp: Added --versions, which loads all modules and checks them for a $VERSION and print the results out. This should help with diagnosing out-of-sync modules. - Added $VERSION vars to a few modules :-). Forgot to increment any $VERSION strings. - VCP::Dest::cvs: The directory "deeply" was not being `cvs add`ed on paths like "a/deeply/nested/file", assuming "deeply" had no files in it. - VCP::Dest::revml: fixed a bug that was causing files with a lot of linefeeds to be emitted in base64 instead of deltaed. This means most text files. - Various minor cleanups of diagnostics and error messages, including exposing "Can't locate Foo.pm" when a VCP::Source or VCP::Dest module depends on a module that's not installed, as reported by Jeff Anton.
#3	478	Barrie Slaymaker	0.05 Mon Dec 18 07:27:53 EST 2000 - Use `p4 labels //...@label` command as per Rober Cowham's suggestion, with the '-s' flag recommended by Christopher Siewald and Amaury.FORGEOTDARC@atsm.fr. Though it's actually something like vcp: running /usr/bin/p4 -u safari -c safari -p localhost:5666 -s files //.../NtLkly //...@compiler_a3 //.../NtLkly //...@compiler_may3 and so //on //for 50 parameters to get the speed up. I use the //.../NtLkly "file" as //a separator between the lists of files in various //revisions. Hope nobody has any files named that :-). What I should do is choose a random label that doesn't occur in the labels list, I guess. - VCP::Source::revml and VCP::Dest::revml are now binary, control code, and "hibit ASCII" (I know, that's an oxymoron) clean. The <comment>, <delta>, and <content> elements now escape anything other than tab, line feed, space, or printable chars (32 <= c <= ASCII 126) using a tag like '<char code="0x09">'. The test suite tests all this. Filenames should also be escaped this way, but I didn't get to that. - The decision whether to do deltas or encode the content in base64 is now based on how many characters would need to be escaped. - We now depend on the users' diff program to have a "-a" option to force it to diff even if the files look binary to it. I need to use Diff.pm and adapt it for use on binary data. - VCP::Dest::cvs now makes sure that no two consecutive revisions of the same file have the same mod_time. VCP::Source::p4 got so fast at pulling revisions from the repositories the test suite sets up that CVS was not noticing that files had changed. - VCP::Plugin now allows you to set a list of acceptable result codes, since we now use p4 in ways that make it return non-zero result codes. - VCP::Revs now croaks if you try to add two entries of the same VCP::Rev (ie matching filename and rev_id). - The <type> tag is now limited to "text" or "binary", and is meant to pass that level of info between foreign repositories. - The <p4_info> on each file now carries the one line p4 description of the file so that p4->p4 transferes can pick out the more detailed info. VCP::Source::p4, VCP::Dest::p4 do this. - VCP::{Source,Dest}::{p4,cvs} now set binaryness on added files properly, I think. For p4->p4, the native p4 type is preserved. For CVS sources, seeing the keyword substitution flag 'o' or 'b' implies binaryness, for p4, seeing a filetype like qr/u?x?binary/ or qr/x?tempobj/ or "resource" implies binaryness (to non-p4 destinations). NOTE: Seeing a 'o' or 'b' in a CVS source only ends up setting the 'b' option on the destination. That should be ok for most uses, but we can make it smarter for cvs->cvs transfers if need be.
#2	468	Barrie Slaymaker	- VCP::Dest::p4 now does change number aggregation based on the comment field changing or whenever a new revision of a file with unsubmitted changes shows up on the input stream. Since revisions of files are normally sorted in time order, this should work in a number of cases. I'm sure we'll need to generalize it, perhaps with a time thresholding function. - t/90cvs.t now tests cvs->p4 replication. - VCP::Dest::p4 now doesn't try to `p4 submit` when no changes are pending. - VCP::Rev now prevents the same label from being applied twice to a revision. This was occuring because the "r_1"-style label that gets added to a target revision by VCP::Dest::p4 could duplicate a label "r_1" that happened to already be on a revision. - Added t/00rev.t, the beginnings of a test suite for VCP::Rev. - Tweaked bin/gentrevml to comment revisions with their change number instead of using a unique comment for every revision for non-p4 t/test-*-in-0.revml files. This was necessary to test cvs->p4 functionality.
#1	467	Barrie Slaymaker	Version 0.01, initial checkin in perforce public depot.