package VCP::Source::revml ; =head1 NAME VCP::Source::revml - Outputs versioned files to a revml file =head1 SYNOPSIS ## revml output class: vcp foo.revml [dest_spec] vcp foo.revml --uncompress [dest_spec] vcp foo.revml --dtd [dest_spec] vcp foo.revml --version [dest_spec] vcp revml:foo.revml:/foo/bar/... [dest_spec] Where is a filename for input; or missing or '-' for STDIN. =head1 DESCRIPTION This source driver allows L to read a RevML file. For now, all revisions are fully reconstituted in the working directory in order to make sure that all of the patches apply cleanly. This can require a huge amount of disk space, but it works (optimizing this is on the TODO). =head1 OPTIONS =over =item --dtd --dtd=0.33 --version=0.33 Use the indicated DTD version (must be compiled in to VCP) to parse the RevML file. =item --uncompress Force uncompression of gzipped input. If the input file ends in '.gz', the uncompress flag is implied. =back =cut =for DEVELOPER_USE_ONLY To use an alternate DTD: vcp revml[:] --dtd =cut use VCP::Logger qw( pr ); BEGIN { ## Beginning vcpers might try running a command like "vcp" just ## to see what happens. Since RevML is not required for most ## vcp uses and XML::Parser requires a C compiler, vcp is often ## distributed without XML::Parser, so this message is to help ## steer hapless new users to the help system. ## The exit(1) is to avoid untidy and scary ## "compilation failed in BEGIN" messages pr( < is read 'IN_FH', ## The handle of the input revml file 'WORK_NAME', ## The name of the working file (diff or content) 'WORK_FH', ## The filehandle of working file 'REV', ## The VCP::Rev containing all of this rev's meta info 'STACK', ## A stack of currently open elements 'UNDECODED_CONTENT', ## Base64 content waiting to be decoded. 'FILESPEC_RE', ## A perl5 re compiled from $self->repo_filespec 'UNCOMPRESS', ## un-compress gzipped input ) ; #=item new # #Creates a new instance. The only parameter is '-dtd', which overrides #the default DTD found by searching for modules matching RevML::DTD:v*.pm. # #=cut sub new { my $class = shift ; $class = ref $class || $class ; my VCP::Source::revml $self = $class->SUPER::new( @_ ) ; my ( $spec, $options ) = @_ ; $self->parse_revml_repo_spec( $spec ) unless empty $spec; $self->parse_options( $options ); return $self; } sub options_spec { my VCP::Source::revml $self = shift; return ( $self->SUPER::options_spec, 'dtd|version=s' => \$self->{DTD_OPTION}, 'uncompress' => \$self->{UNCOMPRESS}, ); } sub init { my VCP::Source::revml $self = shift ; $self->SUPER::init; $self->{DOCTYPE} = RevML::Doctype->new( $self->{DTD_OPTION} ); ## This supports a brain-dead compatability mode where you can ## "just" say revml:filename.revml or even just filename.revml ## on the command line. The parse routines will stick that ## in repo_filespec and not set the server. If, however, the ## server is set, then the filespec is a pattern we need to use ## to select files. my $file = $self->repo_server; if ( empty $file ) { $self->repo_server( $self->repo_filespec ); $self->repo_filespec( undef ); $file = $self->repo_server; } $file = "-" if empty $file; $file = $self->repo_server( File::Spec->rel2abs( $file, start_dir ) ) unless $file eq "-"; $file = "-" if empty $file; # always un-compress if filename ends in ".gz" my $gzip; if ( $^O =~ /Win32/ ) { $self->{UNCOMPRESS} = 1 if $file =~ /\.gz$/i ; $gzip = "gzip.exe"; } else { $self->{UNCOMPRESS} = 1 if $file =~ /\.gz$/ ; $gzip = "gzip"; } my $fs = $self->repo_filespec; $self->{FILESPEC_RE} = ( ! empty $fs ) ? compile_shellish( $fs ) : qr{^}; if ( $file eq '-' ) { if( $self->{UNCOMPRESS} ) { open( $self->{IN_FH}, "gzip --decompress --stdout - |" ) or die "$!: gzip --decompress --stdout - |"; } else { $self->{IN_FH} = \*STDIN ; } ## TODO: Check IN_FH for writability when it's set to STDIN ## don't you mean readability? } else { require Symbol ; $self->{IN_FH} = Symbol::gensym ; if( $self->{UNCOMPRESS} ) { my $in_name = shell_quote $file; open( $self->{IN_FH}, "gzip --decompress --stdout $in_name |" ) or die "$!: gzip --decompress --stdout $in_name |"; } else { open( $self->{IN_FH}, "<$file" ) or die "$!: $file\n"; } } $self->{WORK_FH} = Symbol::gensym ; return $self ; } sub handle_header { my VCP::Source::revml $self = shift ; ( $self->{HEADER} ) = @_; ## Save this off until we get our first rev from the input $self->revs( VCP::Revs->new ) ; $self->parse_revml_file ; $self->dest->handle_header( $self->{HEADER} ); } sub get_rev { my VCP::Source::revml $self = shift ; my VCP::Rev $r ; ( $r ) = @_ ; die "can't check out ", $r->as_string, "\n" unless defined $r->is_base_rev || $r->action eq "add" || $r->action eq "edit"; return $r->work_path; } sub parse_revml_file { my VCP::Source::revml $self = shift ; my @stack ; $self->{STACK} = \@stack ; my $char_handler = sub { my $expat = shift ; my $pelt = $stack[-1] ; ## parent element my $tag = $pelt->{NAME} ; $pelt->{TEXT} .= $_[0] if exists $pelt->{TEXT} && defined $pelt->{TEXT}; my $sub = "${tag}_characters" ; $self->$sub( @_ ) if $self->can( $sub ) ; } ; my $p = XML::Parser->new( Handlers => { Start => sub { my $expat = shift ; my $tag = shift ; if ( $tag eq "char" ) { while ( @_ ) { my ( $attr, $value ) = ( shift, shift ) ; #print STDERR $value, "=" ; if ( $attr eq "code" ) { if ( $value =~ s{^0x}{} ) { $value = chr( hex( $value ) ) ; } else { $value = chr( $value ) ; } #print STDERR ord $value, "\n" ; $char_handler->( $expat, $value ) ; } } return ; } ## TODO: suss out "container" elements from the doctype. push @stack, { NAME => $tag, @_, ( $self->can( "${tag}_characters" ) || 0 <= index "revml,rev,branch,branches,", $tag . "," ) ? () : ( TEXT => "" ), } ; my $sub = "start_$tag" ; $self->$sub( @_ ) if $self->can( $sub ) ; }, End => sub { my $expat = shift ; my $tag = shift ; return if $tag eq "char" ; #print STDERR "\n" ; die "Unexpected , expected \n" unless $tag eq $stack[-1]->{NAME} ; my $sub = "end_$tag" ; $self->$sub( @_ ) if $self->can( $sub ) ; my $elt = pop @stack ; if ( @stack ) { if ( exists $elt->{TEXT} && defined $elt->{TEXT} ) { ## Save all the meta fields for start_content() or start_diff() if ( $tag eq 'label' ) { push @{$stack[-1]->{labels}}, $elt->{TEXT} ; } elsif ( $stack[-1]->{NAME} eq "revml" ) { die "Header field $tag after first rev\n" if $self->revs->get; ## ASSume none of these occur after first rev. $self->{HEADER}->{$tag} = $elt->{TEXT} ; } else { $stack[-1]->{$tag} = $elt->{TEXT} ; } } else { ## It's a node with child nodes. delete $elt->{NAME}; if ( $tag eq "branch" ) { require VCP::Branches; require VCP::Branch; $stack[-1]->{branches} ||= VCP::Branches->new; $stack[-1]->{branches}->add( VCP::Branch->new( %$elt ) ); } elsif ( $tag eq "branches" ) { $self->{HEADER}->{branches} = $elt->{branches}; } elsif ( $stack[-1]->{NAME} eq "revml" && $tag ne "rev" ) { die "Header field $tag after first rev\n" if $self->revs->get; ## ASSume none of these occur after first rev. $self->{HEADER}->{$tag} = $elt; } else { $stack[-1]->{$tag} = $elt; } } } }, Char => $char_handler, }, ) ; $p->parse( $self->{IN_FH} ) ; } sub start_rev { my VCP::Source::revml $self = shift ; ## Make sure no older rev is lying around to confuse us. $self->{REV} = undef ; } ## RevML is contstrained so that the diff and content tags are after all of ## the meta info for a revision. And we really don't want to hold ## the entire content of a file in memory, in case it's large. So we ## intercept start_content and start_diff and initialize the REV ## member as well as opening a place to catch all of the data that gets ## extracted from the file. sub init_rev_meta { my VCP::Source::revml $self = shift ; my ( $is_placeholder ) = @_; my $rev_elt = $self->{STACK}->[-2] ; my VCP::Rev $r = VCP::Rev->new() ; ## All revml tag naes are lc, all internal data member names are uc #require Data::Dumper ; print Data::Dumper::Dumper( $self->{STACK} ) ; for my $key ( grep /^[a-z_0-9]+$/, keys %$rev_elt ) { if ( $key eq 'labels' ) { $r->set_labels( $rev_elt->{labels} ); } else { ## We know that all kids *in use today* of are pure PCDATA ## Later, we'll need sub-attributes. ## TODO: Flatten the element tree by preficing attribute names ## with '@'?. $r->$key( $rev_elt->{$key} ) ; } } #require Data::Dumper ; print Data::Dumper::Dumper( $r ) ; if ( $is_placeholder ) { $r->action( "placeholder" ); } else { $r->work_path( $self->work_path( $r->name, $r->branch_id || "-", $r->rev_id ) ) ; $self->mkpdir( $r->work_path ) ; } $self->{REV} = $r ; return ; } sub start_delete { my VCP::Source::revml $self = shift ; $self->init_rev_meta ; $self->{REV}->set_action( "delete" ) ; ## Clear the work_path so that VCP::Rev doesn't try to delete it. $self->{REV}->set_work_path( undef ) ; 1; ## prevent void context warning } sub start_move { my VCP::Source::revml $self = shift ; $self->init_rev_meta ; $self->{REV}->set_action( "move" ) ; ## Clear the work_path so that VCP::Rev doesn't try to delete it. $self->{REV}->set_work_path( undef ) ; die " unsupported" ; } sub start_content { my VCP::Source::revml $self = shift ; $self->init_rev_meta ; #require Data::Dumper ; print Data::Dumper::Dumper( $self->{REV} ) ; $self->{REV}->action( "edit" ) ; $self->{WORK_NAME} = $self->{REV}->work_path ; $self->{UNDECODED_CONTENT} = "" ; debug "writing $self->{WORK_NAME}" if debugging; sysopen $self->{WORK_FH}, $self->{WORK_NAME}, O_WRONLY | O_CREAT | O_TRUNC or die "$!: $self->{WORK_NAME}" ; ## The binmode here is to make sure we don't convert \n to \r\n and ## to allow ^Z out the door (^Z is EOF on windows, and they take those ## things rather more seriously there than on Unix). binmode $self->{WORK_FH}; } sub content_characters { my VCP::Source::revml $self = shift ; if ( $self->{STACK}->[-1]->{encoding} eq "base64" ) { $self->{UNDECODED_CONTENT} .= shift ; if ( $self->{UNDECODED_CONTENT} =~ s{(.*\n)}{} ) { syswrite( $self->{WORK_FH}, decode_base64( $1 ) ) or die "$! writing $self->{WORK_NAME}" ; } } elsif ( $self->{STACK}->[-1]->{encoding} eq "none" ) { # print STDERR map( sprintf( " %02x=$_", ord ), $_[0] =~ m/(.)/gs ), "\n" ; syswrite $self->{WORK_FH}, $_[0] or die "$! writing $self->{WORK_NAME}" ; } else { die "unknown encoding '$self->{STACK}->[-1]->{encoding}'\n"; } return ; } sub end_content { my VCP::Source::revml $self = shift ; if ( length $self->{UNDECODED_CONTENT} ) { syswrite( $self->{WORK_FH}, decode_base64( $self->{UNDECODED_CONTENT} ) ) or die "$! writing $self->{WORK_NAME}" ; } close $self->{WORK_FH} or die "$! closing $self->{WORK_NAME}" ; } sub start_delta { my VCP::Source::revml $self = shift ; $self->init_rev_meta ; my $r = $self->{REV} ; $r->action( 'edit' ) ; $self->{WORK_NAME} = $self->work_path( $r->name, $r->branch_id || "-", 'delta' ) ; sysopen $self->{WORK_FH}, $self->{WORK_NAME}, O_WRONLY | O_CREAT | O_TRUNC or die "$!: $self->{WORK_NAME}" ; ## See comment in start_content :) binmode $self->{WORK_FH}; } ## TODO: Could keep deltas in memory if they're small. *delta_characters = \&content_characters ; ## grumble...name used once warning...grumble *delta_characters = \&content_characters ; sub end_delta { my VCP::Source::revml $self = shift ; close $self->{WORK_FH} or die "$! closing $self->{WORK_NAME}" ; #print STDERR `hexdump -cx $self->{WORK_NAME}` ; my VCP::Rev $r = $self->{REV} ; my $abs_name = "$self->{HEADER}->{rev_root}/" . $r->name; if ( $abs_name !~ $self->{FILESPEC_RE} ) { return; } my VCP::Rev $bv_r = $self->revs->get( $r->previous_id ) ; $bv_r = $bv_r->previous while ! defined $bv_r->work_path; die "No original content to patch for ", $r->as_string unless defined $bv_r; if ( -s $self->{WORK_NAME} ) { ## source fn, result fn, patch fn vcp_patch( $bv_r->work_path, $r->work_path, $self->{WORK_NAME} ); unless ( $ENV{VCPNODELETE} ) { unlink $self->{WORK_NAME} or pr "$! unlinking $self->{WORK_NAME}\n" ; } } else { ## TODO: Don't assume working link() debug "linking ", $bv_r->work_path, ", ", $r->work_path if debugging ; link $bv_r->work_path, $r->work_path or die "$! linking ", $bv_r->work_path, ", ", $r->work_path } } ## Convert ISO8601 UTC time to local time since the epoch sub end_time { my VCP::Source::revml $self = shift ; my $timestr = $self->{STACK}->[-1]->{TEXT}; ## TODO: Get parser context here & give file, line, and column. filename ## and rev, while we're scheduling more work for the future. confess "Malformed time value $timestr\n" unless $timestr =~ /^\d\d\d\d(\D\d\d){5}/ ; confess "Non-UTC time value $timestr\n" unless substr $timestr, -1 eq 'Z' ; my @f = split( /\D/, $timestr ) ; --$f[1] ; # Month of year needs to be 0..11 $self->{STACK}->[-1]->{TEXT} = timegm( reverse @f ) ; } # double assign => avoid used once warning *end_mod_time = *end_mod_time = \&end_time ; ## TODO: Verify that we should be using a Base64 encoded MD5 digest, ## according to 's attributes. Oh, and same goes for 's ## encoding. ## TODO: workaround backfilling if the destination is revml, since ## it can't put the original content in place. We'll need to flag ## some kind of special pass-through mode for that. sub end_digest { my VCP::Source::revml $self = shift ; $self->init_rev_meta unless defined $self->{REV} ; my $r = $self->{REV} ; my $abs_name = "$self->{HEADER}->{rev_root}/" . $r->name; if ( $abs_name !~ $self->{FILESPEC_RE} ) { return; } my $original_digest = $self->{STACK}->[-1]->{TEXT}; if ( $r->is_base_rev ) { ## Don't bother checking the digest if the destination returns ## FALSE, meaning that a backfill is not possible with that destination. ## VCP::Dest::revml does this. if ( $self->{HEADER} ) { $self->dest->handle_header( $self->{HEADER} ); $self->{HEADER} = undef; } return unless $self->dest->backfill( $r ) ; } my $work_path = $r->work_path ; my $d = Digest::MD5->new() ; sysopen F, $work_path, O_RDONLY or die "$! opening '$work_path' for digestion\n" ; ## See comment for binmode in start_content :) binmode F; $d->addfile( \*F ) ; close F ; my $reconstituted_digest = $d->b64digest ; ## TODO: provide an option to turn this in to a warning ## TODO: make this abort writing anything to the dest, but continue ## processing, so as to deliver as many error messages as possible. unless ( $original_digest eq $reconstituted_digest ) { my $reject_file_name = $r->name ; $reject_file_name =~ s{[^A-Za-z0-9 -.]+}{-}g ; $reject_file_name =~ s{^-+}{}g ; my $reject_file_path = File::Spec->catfile( File::Spec->tmpdir, $reject_file_name ) ; link $work_path, $reject_file_path or die "digest check failed for ", $r->as_string, "\n", " failed to leave copy in '$reject_file_path': $!\n" ; die "digest check failed for ", $r->as_string, "\n", " copy left in '$reject_file_path'\n", " got digest: $reconstituted_digest\n", " expected digest: $original_digest\n"; } } sub end_placeholder { my VCP::Source::revml $self = shift ; $self->init_rev_meta( "is_placeholder" ); } ## Having this and no sub rev_characters causes the parser to accumulate ## content. sub end_rev { my VCP::Source::revml $self = shift ; my $abs_name = "$self->{HEADER}->{rev_root}/" . $self->{REV}->name; if ( $abs_name !~ $self->{FILESPEC_RE} ) { return; } $self->revs->add( $self->{REV} ); ## This is backwards from the other sources, because RevML files are ## in oldest-first order, while other VCP sources deal with SCMs that ## report file logs in newest-first order. if ( defined $self->{REV}->previous_id ) { $self->{REV}->previous( $self->revs->get( $self->{REV}->previous_id ) ) if defined $self->{REV}->previous_id; } ## Release this rev. $self->{REV} = undef ; } =head1 AUTHOR Barrie Slaymaker =head1 COPYRIGHT Copyright (c) 2000, 2001, 2002 Perforce Software, Inc. All rights reserved. See L (C) for the terms of use. =cut 1 ;