# relocate_xhtml.py #1

#             Perforce Defect Tracking Integration Project
#              <http://www.ravenbrook.com/project/p4dti/>
#
#           RELOCATE_XHTML.PY -- MAKE DOCUMENTS RELOCATABLE
#
#             Gareth Rees, Ravenbrook Limited, 2001-07-10
#
#
# 1. INTRODUCTION
#
# This module defines a class that edits links in XHTML documents so
# that a set of documents can be packaged into a distribution, unpacked
# on a random machine, and the links will still work.
#
# The intended readership is project developers.
#
# This document is not confidential.
#
#
# 1.1. What it does
#
# It goes through XHTML documents looking at every 'href' attribute.
# (It does so in a naive way, grepping for href="[^"]+".  I tried the
# approach of analyzing the XHTML directly and so guaranteeing only to
# find the href attributes of anchor tags, but the XML parser in Python
# 2.0 [xml.sax] is not accurate enough: it does not call back with all
# document entities, for example &nbsp; appears to be ignored.)
#
# The xhtml/url rule [GDR 2001-04-22] means that every URL in a
# Ravenbrook document either specifies a method, like
# "http://info.ravenbrook.com/mail/2001/04/18/14-13-41/0.txt" or
# "mailto:gd@ravenbrook.com", or else specifies no method, no host, and
# an absolute path like "/project/p4dti/issue/job000331/".  See [RFC
# 1738] for the specification of URLs.
#
# Case 1. If the URL specifies a method, a host, or no path (for
# example, fragment identifier only) then we leave it unchanged.
#
# Case 2. If the URL specifies a relative path, convert to an absolute
# path and apply case 3 or 4 as appropriate.
#
# Case 3. If the URL names a document that will belong to the
# distribution, we replace the absolute URL by a relative URL that will
# point to the target on the local disk or local web site.  We add
# "index.txt" or "index.html" as appropriate so that the link will
# resolve properly on servers that don't automatically supply index
# files, such as public.perforce.com.  We use ../ to specify the parent
# directory in the URL path, in accordance with [RFC 1808].
#
# Case 4. Otherwise, the URL names a document that will not belong to
# the distribution.  We add the method "http" and the host
# "www.ravenbrook.com" so that readers can get to the document online.
#
#
# 1.2. Terminology
#
# A "file path" (abbreviated to "fp") names a file or directory in the
# file system.  File paths have different conventions on different
# operating systems, for example 'd:\\p4\\project\\p4dti' on Windows,
# '/home/gdr/p4dti' on Unix, or 'Grouse:p4:project:p4dti' on MacOS.
#
# A "URL path" (abbreviated to "up") is the path component of a URL.
# URL paths always use '/' as the separator, regardless of operating
# system.
#
# A "file path list" (abbreviated to "fpl") is a list of components
# making up a file path, with the empty string indicating an empty
# component.
#
# A "URL path list" (abbreviated to "upl") is a list of components
# making up a URL path, with the empty string indicating an empty
# component.
#
#
# 1.3. How to use it
#
# Create a relocater object and pass:
#
#   1. The file path for the root of input to the distribution (no
#      trailing separator).
#   2. The URL path for the root of the distribution (no trailing
#      slash).
#   3. The file path for the root of the output (where converted files
#      are written).  If omitted, this defaults to the input root file
#      path.  No trailing separator.
#
# For example, when building the Integration Kit on Windows, you might
# specify
#
#   from relocate_xhtml import relocater
#   r = relocater('d:\\p4\\project\\p4dti\\version\\1.1',
#                 '/project/p4dti/version/1.1',
#                 'c:\\temp\\build')
#
# Or when building the P4DTI release on Unix, you might specify
#
#   r = relocater('/home/gdr/p4dti/version/1.1/manual',
#                 '/project/p4dti/version/1.1/manual',
#                 '/tmp/build')
#
# Then call the relocate_distribution() method, passing a file path (or
# list of file paths) specifying the distribution.
#
# This program can also be run as a script from the command line.  Use
# these arguments:
#
#   -i, --input   Input root file path.
#   -u, --url     Corresponding URL.
#   -o, --output  Output root file path.
#   -d, --dist    Distribution file path.
#
# You must make sure that all -d paths are below the -i path.  You can
# specify multiple -d options if your distribution isn't a complete
# subtree.


import getopt
import os
import re
import string
import sys
import types
import urlparse


# 2. THE RELOCATER CLASS

class relocater:


    # 2.1. Path variables and initialization
    #
    # A relocater is constructed from the input root file path, the URL
    # path that corresponds to it, an optional output root file path
    # (defaulting to the input root, that is, files are rewritten in
    # place) and the host name used for links that leave the
    # distribution.  None of the roots may end with a separator.

    default_hostname = None  # Host for unhosted URLs.
    dist_fpl_list = None     # File path lists in distribution.
    dist_upl_list = None     # URL path lists in distribution.
    input_fp = None          # Path to file currently being relocated.
    input_up = None          # URL path to current file.
    input_upl = None         # The same, converted to a path list
    output_root_fp = None    # Output root file path.
    root_fp = None           # Input root file path.
    root_fpl = None          # The same, converted to a path list
    root_up = None           # Root URL path for the distribution.
    root_upl = None          # The same, converted to a path list

    def __init__(self, root_fp, root_up, output_root_fp = None,
                 default_hostname = "www.ravenbrook.com"):
        self.root_fp = root_fp
        self.root_fpl = self.fp_to_fpl(root_fp)
        self.root_up = root_up
        self.root_upl = self.up_to_upl(root_up)
        # With no output root the files are modified in place.
        self.output_root_fp = output_root_fp or root_fp
        self.default_hostname = default_hostname

        # Check there's no trailing separator on file paths.
        assert self.root_fpl[-1]
        assert self.fp_to_fpl(self.output_root_fp)[-1]
        # Check there's no trailing slash to the root URL path.
        assert self.root_upl[-1]


    # 2.2. Path conversion utilities

    # Return 1 if the list 'prefix' is an initial segment of the list
    # 'pathlist', 0 otherwise.  (A slice that runs off the end is simply
    # shorter, so the comparison also rejects lists that are too short.)
    def _has_prefix(self, pathlist, prefix):
        if pathlist[0:len(prefix)] == prefix:
            return 1
        return 0

    # Convert a URL path into a list of its components, for example
    # '/project/p4dti/version/1.1/' -> ['', 'project', 'p4dti',
    # 'version', '1.1', ''].
    def up_to_upl(self, up):
        return up.split('/')

    # Convert a list of path components to a URL path, for example ['',
    # 'project', 'p4dti', ''] -> '/project/p4dti/'.
    def upl_to_up(self, upl):
        return '/'.join(upl)

    # Collapse the '.' and '..' components of an absolute URL path list,
    # for example ['', 'a', 'b', '..', 'c'] -> ['', 'a', 'c'].  The
    # leading empty component (the root) is never removed, so surplus
    # '..' components are dropped.  A path that ends in '.' or '..'
    # names a directory, so the result then ends with an empty
    # component.
    def _normalize_upl(self, upl):
        normal = []
        last = len(upl) - 1
        for index in range(len(upl)):
            component = upl[index]
            if component in ('.', '..'):
                if component == '..' and len(normal) > 1:
                    normal.pop()
                if index == last:
                    normal.append('')
            else:
                normal.append(component)
        return normal

    # Convert a file path to a list of its components, for example
    # 'd:\\p4\\project\\p4dti\\' -> ['d:\\', 'p4', 'project', 'p4dti',
    # ''].
    def fp_to_fpl(self, fp):
        pathlist = []
        while 1:
            dirname, basename = os.path.split(fp)
            if dirname == fp:
                # os.path.split can't shorten the path any further: we
                # have reached the root (or the empty string, for a
                # relative path).
                pathlist.insert(0, dirname)
                return pathlist
            pathlist.insert(0, basename)
            fp = dirname

    # Convert a list of path components to a file path, for example
    # ['/', 'home', 'gdr', 'p4dti', 'index.html'] ->
    # '/home/gdr/p4dti/index.html'.
    def fpl_to_fp(self, fpl):
        return os.path.join(*fpl)

    # Convert a file path list to the URL path list that names the same
    # file (based on the correspondence between root_fp and root_up).
    # For example, if
    #
    #   root_fp = 'd:\\p4dti'
    #   root_up = '/project/p4dti/version/1.1'
    #
    # then this method will perform this conversion:
    #
    #   ['d:\\', 'p4dti', 'index.html']
    #   -> ['', 'project', 'p4dti', 'version', '1.1', 'index.html']
    #
    # It is an error if the argument is not under the root file path.
    def fpl_to_upl(self, fpl):
        assert self._has_prefix(fpl, self.root_fpl)
        return self.root_upl + fpl[len(self.root_fpl):]

    # Convert a URL path list to the file path list that names the same
    # file (based on the correspondence between root_fp and root_up),
    # choosing an index file if appropriate and if one exists.  For
    # example, if
    #
    #   root_fp = '/home/gdr/p4dti'
    #   root_up = '/project/p4dti/version/1.1'
    #
    # then this method will perform this conversion:
    #
    #   ['', 'project', 'p4dti', 'version', '1.1', 'manual', 'ag', '']
    #   -> ['/', 'home', 'gdr', 'p4dti', 'manual', 'ag', 'index.html']
    #
    # (provided that index.html exists there; index.txt is tried next,
    # and if neither exists the trailing empty component is kept).
    #
    # It is an error if the argument is not under the root URL path.
    def upl_to_fpl(self, upl):
        assert self._has_prefix(upl, self.root_upl)
        fpl = self.root_fpl + upl[len(self.root_upl):]
        if fpl[-1] == '':
            # The URL names a directory: look for its index file.
            for index_name in ['index.html', 'index.txt']:
                candidate = fpl[0:-1] + [index_name]
                if os.path.isfile(self.fpl_to_fp(candidate)):
                    return candidate
        return fpl

    # Convert file path to URL path.
    def fp_to_up(self, fp):
        return self.upl_to_up(self.fpl_to_upl(self.fp_to_fpl(fp)))

    # Convert URL path to file path.
    def up_to_fp(self, up):
        return self.fpl_to_fp(self.upl_to_fpl(self.up_to_upl(up)))

    # Convert a file path from one root to another.  It is an error if
    # the fp argument isn't under root_fp_1.
    def fp_to_fp(self, fp, root_fp_1, root_fp_2):
        fpl = self.fp_to_fpl(fp)
        root_fpl_1 = self.fp_to_fpl(root_fp_1)
        assert self._has_prefix(fpl, root_fpl_1)
        return self.fpl_to_fp(self.fp_to_fpl(root_fp_2)
                              + fpl[len(root_fpl_1):])


    # 2.3. Make a relative URL path
    #
    # relative_up(source_fp, target_up) returns a URL path that could be
    # inserted into the file named by source_fp and would link to the
    # same target as target_up, specifying an index file if appropriate
    # and one exists.  For example, if we have
    #
    #   root_fp = 'd:\\p4dti'
    #   root_up = '/project/p4dti'
    #   source_fp = 'd:\\p4dti\\version\\1.1\\manual\\ag\\index.html'
    #   target_up is '/project/p4dti/version/1.1/manual/ug/'
    #
    # then this method returns '../ug/index.html'
    def relative_up(self, source_fp, target_up):
        source_fpl = self.fp_to_fpl(source_fp)
        target_fpl = self.upl_to_fpl(self.up_to_upl(target_up))
        # Count the leading components that source and target share.
        common = 0
        limit = min(len(source_fpl), len(target_fpl))
        while common < limit and source_fpl[common] == target_fpl[common]:
            common = common + 1
        # One '..' for each directory left in the source below the
        # common ancestor (the final source component is the file name
        # itself, hence the -1), then down to the target.
        ups = ['..'] * (len(source_fpl) - common - 1)
        return self.upl_to_up(ups + target_fpl[common:])


    # 2.4. Determine if something is in the distribution

    # fp_in_distribution(fp) returns 1 if the file path is in the
    # distribution, 0 otherwise.
    def fp_in_distribution(self, fp):
        fpl = self.fp_to_fpl(fp)
        for dist_fpl in self.dist_fpl_list:
            if self._has_prefix(fpl, dist_fpl):
                return 1
        return 0

    # up_in_distribution(up) returns 1 if the URL path is in the
    # distribution, 0 otherwise.
    def up_in_distribution(self, up):
        upl = self.up_to_upl(up)
        for dist_upl in self.dist_upl_list:
            if self._has_prefix(upl, dist_upl):
                return 1
        return 0


    # 2.5. Replace a URL
    #
    # This method is designed to be used as an argument to the re.sub
    # method.  It takes a match object whose group 0 is 'href="TARGET"'
    # and whose group 1 is the target itself.  It returns replacement
    # text of the form 'href="REVISED-TARGET"'.
    #
    # It reads input_fp and input_upl, so it must only be used while a
    # file is being relocated (see relocate_file), and dist_upl_list,
    # which is set up by relocate_distribution.
    def replace_url(self, match):
        target = list(urlparse.urlparse(match.group(1)))
        if target[0] or target[1] or target[2] == '':
            # Case 1. Target URL specifies a method, a host, or no path.
            # Leave it unchanged.
            return match.group(0)
        if target[2][0] != '/':
            # Case 2. Target URL specifies a relative URL path.  Convert
            # to absolute URL path and continue.  The '.' and '..'
            # components must be collapsed here: otherwise a link that
            # climbs out of the distribution would still start with the
            # distribution's path and be mistaken for case 3.
            target_upl = (self.input_upl[0:-1]
                          + self.up_to_upl(target[2]))
            target[2] = self.upl_to_up(self._normalize_upl(target_upl))
        if self.up_in_distribution(target[2]):
            # Case 3. Target URL names a document that will belong to
            # the distribution.  Replace by relative URL.
            target[2] = self.relative_up(self.input_fp, target[2])
        else:
            # Case 4. Target not in distribution, add method, host.
            target[0] = 'http'
            target[1] = self.default_hostname
        return 'href="%s"' % urlparse.urlunparse(tuple(target))


    # 2.6. Relocate a file
    #
    # relocate_file(fp) rewrites every href in the file named by fp
    # (which must be under root_fp) and writes the result to the
    # corresponding path under output_root_fp, creating directories as
    # needed.  The whole input is read before the output is opened, so
    # input and output may be the same file.

    url_re = re.compile('href="([^"]+)"')

    def relocate_file(self, fp):
        self.input_fp = fp
        self.input_up = self.fp_to_up(fp)
        self.input_upl = self.up_to_upl(self.input_up)
        output_fp = self.fp_to_fp(self.input_fp, self.root_fp,
                                  self.output_root_fp)
        dirname = os.path.dirname(output_fp)
        if dirname and not os.path.isdir(dirname):
            os.makedirs(dirname)
        if self.input_fp != output_fp:
            print("  Converting %s to %s" % (self.input_fp, output_fp))
        else:
            print("  Converting %s" % (self.input_fp,))
        source = open(self.input_fp, 'r')
        try:
            lines = [self.url_re.sub(self.replace_url, line)
                     for line in source.readlines()]
        finally:
            source.close()
        output = open(output_fp, 'w')
        try:
            output.writelines(lines)
        finally:
            output.close()


    # 2.7. Relocate files in a path
    #
    # relocate_path(fp) recursively descends directories below the file
    # path given by fp, relocating all the XHTML files it finds there.
    # A file counts as XHTML if its name ends in '.html' and its first
    # line starts with '<?xml'.

    # Return 1 if the first five characters of the file are '<?xml', 0
    # otherwise.  The file is always closed again.
    def _starts_with_xml_declaration(self, fp):
        stream = open(fp, 'r')
        try:
            if stream.readline()[0:5] == '<?xml':
                return 1
            return 0
        finally:
            stream.close()

    def relocate_path(self, fp):
        if os.path.isdir(fp):
            for name in os.listdir(fp):
                self.relocate_path(os.path.join(fp, name))
        elif (os.path.isfile(fp)
              and os.path.splitext(fp)[1] == '.html'
              and self._starts_with_xml_declaration(fp)):
            self.relocate_file(fp)


    # 2.8. Relocate a distribution.
    #
    # relocate_distribution(fp_list) takes a list (or tuple) of file
    # paths constituting the whole distribution; a single file path is
    # also accepted.  It relocates all the XHTML files below these
    # paths.  Every path must be under root_fp.
    def relocate_distribution(self, fp_list):
        if not isinstance(fp_list, (list, tuple)):
            fp_list = [fp_list]
        # Real lists, because they are scanned once per link.
        self.dist_fpl_list = [self.fp_to_fpl(fp) for fp in fp_list]
        self.dist_upl_list = [self.fpl_to_upl(fpl)
                              for fpl in self.dist_fpl_list]
        for fp in fp_list:
            self.relocate_path(fp)


# 3. COMMAND-LINE INTERFACE

# usage(err) prints the error message err (if one is given) followed by
# a summary of the command-line options on standard output, and then
# exits with status 1.  It never returns.
def usage(err = None):
    if err:
        print(err)
    # The program name is substituted for %s in the first line.
    print("Usage: %s OPTIONS\n"
          "Options: -i, --input   Input root file path.\n"
          "         -u, --url     Corresponding URL.\n"
          "         -o, --output  "
          "Output root file path (defaults to -i).\n"
          "         -d, --dist    "
          "Distribution file path (defaults to -i).\n" % sys.argv[0])
    sys.exit(1)

# run() is the command-line entry point.  It parses the options in
# sys.argv, requires -i and -u, defaults the output root and the
# distribution to the input root, and then relocates the distribution.
# On any option error it prints the usage message and exits (see
# usage).
def run():
    try:
        opts, paths = getopt.getopt(sys.argv[1:], 'i:u:o:d:',
                                    ['input=', 'url=', 'output=',
                                     'dist='])
    except getopt.GetoptError:
        # Unknown option or missing argument: report it with the usage
        # message rather than a traceback.  (sys.exc_info is used so
        # that this works with any version of the except syntax.)
        usage(sys.exc_info()[1])
    root_fp = None
    root_up = None
    output_root_fp = None
    dist_fp_list = []
    for o, a in opts:
        if o in ('-i', '--input'):
            root_fp = a
        elif o in ('-u', '--url'):
            root_up = a
        elif o in ('-o', '--output'):
            output_root_fp = a
        elif o in ('-d', '--dist'):
            dist_fp_list.append(a)
        else:
            usage()
    # Positional arguments are not accepted; -i and -u are mandatory.
    if paths or root_fp is None or root_up is None:
        usage()
    if output_root_fp is None:
        output_root_fp = root_fp
    if not dist_fp_list:
        dist_fp_list.append(root_fp)
    r = relocater(root_fp, root_up, output_root_fp)
    r.relocate_distribution(dist_fp_list)

# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    run()


# A. REFERENCES
#
# [GDR 2001-04-22] "Rules for XHTML documents"; Gareth Rees; Ravenbrook
# Limited; 2001-04-22; <http://info.ravenbrook.com/rule/xhtml/>.
#
# [RFC 1738] "Uniform Resource Locators (URL)"; T Berners-Lee, L
# Masinter, M McCahill; 1994-12;
# <http://src.doc.ic.ac.uk/rfc/rfc1738.txt>.
#
# [RFC 1808] "Relative Uniform Resource Locators"; R Fielding; 1995-06;
# <http://src.doc.ic.ac.uk/rfc/rfc1808.txt>.
#
# [xml.sax] "xml.sax -- Support for SAX2 parsers"; Guido van Rossum;
# 2000-10-16;
# <http://www.python.org/doc/2.0/lib/module-xml.sax.html>.
#
#
# B. DOCUMENT HISTORY
#
# 2001-07-10 GDR Created.
#
#
# C. COPYRIGHT AND LICENCE
#
# This file is copyright (c) 2001 Perforce Software, Inc.  All
# rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1.  Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#
# 2.  Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in
#     the documentation and/or other materials provided with the
#     distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDERS AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
# DAMAGE.
#
#
# $Id: //info.ravenbrook.com/project/p4dti/version/2.0/tool/relocate_xhtml.py#1 $
#	Change	User	Description	Committed
#2	4326	Robert Cowham	Tidied up
#1	4187	Robert Cowham	Initial version of PVCS (now Merant) Tracker Integration.