# Perforce Defect Tracking Integration Project
# <http://www.ravenbrook.com/project/p4dti/>
#
# RELOCATE_XHTML.PY -- MAKE DOCUMENTS RELOCATABLE
#
# Gareth Rees, Ravenbrook Limited, 2001-07-10
#
#
# 1. INTRODUCTION
#
# This module defines a class that edits links in XHTML documents so
# that a set of documents can be packaged into a distribution, unpacked
# on a random machine, and the links will still work.
#
# The intended readership is project developers.
#
# This document is not confidential.
#
#
# 1.1. What it does
#
# It goes through XHTML documents looking at every 'href' attribute.
# (It does so in a naive way, grepping for href="[^"]+". I tried the
# approach of analyzing the XHTML directly and so guaranteeing only to
# find the href attributes of anchor tags, but the XML parser in Python
# 2.0 [xml.sax] is not accurate enough: it does not call back with all
# document entities; some appear to be ignored.)
#
# The xhtml/url rule [GDR 2001-04-22] means that every URL in a
# Ravenbrook document either specifies a method, like
# "http://info.ravenbrook.com/mail/2001/04/18/14-13-41/0.txt" or
# "mailto:gd@ravenbrook.com", or else specifies no method, no host, and
# an absolute path like "/project/p4dti/issue/job000331/". See [RFC
# 1738] for the specification of URLs.
#
# Case 1. If the URL specifies a method, a host, or no path (for
# example, fragment identifier only) then we leave it unchanged.
#
# Case 2. If the URL specifies a relative path, convert to an absolute
# path and apply case 3 or 4 as appropriate.
#
# Case 3. If the URL names a document that will belong to the
# distribution, we replace the absolute URL by a relative URL that will
# point to the target on the local disk or local web site. We add
# "index.txt" or "index.html" as appropriate so that the link will
# resolve properly on servers that don't automatically supply index
# files, such as public.perforce.com. We use ../ to specify the parent
# directory in the URL path, in accordance with [RFC 1808].
#
# Case 4. Otherwise, the URL names a document that will not belong to
# the distribution. We add the method "http" and the host
# "www.ravenbrook.com" so that readers can get to the document online.
#
#
# 1.2. Terminology
#
# A "file path" (abbreviated to "fp") names a file or directory in the
# file system. File paths have different conventions on different
# operating systems, for example 'd:\\p4\\project\\p4dti' on Windows,
# '/home/gdr/p4dti' on Unix, or 'Grouse:p4:project:p4dti' on MacOS.
#
# A "URL path" (abbreviated to "up") is the path component of a URL.
# URL paths always use '/' as the separator, regardless of operating
# system.
#
# A "file path list" (abbreviated to "fpl") is a list of components
# making up a file path, with the empty string indicating an empty
# component.
#
# A "URL path list" (abbreviated to "upl") is a list of components
# making up a URL path, with the empty string indicating an empty
# component.
#
#
# 1.3. How to use it
#
# Create a relocater object and pass:
#
# 1. The file path for the root of input to the distribution (no
# trailing separator).
# 2. The URL path for the root of the distribution (no trailing
# slash).
# 3. The file path for the root of the output (where converted files
# are written). If omitted, this defaults to the input root file
# path. No trailing separator.
#
# For example, when building the Integration Kit on Windows, you might
# specify
#
# from relocate_xhtml import relocater
# r = relocater('d:\\p4\\project\\p4dti\\version\\1.1',
# '/project/p4dti/version/1.1',
# 'c:\\temp\\build')
#
# Or when building the P4DTI release on Unix, you might specify
#
# r = relocater('/home/gdr/p4dti/version/1.1/manual',
# '/project/p4dti/version/1.1/manual',
# '/tmp/build')
#
# Then call the relocate_distribution() method, passing a file path (or
# list of file paths) specifying the distribution.
#
# This program can also be run as a script from the command line. Use
# these arguments:
#
# -i, --input Input root file path.
# -u, --url Corresponding URL.
# -o, --output Output root file path.
# -d, --dist Distribution file path.
#
# You must make sure that all -d paths are below the -i path. You can
# specify multiple -d options if your distribution isn't a complete
# subtree.
import getopt
import os
import re
import string
import sys
import types
try:
    import urlparse
except ImportError:
    # Python 3 moved the urlparse module to urllib.parse.
    from urllib import parse as urlparse
# 2. THE RELOCATER CLASS
class relocater:
    # Rewrites the href targets of XHTML files so that a set of documents
    # can be packaged, unpacked elsewhere, and still link to each other.
    # Construct one with the input root file path and the URL path that
    # names the same place, then call relocate_distribution().


    # 2.1. Path variables and initialization

    default_hostname = None     # Host for unhosted URLs.
    dist_fpl_list = None        # File path lists in distribution.
    dist_upl_list = None        # URL path lists in distribution.
    input_fp = None             # Path to file currently being relocated.
    input_up = None             # URL path to current file.
    input_upl = None            # The same, converted to a path list.
    output_root_fp = None       # Output root file path.
    root_fp = None              # Input root file path.
    root_fpl = None             # The same, converted to a path list.
    root_up = None              # Root URL path for the distribution.
    root_upl = None             # The same, converted to a path list.

    # root_fp is the input root file path and root_up the URL path that
    # corresponds to it (neither with a trailing separator).
    # output_root_fp is where converted files are written; if it is
    # omitted (or empty) files are converted in place under root_fp.
    # default_hostname is the host added to URLs that point outside the
    # distribution.
    def __init__(self, root_fp, root_up, output_root_fp = None,
                 default_hostname = "www.ravenbrook.com"):
        self.root_fp = root_fp
        self.root_fpl = self.fp_to_fpl(root_fp)
        self.root_up = root_up
        self.root_upl = self.up_to_upl(root_up)
        if output_root_fp:
            self.output_root_fp = output_root_fp
        else:
            # Modify in-place.
            self.output_root_fp = root_fp
        self.default_hostname = default_hostname

        # A trailing separator shows up as an empty last component,
        # which would break the prefix comparisons made below.
        assert self.root_fpl[-1], \
               "root file path must not end with a separator"
        assert self.fp_to_fpl(self.output_root_fp)[-1], \
               "output root file path must not end with a separator"
        assert self.root_upl[-1], \
               "root URL path must not end with a slash"


    # 2.2. Path conversion utilities

    # Convert a URL path into a list of its components, for example
    # '/project/p4dti/version/1.1/' -> ['', 'project', 'p4dti',
    # 'version', '1.1', ''].
    def up_to_upl(self, up):
        return up.split('/')

    # Convert a list of path components to a URL path, for example ['',
    # 'project', 'p4dti', ''] -> '/project/p4dti/'.
    def upl_to_up(self, upl):
        return '/'.join(upl)

    # Convert a file path to a list of its components, for example
    # 'd:\\p4\\project\\p4dti\\' -> ['d:\\', 'p4', 'project', 'p4dti',
    # ''].
    def fp_to_fpl(self, fp):
        # Components are collected leaf-first and reversed at the end.
        pathlist = []
        while 1:
            dirname, basename = os.path.split(fp)
            if dirname == fp:
                # os.path.split can't reduce the path any further, so
                # what is left is the root (or '' for a relative path).
                pathlist.append(dirname)
                break
            pathlist.append(basename)
            fp = dirname
        pathlist.reverse()
        return pathlist

    # Convert a list of path components to a file path, for example
    # ['/', 'home', 'gdr', 'p4dti', 'index.html'] ->
    # '/home/gdr/p4dti/index.html'.
    def fpl_to_fp(self, fpl):
        return os.path.join(*fpl)

    # Convert a file path list to the URL path list that names the same
    # file (based on the correspondence between root_fp and root_up).
    # For example, if
    #
    #   root_fp = 'd:\\p4dti'
    #   root_up = '/project/p4dti/version/1.1'
    #
    # then this method will perform this conversion:
    #
    #   ['d:\\', 'p4dti', 'index.html']
    #   -> ['', 'project', 'p4dti', 'version', '1.1', 'index.html']
    #
    # It is an error if the argument is not under the root file path.
    def fpl_to_upl(self, fpl):
        l = len(self.root_fpl)
        assert len(fpl) >= l and fpl[0:l] == self.root_fpl, \
               "file path is not under the root file path"
        return self.root_upl + fpl[l:]

    # Convert a URL path list to the file path list that names the same
    # file (based on the correspondence between root_fp and root_up),
    # choosing an index file if appropriate and if one exists.  For
    # example, if
    #
    #   root_fp = '/home/gdr/p4dti'
    #   root_up = '/project/p4dti/version/1.1'
    #
    # then this method will perform this conversion:
    #
    #   ['', 'project', 'p4dti', 'version', '1.1', 'manual', 'ag', '']
    #   -> ['/', 'home', 'gdr', 'p4dti', 'manual', 'ag', 'index.html']
    #
    # It is an error if the argument is not under the root URL path.
    def upl_to_fpl(self, upl):
        l = len(self.root_upl)
        assert len(upl) >= l and upl[0:l] == self.root_upl, \
               "URL path is not under the root URL path"
        fpl = self.root_fpl + upl[l:]
        if fpl[-1] == '':
            # The URL names a directory: use the first index file that
            # actually exists on disk, preferring index.html.
            for f in ['index.html', 'index.txt']:
                fp = self.fpl_to_fp(fpl[0:-1] + [f])
                if os.path.isfile(fp):
                    fpl[-1] = f
                    return fpl
        return fpl

    # Convert file path to URL path.
    def fp_to_up(self, fp):
        return self.upl_to_up(self.fpl_to_upl(self.fp_to_fpl(fp)))

    # Convert URL path to file path.
    def up_to_fp(self, up):
        return self.fpl_to_fp(self.upl_to_fpl(self.up_to_upl(up)))

    # Convert a file path from one root to another.  It is an error if
    # the fp argument isn't under root_fp_1.
    def fp_to_fp(self, fp, root_fp_1, root_fp_2):
        fpl = self.fp_to_fpl(fp)
        root_fpl_1 = self.fp_to_fpl(root_fp_1)
        l = len(root_fpl_1)
        assert len(fpl) >= l and fpl[0:l] == root_fpl_1, \
               "file path is not under the source root"
        return self.fpl_to_fp(self.fp_to_fpl(root_fp_2) + fpl[l:])

    # Resolve '.' and '..' components in an absolute URL path list, for
    # example ['', 'a', 'b', '..', 'c', ''] -> ['', 'a', 'c', ''].  A
    # '..' never climbs above the root, and a trailing '.' or '..'
    # leaves a trailing empty component because it names a directory.
    def _normalize_upl(self, upl):
        result = []
        last = len(upl) - 1
        for i in range(len(upl)):
            component = upl[i]
            if component == '.' or component == '..':
                # Keep the leading '' that marks the path as absolute.
                if component == '..' and len(result) > 1:
                    result.pop()
                if i == last:
                    result.append('')
            else:
                result.append(component)
        return result


    # 2.3. Make a relative URL path
    #
    # relative_up(source_fp, target_up) returns a URL path that could be
    # inserted into the file named by source_fp and would link to the
    # same target as target_up, specifying an index file if appropriate
    # and one exists.  For example, if we have
    #
    #   root_fp = 'd:\\p4dti'
    #   root_up = '/project/p4dti'
    #   source_fp = 'd:\\p4dti\\version\\1.1\\manual\\ag\\index.html'
    #   target_up is '/project/p4dti/version/1.1/manual/ug/'
    #
    # then this method returns '../ug/index.html'

    def relative_up(self, source_fp, target_up):
        source_fpl = self.fp_to_fpl(source_fp)
        target_fpl = self.upl_to_fpl(self.up_to_upl(target_up))

        # Count the leading components that source and target share.
        common = 0
        limit = min(len(source_fpl), len(target_fpl))
        while (common < limit
               and source_fpl[common] == target_fpl[common]):
            common = common + 1

        # One '..' for each directory left on the source side; the last
        # source component is the file itself, hence the -1.  (A
        # negative count simply gives no '..' components.)
        ups = len(source_fpl) - common - 1
        relative_upl = ['..'] * ups + target_fpl[common:]
        return self.upl_to_up(relative_upl)


    # 2.4. Determine if something is in the distribution

    # fp_in_distribution(fp) returns 1 if the file path is in the
    # distribution, 0 otherwise.
    def fp_in_distribution(self, fp):
        fpl = self.fp_to_fpl(fp)
        for dist_fpl in self.dist_fpl_list:
            l = len(dist_fpl)
            if len(fpl) >= l and fpl[0:l] == dist_fpl:
                return 1
        return 0

    # up_in_distribution(up) returns 1 if the URL path is in the
    # distribution, 0 otherwise.
    def up_in_distribution(self, up):
        upl = self.up_to_upl(up)
        for dist_upl in self.dist_upl_list:
            l = len(dist_upl)
            if len(upl) >= l and upl[0:l] == dist_upl:
                return 1
        return 0


    # 2.5. Replace a URL
    #
    # This method is designed to be used as an argument to the re.sub
    # method.  It takes a match object whose group 0 is 'href="TARGET"'
    # and whose group 1 is the target itself.  It returns replacement
    # text of the form 'href="REVISED-TARGET"'.  It reads input_fp and
    # input_upl, which relocate_file() sets for the current file.

    def replace_url(self, match):
        # target is [scheme, host, path, params, query, fragment].
        target = list(urlparse.urlparse(match.group(1)))
        path = target[2]
        if target[0] or target[1] or path == '':
            # Case 1. Target URL specifies a method, a host, or no path.
            # Leave it unchanged.
            return match.group(0)

        if path[0] != '/':
            # Case 2. Target URL specifies a relative URL path.
            # Convert to absolute URL path and continue.  The '..' and
            # '.' components must be resolved first: otherwise a link
            # such as '../ug/' written in a distributed directory still
            # has that directory as a prefix and is wrongly judged to
            # be in the distribution.
            target_upl = self._normalize_upl(self.input_upl[0:-1]
                                             + self.up_to_upl(path))
            path = self.upl_to_up(target_upl)

        if self.up_in_distribution(path):
            # Case 3. Target URL names a document that will belong
            # to the distribution.  Replace by relative URL.
            target[2] = self.relative_up(self.input_fp, path)
        else:
            # Case 4. Target not in distribution, add method, host.
            target[0] = 'http'
            target[1] = self.default_hostname
            target[2] = path
        new_url = urlparse.urlunparse(tuple(target))
        return ('href="%s"' % new_url)


    # 2.6. Relocate a file
    #
    # relocate_file(fp) rewrites every href in the file named by fp and
    # writes the result to the corresponding path under the output root,
    # creating directories as needed.  When the output root is the input
    # root the file is overwritten in place.

    url_re = re.compile('href="([^"]+)"')

    def relocate_file(self, fp):
        self.input_fp = fp
        self.input_up = self.fp_to_up(fp)
        self.input_upl = self.up_to_upl(self.input_up)
        output_fp = self.fp_to_fp(self.input_fp, self.root_fp,
                                  self.output_root_fp)
        dirname = os.path.dirname(output_fp)
        if dirname and not os.path.isdir(dirname):
            os.makedirs(dirname)
        if self.input_fp != output_fp:
            print(" Converting %s to %s" % (self.input_fp, output_fp))
        else:
            print(" Converting %s" % (self.input_fp,))

        # Read and convert everything before opening the output: the
        # output may be the very same file.
        input = open(self.input_fp, 'r')
        try:
            lines = [self.url_re.sub(self.replace_url, line)
                     for line in input.readlines()]
        finally:
            input.close()
        output = open(output_fp, 'w')
        try:
            output.writelines(lines)
        finally:
            output.close()


    # 2.7. Relocate files in a path
    #
    # relocate_path(fp) recursively descends directories below the file
    # path given by fp, relocating all the XHTML files it finds there.
    # A file counts as XHTML if its name ends in '.html' and its first
    # line starts with '<?xml'.

    def relocate_path(self, fp):
        if os.path.isdir(fp):
            for f in os.listdir(fp):
                self.relocate_path(os.path.join(fp, f))
        elif (os.path.isfile(fp)
              and os.path.splitext(fp)[1] == '.html'
              and self._starts_with_xml_declaration(fp)):
            self.relocate_file(fp)

    # Return true if the first five characters of the file are '<?xml'.
    def _starts_with_xml_declaration(self, fp):
        stream = open(fp, 'r')
        try:
            return stream.readline()[0:5] == '<?xml'
        finally:
            stream.close()


    # 2.8. Relocate a distribution.
    #
    # relocate_distribution(fp_list) takes a list of file paths
    # constituting the whole distribution (a single file path is also
    # accepted).  It relocates all the XHTML files below these paths.
    # Every path must be under the input root file path.

    def relocate_distribution(self, fp_list):
        if not isinstance(fp_list, (list, tuple)):
            fp_list = [fp_list]
        # Real lists are needed: they are scanned once per href.
        self.dist_fpl_list = [self.fp_to_fpl(fp) for fp in fp_list]
        self.dist_upl_list = [self.fpl_to_upl(fpl)
                              for fpl in self.dist_fpl_list]
        for fp in fp_list:
            self.relocate_path(fp)
# 3. COMMAND-LINE INTERFACE
# usage(err) prints the optional error message err followed by a summary
# of the command-line options, then exits the program with status 1.
def usage(err = None):
    if err:
        print(err)
    # The program name is substituted for %s; previously the format
    # string was printed as-is and showed a literal "%s".
    print("Usage: %s OPTIONS\n"
          "Options: -i, --input Input root file path.\n"
          " -u, --url Corresponding URL.\n"
          " -o, --output "
          "Output root file path (defaults to -i).\n"
          " -d, --dist "
          "Distribution file path (defaults to -i).\n"
          % (sys.argv[0],))
    sys.exit(1)
# run(argv) parses the command-line options, builds a relocater and
# relocates the distribution.  argv is the list of arguments without the
# program name; it defaults to sys.argv[1:].  Bad or missing options
# print the usage message and exit with status 1 (via usage()).
def run(argv = None):
    if argv is None:
        argv = sys.argv[1:]
    try:
        # The long option used to be misspelled 'ouput=', which made
        # getopt reject the documented --output option.
        opts, paths = getopt.getopt(argv, 'i:u:o:d:',
                                    ['input=', 'url=', 'output=',
                                     'dist='])
    except getopt.GetoptError:
        # Report unknown options and missing arguments through the
        # usage message instead of a traceback.  sys.exc_info() is used
        # because the "except ... as" spelling differs between Python
        # versions.
        usage(str(sys.exc_info()[1]))
    root_fp = None
    root_up = None
    output_root_fp = None
    dist_fp_list = []
    for o, a in opts:
        if o in ('-i', '--input'):
            root_fp = a
        elif o in ('-u', '--url'):
            root_up = a
        elif o in ('-o', '--output'):
            output_root_fp = a
        elif o in ('-d', '--dist'):
            dist_fp_list.append(a)
        else:
            usage()
    # Positional arguments are not accepted; -i and -u are mandatory.
    if paths or root_fp is None or root_up is None:
        usage()
    if output_root_fp is None:
        output_root_fp = root_fp
    if not dist_fp_list:
        # No -d given: the whole input tree is the distribution.
        dist_fp_list.append(root_fp)
    r = relocater(root_fp, root_up, output_root_fp)
    r.relocate_distribution(dist_fp_list)
# Run the command-line interface only when executed as a script, not
# when imported as a module.
if __name__ == '__main__':
    run()
# A. REFERENCES
#
# [GDR 2001-04-22] "Rules for XHTML documents"; Gareth Rees; Ravenbrook
# Limited; 2001-04-22; <http://info.ravenbrook.com/rule/xhtml/>.
#
# [RFC 1738] "Uniform Resource Locators (URL)"; T Berners-Lee, L
# Masinter, M McCahill; 1994-12;
# <http://src.doc.ic.ac.uk/rfc/rfc1738.txt>.
#
# [RFC 1808] "Relative Uniform Resource Locators"; R Fielding; 1995-06;
# <http://src.doc.ic.ac.uk/rfc/rfc1808.txt>.
#
# [xml.sax] "xml.sax -- Support for SAX2 parsers"; Guido van Rossum;
# 2000-10-16;
# <http://www.python.org/doc/2.0/lib/module-xml.sax.html>.
#
#
# B. DOCUMENT HISTORY
#
# 2001-07-10 GDR Created.
#
#
# C. COPYRIGHT AND LICENCE
#
# This file is copyright (c) 2001 Perforce Software, Inc. All
# rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDERS AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
# DAMAGE.
#
#
# $Id: //info.ravenbrook.com/project/p4dti/version/2.0/tool/relocate_xhtml.py#1 $