check_xhtml.py #1

#             Perforce Defect Tracking Integration Project
#              <http://www.ravenbrook.com/project/p4dti/>
#
#                CHECK_XHTML.PY -- CHECK XHTML DOCUMENT
#
#             Gareth Rees, Ravenbrook Limited, 2001-04-30
#
#
# 1. INTRODUCTION
#
# This Python script checks that XHTML documents conform to the XHTML
# 1.0 Transitional specification [XHTML 1.0] and to the Ravenbrook
# document rules [Rules].
#
# The intended readership is Ravenbrook staff.
#
# This document is not confidential.
#
#
# 1.1. Use
#
# This module is intended for use in two circumstances.  It may be run
# from the command line, passing a list of paths.  It then checks those
# paths and writes its output to stderr.  This is convenient for use in
# Emacs under M-x compile, for then the errors can be browsed using
# `next-error' (C-x `).
#
# It may also be run from other Python programs, which should
# instantiate an object from the checker class, and call the check()
# method, passing appropriate paths, or the check_stream() method,
# passing a file stream.  When constructing the checker object, you may
# supply an error stream object with a write() method: all error
# messages will be written to this object and you may then divert them
# for example to unittest's fail() method.

import dircache
import getopt
import os
import re
import string
import sys
import types
import xml.sax


# 2. XHTML DEFINITION
#
# This section defines a bunch of tables that describe the XHTML 1.0
# Transitional document type [XHTML 1.0 DTD].  It would be nice to
# generate these tables automatically by parsing the document type
# definition, but as far as I know there's no DTD parser for XML, and I
# didn't want to write one just for the purpose of checking one document
# type.
#
# Instead, these tables have been derived mechanically (typically using
# Emacs Lisp) from the XHTML DTD [XHTML 1.0 DTD].
#
# In any case, there are a number of constraints on XHTML documents that
# are not specified in the DTD; see [XHTML 1.0].  So parsing the DTD
# wouldn't be the whole story.
#
#
# 2.1. Useful element contents
#
# These variables contain lists of elements: they will be used in the
# element definitions [2.2] to define the set of elements that may
# legally appear in the content of each XHTML element.

# Basic groups of element names.  Order within each list is kept stable
# so that the derived lists below are reproducible.
elt_special = [
    "br", "span", "bdo", "object", "applet", "img", "map", "iframe",
]
elt_fontstyle = [
    "tt", "i", "b", "big", "small", "u", "s", "strike", "font",
    "basefont",
]
elt_phrase = [
    "em", "strong", "dfn", "code", "q", "sub", "sup", "samp", "kbd",
    "var", "cite", "abbr", "acronym",
]
elt_inline_forms = ["input", "select", "textarea", "label", "button"]
elt_misc = ["ins", "del", "script", "noscript"]
elt_heading = ["h1", "h2", "h3", "h4", "h5", "h6"]
elt_lists = ["ul", "ol", "dl", "menu", "dir"]
elt_blocktext = [
    "pre", "hr", "blockquote", "address", "center", "noframes",
]
elt_head_misc = ["script", "style", "meta", "link", "object", "isindex"]

# Derived groups.  Note that elt_inline (element names only) and
# elt_Inline (a content model including #PCDATA) differ only in case.
elt_inline = (
    ["a"] + elt_special + elt_fontstyle + elt_phrase + elt_inline_forms
)
elt_Inline = ["#PCDATA"] + elt_inline + elt_misc
elt_block = (
    ["p", "div", "isindex", "fieldset", "table"]
    + elt_heading
    + elt_lists
    + elt_blocktext
)
elt_Block = elt_block + ["form"] + elt_misc
elt_Flow = ["#PCDATA", "form"] + elt_block + elt_inline + elt_misc

# Content of <a>: everything in elt_Inline except <a> itself, which is
# the first entry of elt_inline.
elt_a_content = ["#PCDATA"] + elt_inline[1:] + elt_misc
elt_pre_content = (
    ["#PCDATA", "a", "br", "span", "bdo", "map", "tt", "i", "b", "u",
     "s"]
    + elt_phrase
    + elt_inline_forms
)
elt_form_content = ["#PCDATA"] + elt_block + elt_inline + elt_misc
elt_button_content = (
    ["#PCDATA", "p", "div", "table", "br", "span", "bdo", "object",
     "applet", "img", "map"]
    + elt_heading
    + elt_lists
    + elt_blocktext
    + elt_fontstyle
    + elt_phrase
    + elt_misc
)


# 2.2. Elements definitions
#
# The legal_elements dictionary maps the name of an XHTML element to the
# list of elements that are legal members of that element.

def _build_legal_elements():
    # Build the element -> legal contents table group by group rather
    # than as one long literal.  Elements sharing elt_Inline or elt_Flow
    # share the same list object; every other entry gets its own list.
    table = {}

    # Inline content.
    for name in ['abbr', 'acronym', 'address', 'b', 'bdo', 'big',
                 'caption', 'cite', 'code', 'dfn', 'dt', 'em', 'font',
                 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'i', 'kbd',
                 'label', 'legend', 'p', 'q', 's', 'samp', 'small',
                 'span', 'strike', 'strong', 'sub', 'sup', 'tt', 'u',
                 'var']:
        table[name] = elt_Inline

    # Flow content.
    for name in ['blockquote', 'body', 'center', 'dd', 'del', 'div',
                 'iframe', 'ins', 'li', 'noframes', 'noscript', 'td',
                 'th']:
        table[name] = elt_Flow

    # Elements that may not contain anything.
    for name in ['area', 'base', 'basefont', 'br', 'col', 'hr', 'img',
                 'input', 'isindex', 'link', 'meta', 'param']:
        table[name] = []

    # Elements containing character data only.
    for name in ['option', 'script', 'style', 'textarea', 'title']:
        table[name] = ['#PCDATA']

    # The list containers hold <li> only.
    for name in ['dir', 'menu', 'ol', 'ul']:
        table[name] = ['li']

    # The table row groups hold <tr> only.
    for name in ['tbody', 'tfoot', 'thead']:
        table[name] = ['tr']

    # <applet> and <object> allow <param> plus flow-like content.
    for name in ['applet', 'object']:
        table[name] = (['#PCDATA', 'param', 'form'] + elt_block
                       + elt_inline + elt_misc)

    # Everything else has a content model of its own.
    table['a'] = elt_a_content
    table['button'] = elt_button_content
    table['colgroup'] = ['col']
    table['dl'] = ['dt', 'dd']
    table['fieldset'] = (['#PCDATA', 'legend', 'form'] + elt_block
                         + elt_inline + elt_misc)
    table['form'] = elt_form_content
    table['head'] = elt_head_misc + ['title', 'base']
    table['html'] = ['head', 'body']
    table['map'] = ['form', 'area'] + elt_block + elt_misc
    table['optgroup'] = ['option']
    table['pre'] = elt_pre_content
    table['select'] = ['optgroup', 'option']
    table['table'] = ['caption', 'col', 'colgroup', 'thead', 'tfoot',
                      'tbody', 'tr']
    table['tr'] = ['th', 'td']
    return table

legal_elements = _build_legal_elements()
del _build_legal_elements


# 2.3. Nonempty elements
#
# The nonempty_elements list names the elements that may not be empty
# (that is, they must have at least one element in their contents): for
# example <ul> must contain at least one <li>, and <optgroup> must
# contain at least one <option>.

# Elements that must contain at least one child element.
nonempty_elements = [
    "dir",
    "dl",
    "head",
    "html",
    "map",
    "menu",
    "ol",
    "optgroup",
    "select",
    "table",
    "tbody",
    "tfoot",
    "thead",
    "tr",
    "ul",
]


# 2.4. Other constraints
#
# There are constraints on <head>, <html>, <map> and <table> which are
# not specified in [2.2] and [2.3] (that is, the constraints can't be
# expressed as a combination of "these elements are legal contents" and
# "the contents must not be empty").  The constraints dictionary maps
# element name to a pair consisting of the element definition from
# [XHTML 1.0 DTD], and a regular expression that matches legal contents
# of the element, when the content elements have '<' appended and are
# joined together.  The reason for using '<' as the terminator is that
# it may not appear in an element name.

# Each value is a pair (DEFINITION, REGEXP).  DEFINITION is the content
# model as quoted in error messages.  REGEXP is matched (with re.match)
# against the names of the element's children, each followed by '<'.
#
# Every REGEXP must be anchored at both ends as a whole.  In particular
# the alternatives of the <map> pattern are wrapped in a single group:
# written as "^(...)+|(area<)+$" the first alternative carries no '$',
# so any content that merely started with a block element (for example
# "p<area<") was accepted.
constraints = {
    'html':   ( "(head, body)",
                "^head<body<$" ),
    'head':   ( "(%head.misc;, "
                "((title, %head.misc;, (base, %head.misc;)?) "
                "| (base, %head.misc;, (title, %head.misc;))))",
                "^(script<|style<|meta<|link<|object<|isindex<)*"
                "(title<(script<|style<|meta<|link<|object<|isindex<)*"
                "(base<)?|base<(script<|style<|meta<|link<|object<|"
                "isindex<)*title<)(script<|style<|meta<|link<|object<|"
                "isindex<)*$" ),
    'map':   ( "((%block; | form | %misc;)+ | area+)",
               "^((p<|div<|isindex<|fieldset<|table<|h1<|h2<|h3<|h4<|"
               "h5<|h6<|ul<|ol<|dl<|menu<|dir<|pre<|hr<|blockquote<|"
               "address<|center<|noframes<|form<|ins<|del<|script<|"
               "noscript<)+|(area<)+)$" ),
    'table': ( "(caption?, (col*|colgroup*), thead?, tfoot?, "
               "(tbody+|tr+))",
               "^(caption<)?((col<)*|(colgroup<)*)(thead<)?(tfoot<)?"
               "((tbody<)+|(tr<)+)$" ),
    }

# The illegal_ancestors dictionary is a map from element name to a list
# of elements that the element may not be found in (no matter how deep
# in the document tree).  This list is derived from [XHTML 1.0, B].

# Keys are elements; each value lists the ancestors the element must not
# have.  Entries are grouped here by the ancestor that forbids them.
illegal_ancestors = {
    # No nested anchors.
    "a": ["a"],

    # Not allowed anywhere inside <pre>.
    "img": ["pre"],
    "object": ["pre"],
    "big": ["pre"],
    "small": ["pre"],
    "sub": ["pre"],
    "sup": ["pre"],

    # Not allowed anywhere inside <button>.
    "input": ["button"],
    "select": ["button"],
    "textarea": ["button"],
    "button": ["button"],
    "fieldset": ["button"],
    "iframe": ["button"],
    "isindex": ["button"],

    # Not allowed inside <button>, nor inside another of themselves.
    "label": ["button", "label"],
    "form": ["button", "form"],
}


# 2.5. Attribute definitions
#
# These variables define sets of attributes that are common to a number
# of elements.  The variables will be used to help build the attributes
# table in [2.6] below.

# Every entry below is a triple (NAME, TYPE, DISPOSITION): TYPE is a
# type name or a list of permitted values; DISPOSITION 0 means optional.

# Horizontal alignment of table cells.
attrs_cellhalign = [
    ("align", ["left", "center", "right", "justify", "char"], 0),
    ("char", "Character", 0),
    ("charoff", "Length", 0),
]

# Vertical alignment of table cells.
attrs_cellvalign = [("valign", ["top", "middle", "bottom", "baseline"], 0)]

# Core attributes.
attrs_coreattrs = [
    ("id", "ID", 0),
    ("class", "Class", 0),
    ("style", "StyleSheet", 0),
    ("title", "Text", 0),
]

# Mouse and keyboard event handlers.
attrs_events = [
    ("onclick", "Script", 0),
    ("ondblclick", "Script", 0),
    ("onmousedown", "Script", 0),
    ("onmouseup", "Script", 0),
    ("onmouseover", "Script", 0),
    ("onmousemove", "Script", 0),
    ("onmouseout", "Script", 0),
    ("onkeypress", "Script", 0),
    ("onkeydown", "Script", 0),
    ("onkeyup", "Script", 0),
]

# Attributes of elements that can take the focus.
attrs_focus = [
    ("accesskey", "Character", 0),
    ("tabindex", "Number", 0),
    ("onfocus", "Script", 0),
    ("onblur", "Script", 0),
]

# Language and text direction.
attrs_i18n = [
    ("lang", "LanguageCode", 0),
    ("xml:lang", "LanguageCode", 0),
    ("dir", ["ltr", "rtl"], 0),
]

# Text alignment for block-level elements.
attrs_TextAlign = [("align", ["left", "center", "right"], 0)]

# The combination used by most elements: core + i18n + events.
attrs_attrs = attrs_coreattrs + attrs_i18n + attrs_events


# 2.6. Attributes
#
# This dictionary maps element name to a list of legal attributes for
# that element.  Each member of the list is a triple (NAME, TYPE,
# DISPOSITION).  NAME is the name of the attribute.  TYPE is either a
# string naming the type of the attribute value, or a list of strings
# which are the valid values for the attribute.  DISPOSITION is either 0
# (meaning optional), 1 (meaning required), or a string which is the
# default value for the attribute.

# Enumerated attribute types: each is the list of values the attribute
# may take.
type_CAlign   = ['top', 'bottom', 'left', 'right']
type_ImgAlign = ['top', 'middle', 'bottom', 'left', 'right']
type_Scope    = ['row', 'col', 'rowgroup', 'colgroup']
type_Shape    = ['rect', 'circle', 'poly', 'default']
type_TAlign   = ['left', 'center', 'right']
type_TFrame   = ['void', 'above', 'below', 'hsides', 'lhs', 'rhs',
                 'vsides', 'box', 'border']
type_TRules   = ['none', 'groups', 'rows', 'cols', 'all']

# Element name -> list of (NAME, TYPE, DISPOSITION) triples.
#
# A TYPE given as a string is only validated if attribute_checkers has
# an entry for it; a TYPE given as a list is validated against that
# list.  <table align> therefore uses the type_TAlign list (it was
# previously the bare string 'TAlign', which has no checker, so the
# value was never checked and type_TAlign was unused).
#
# NOTE(review): 'LAlign' (<legend align>) and 'InputType' (<input type>)
# are likewise strings without a checker, so those values are not
# validated -- confirm whether enumerations are wanted there too.
attributes = {
    'a': attrs_attrs + attrs_focus + [
        ('charset', 'Charset', 0),
        ('type', 'ContentType', 0),
        ('name', 'NMTOKEN', 0),
        ('href', 'URI', 0),
        ('hreflang', 'LanguageCode', 0),
        ('rel', 'LinkTypes', 0),
        ('rev', 'LinkTypes', 0),
        ('shape', type_Shape, "rect"),
        ('coords', 'Coords', 0),
        ('target', 'FrameTarget', 0),
        ],
    'abbr': attrs_attrs,
    'acronym': attrs_attrs,
    'address': attrs_attrs,
    'applet': attrs_coreattrs + [
        ('codebase', 'URI', 0),
        ('archive', 'CDATA', 0),
        ('code', 'CDATA', 0),
        ('object', 'CDATA', 0),
        ('alt', 'Text', 0),
        ('name', 'NMTOKEN', 0),
        ('width', 'Length', 1),
        ('height', 'Length', 1),
        ('align', type_ImgAlign, 0),
        ('hspace', 'Pixels', 0),
        ('vspace', 'Pixels', 0),
        ],
    'area': attrs_attrs + attrs_focus + [
        ('shape', type_Shape, "rect"),
        ('coords', 'Coords', 0),
        ('href', 'URI', 0),
        ('nohref', ['nohref'], 0),
        ('alt', 'Text', 1),
        ('target', 'FrameTarget', 0),
        ],
    'b': attrs_attrs,
    'base': [
        ('href', 'URI', 0),
        ('target', 'FrameTarget', 0),
        ],
    'basefont': [
        ('id', 'ID', 0),
        ('size', 'CDATA', 1),
        ('color', 'Color', 0),
        ('face', 'CDATA', 0),
        ],
    'bdo': attrs_coreattrs + attrs_events + [
        ('lang', 'LanguageCode', 0),
        ('xml:lang', 'LanguageCode', 0),
        ('dir', ['ltr', 'rtl'], 1),
        ],
    'big': attrs_attrs,
    'blockquote': attrs_attrs + [
        ('cite', 'URI', 0),
        ],
    'body': attrs_attrs + [
        ('onload', 'Script', 0),
        ('onunload', 'Script', 0),
        ('background', 'URI', 0),
        ('bgcolor', 'Color', 0),
        ('text', 'Color', 0),
        ('link', 'Color', 0),
        ('vlink', 'Color', 0),
        ('alink', 'Color', 0),
        ],
    'br': attrs_coreattrs + [
        ('clear', ['left', 'all', 'right', 'none'], "none"),
        ],
    'button': attrs_attrs + attrs_focus + [
        ('name', 'CDATA', 0),
        ('value', 'CDATA', 0),
        ('type', ['button', 'submit', 'reset'], "submit"),
        ('disabled', ['disabled'], 0),
        ],
    'caption': attrs_attrs + [
        ('align', type_CAlign, 0),
        ],
    'center': attrs_attrs,
    'cite': attrs_attrs,
    'code': attrs_attrs,
    'col': attrs_attrs + attrs_cellhalign + attrs_cellvalign + [
        ('span', 'Number', "1"),
        ('width', 'MultiLength', 0),
        ],
    'colgroup': attrs_attrs + attrs_cellhalign + attrs_cellvalign + [
        ('span', 'Number', "1"),
        ('width', 'MultiLength', 0),
        ],
    'dd': attrs_attrs,
    'del': attrs_attrs + [
        ('cite', 'URI', 0),
        ('datetime', 'Datetime', 0),
        ],
    'dfn': attrs_attrs,
    'dir': attrs_attrs + [
        ('compact', ['compact'], 0),
        ],
    'div': attrs_attrs + attrs_TextAlign,
    'dl': attrs_attrs + [
        ('compact', ['compact'], 0),
        ],
    'dt': attrs_attrs,
    'em': attrs_attrs,
    'fieldset': attrs_attrs,
    'font': attrs_coreattrs + attrs_i18n + [
        ('size', 'CDATA', 0),
        ('color', 'Color', 0),
        ('face', 'CDATA', 0),
        ],
    'form': attrs_attrs + [
        ('action', 'URI', 1),
        ('method', ['get', 'post'], "get"),
        ('name', 'NMTOKEN', 0),
        ('enctype', 'ContentType', "application/x-www-form-urlencoded"),
        ('onsubmit', 'Script', 0),
        ('onreset', 'Script', 0),
        ('accept', 'ContentTypes', 0),
        ('accept-charset', 'Charsets', 0),
        ('target', 'FrameTarget', 0),
        ],
    'h1': attrs_attrs + attrs_TextAlign,
    'h2': attrs_attrs + attrs_TextAlign,
    'h3': attrs_attrs + attrs_TextAlign,
    'h4': attrs_attrs + attrs_TextAlign,
    'h5': attrs_attrs + attrs_TextAlign,
    'h6': attrs_attrs + attrs_TextAlign,
    'head': attrs_i18n + [
        ('profile', 'URI', 0),
        ],
    'hr': attrs_attrs + [
        ('align', ['left','center','right'], 0),
        ('noshade', ['noshade'], 0),
        ('size', 'Pixels', 0),
        ('width', 'Length', 0),
        ],
    'html': attrs_i18n + [
        ('xmlns', 'URI', 'http://www.w3.org/1999/xhtml'),
        ],
    'i': attrs_attrs,
    'iframe': attrs_coreattrs + [
        ('longdesc', 'URI', 0),
        ('name', 'NMTOKEN', 0),
        ('src', 'URI', 0),
        ('frameborder', ['1','0'], "1"),
        ('marginwidth', 'Pixels', 0),
        ('marginheight', 'Pixels', 0),
        ('scrolling', ['yes','no','auto'], "auto"),
        ('align', type_ImgAlign, 0),
        ('height', 'Length', 0),
        ('width', 'Length', 0),
        ],
    'img': attrs_attrs + [
        ('src', 'URI', 1),
        ('alt', 'Text', 1),
        ('name', 'NMTOKEN', 0),
        ('longdesc', 'URI', 0),
        ('height', 'Length', 0),
        ('width', 'Length', 0),
        ('usemap', 'URI', 0),
        ('ismap', ['ismap'], 0),
        ('align', type_ImgAlign, 0),
        ('border', 'Length', 0),
        ('hspace', 'Pixels', 0),
        ('vspace', 'Pixels', 0),
        ],
    'input': attrs_attrs + attrs_focus + [
        ('type', 'InputType', "text"),
        ('name', 'CDATA', 0),
        ('value', 'CDATA', 0),
        ('checked', ['checked'], 0),
        ('disabled', ['disabled'], 0),
        ('readonly', ['readonly'], 0),
        ('size', 'CDATA', 0),
        ('maxlength', 'Number', 0),
        ('src', 'URI', 0),
        ('alt', 'CDATA', 0),
        ('usemap', 'URI', 0),
        ('onselect', 'Script', 0),
        ('onchange', 'Script', 0),
        ('accept', 'ContentTypes', 0),
        ('align', type_ImgAlign, 0),
        ],
    'ins': attrs_attrs + [
        ('cite', 'URI', 0),
        ('datetime', 'Datetime', 0),
        ],
    'isindex': attrs_coreattrs + attrs_i18n + [
        ('prompt', 'Text', 0),
        ],
    'kbd': attrs_attrs,
    'label': attrs_attrs + [
        ('for', 'IDREF', 0),
        ('accesskey', 'Character', 0),
        ('onfocus', 'Script', 0),
        ('onblur', 'Script', 0),
        ],
    'legend': attrs_attrs + [
        ('accesskey', 'Character', 0),
        ('align', 'LAlign', 0),
        ],
    'li': attrs_attrs + [
        ('type', 'LIStyle', 0),
        ('value', 'Number', 0),
        ],
    'link': attrs_attrs + [
        ('charset', 'Charset', 0),
        ('href', 'URI', 0),
        ('hreflang', 'LanguageCode', 0),
        ('type', 'ContentType', 0),
        ('rel', 'LinkTypes', 0),
        ('rev', 'LinkTypes', 0),
        ('media', 'MediaDesc', 0),
        ('target', 'FrameTarget', 0),
        ],
    'map': attrs_i18n + attrs_events + [
        ('id', 'ID', 1),
        ('class', 'CDATA', 0),
        ('style', 'StyleSheet', 0),
        ('title', 'Text', 0),
        ('name', 'CDATA', 0),
        ],
    'menu': attrs_attrs + [
        ('compact', ['compact'], 0),
        ],
    'meta': attrs_i18n + [
        ('http-equiv', 'CDATA', 0),
        ('name', 'CDATA', 0),
        ('content', 'CDATA', 1),
        ('scheme', 'CDATA', 0),
        ],
    'noframes': attrs_attrs,
    'noscript': attrs_attrs,
    'object': attrs_attrs + [
        ('declare', ['declare'], 0),
        ('classid', 'URI', 0),
        ('codebase', 'URI', 0),
        ('data', 'URI', 0),
        ('type', 'ContentType', 0),
        ('codetype', 'ContentType', 0),
        ('archive', 'UriList', 0),
        ('standby', 'Text', 0),
        ('height', 'Length', 0),
        ('width', 'Length', 0),
        ('usemap', 'URI', 0),
        ('name', 'NMTOKEN', 0),
        ('tabindex', 'Number', 0),
        ('align', type_ImgAlign, 0),
        ('border', 'Pixels', 0),
        ('hspace', 'Pixels', 0),
        ('vspace', 'Pixels', 0),
        ],
    'ol': attrs_attrs + [
        ('type', ['1', 'a', 'A', 'i', 'I'], 0),
        ('compact', ['compact'], 0),
        ('start', 'Number', 0),
        ],
    'optgroup': attrs_attrs + [
        ('disabled', ['disabled'], 0),
        ('label', 'Text', 1),
        ],
    'option': attrs_attrs + [
        ('selected', ['selected'], 0),
        ('disabled', ['disabled'], 0),
        ('label', 'Text', 0),
        ('value', 'CDATA', 0),
        ],
    'p': attrs_attrs + attrs_TextAlign,
    'param': [
        ('id', 'ID', 0),
        ('name', 'CDATA', 1),
        ('value', 'CDATA', 0),
        ('valuetype', ['data', 'ref', 'object'], "data"),
        ('type', 'ContentType', 0),
        ],
    'pre': attrs_attrs + [
        ('width', 'Number', 0),
        ('xml:space', ['preserve'], 'preserve'),
        ],
    'q': attrs_attrs + [
        ('cite', 'URI', 0),
        ],
    's': attrs_attrs,
    'samp': attrs_attrs,
    'script': [
        ('charset', 'Charset', 0),
        ('type', 'ContentType', 1),
        ('language', 'CDATA', 0),
        ('src', 'URI', 0),
        ('defer', ['defer'], 0),
        ('xml:space', ['preserve'], 'preserve'),
        ],
    'select': attrs_attrs + [
        ('name', 'CDATA', 0),
        ('size', 'Number', 0),
        ('multiple', ['multiple'], 0),
        ('disabled', ['disabled'], 0),
        ('tabindex', 'Number', 0),
        ('onfocus', 'Script', 0),
        ('onblur', 'Script', 0),
        ('onchange', 'Script', 0),
        ],
    'small': attrs_attrs,
    'span': attrs_attrs,
    'strike': attrs_attrs,
    'strong': attrs_attrs,
    'style': attrs_i18n + [
        ('type', 'ContentType', 1),
        ('media', 'MediaDesc', 0),
        ('title', 'Text', 0),
        ('xml:space', ['preserve'], 'preserve'),
        ],
    'sub': attrs_attrs,
    'sup': attrs_attrs,
    'table': attrs_attrs + [
        ('summary', 'Text', 0),
        ('width', 'Length', 0),
        ('border', 'Pixels', 0),
        ('frame', type_TFrame, 0),
        ('rules', type_TRules, 0),
        ('cellspacing', 'Length', 0),
        ('cellpadding', 'Length', 0),
        ('align', type_TAlign, 0),
        ('bgcolor', 'Color', 0),
        ],
    'tbody': attrs_attrs + attrs_cellhalign + attrs_cellvalign,
    'td': attrs_attrs + attrs_cellhalign + attrs_cellvalign + [
        ('abbr', 'Text', 0),
        ('axis', 'CDATA', 0),
        ('headers', 'IDREFS', 0),
        ('scope', type_Scope, 0),
        ('rowspan', 'Number', "1"),
        ('colspan', 'Number', "1"),
        ('nowrap', ['nowrap'], 0),
        ('bgcolor', 'Color', 0),
        ('width', 'Pixels', 0),
        ('height', 'Pixels', 0),
        ],
    'textarea': attrs_attrs + attrs_focus + [
        ('name', 'CDATA', 0),
        ('rows', 'Number', 1),
        ('cols', 'Number', 1),
        ('disabled', ['disabled'], 0),
        ('readonly', ['readonly'], 0),
        ('onselect', 'Script', 0),
        ('onchange', 'Script', 0),
        ],
    'tfoot': attrs_attrs + attrs_cellhalign + attrs_cellvalign,
    'th': attrs_attrs + attrs_cellhalign + attrs_cellvalign + [
        ('abbr', 'Text', 0),
        ('axis', 'CDATA', 0),
        ('headers', 'IDREFS', 0),
        ('scope', type_Scope, 0),
        ('rowspan', 'Number', "1"),
        ('colspan', 'Number', "1"),
        ('nowrap', ['nowrap'], 0),
        ('bgcolor', 'Color', 0),
        ('width', 'Pixels', 0),
        ('height', 'Pixels', 0),
        ],
    'thead': attrs_attrs + attrs_cellhalign + attrs_cellvalign,
    'title': attrs_i18n,
    'tr': attrs_attrs + attrs_cellhalign + attrs_cellvalign + [
        ('bgcolor', 'Color', 0),
        ],
    'tt': attrs_attrs,
    'u': attrs_attrs,
    'ul': attrs_attrs + [
        ('type', ['disc', 'square', 'circle'], 0),
        ('compact', ['compact'], 0),
        ],
    'var': attrs_attrs,
    }

def make_attribute_map():
    """Index the attributes table by element and attribute name.

    Returns a dictionary mapping each element name in the module-level
    attributes table to a dictionary that maps attribute name to the
    pair (TYPE, DISPOSITION) taken from that attribute's triple.  If an
    attribute name occurs more than once for an element, the last
    occurrence wins.
    """
    by_element = {}
    for element in attributes.keys():
        lookup = {}
        for triple in attributes[element]:
            attr_name, attr_type, attr_disposition = triple
            lookup[attr_name] = (attr_type, attr_disposition)
        by_element[element] = lookup
    return by_element

# Element name -> {attribute name: (TYPE, DISPOSITION)}, built once at
# import time from the attributes table.
legal_attributes = make_attribute_map()


# The recommended_attributes dictionary maps element name to a list of
# attributes that are recommended by [Chisholm 2000-11-06] and [HTML
# 4.01].

# Element name -> attributes whose absence is reported as error 26.
recommended_attributes = {
    # Alternative text.
    "applet": ["alt"],
    "area": ["alt"],
    "input": ["alt"],
    # Expansion of the abbreviation.
    "abbr": ["title"],
    "acronym": ["title"],
    # Image dimensions.
    "img": ["height", "width"],
}


# 2.7. Attribute checkers
#
# The attribute_checkers dictionary maps attribute type to a function of
# one argument that checks that an attribute value is legal.  See [HTML
# 4.01, 6].

# Patterns for the attribute types that have a checkable syntax.  All
# are anchored at both ends and applied with match().
character_re = re.compile(r"^.$")
color_re = re.compile(r"^#[0-9A-Fa-f]+$")
# The sixteen colour names accepted (case-insensitively) in addition to
# '#' followed by hex digits.
colors = ['black', 'green', 'silver', 'lime', 'gray', 'olive', 'white',
          'yellow', 'maroon', 'navy', 'red', 'blue', 'purple', 'teal',
          'fuchsia', 'aqua']
datetime_re = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}"
                         r"(Z|[+-]\d{2}:\d{2})$")
# A letter followed by letters, digits, '-', '_', ':' or '.'.  The '-'
# is placed last in the class so that it is unambiguously literal.
id_re = re.compile(r"^[A-Za-z][A-Za-z0-9_:.-]*$")
idref_re = re.compile(r"^[A-Za-z][A-Za-z0-9_:.-]*$")
# One or more ids separated by runs of spaces.
idrefs_re = re.compile(r"^[A-Za-z]([A-Za-z0-9_:.-]| +[A-Za-z])*$")
length_re = re.compile(r"^\d+%?$")
multilength_re = re.compile(r"^\d+[%*]?$")
number_re = re.compile(r"^\d+$")

# Attribute type name -> function of one argument (the attribute value)
# returning a true value if the value is legal for that type.  Types
# without an entry here are not checked.
#
# The colour check uses the str method lower() rather than the
# deprecated string.lower() function, which Python 3 no longer has.
attribute_checkers = {
    'Character': lambda v: character_re.match(v),
    'Color': lambda v: color_re.match(v) or v.lower() in colors,
    'Datetime': lambda v: datetime_re.match(v),
    'ID': lambda v: id_re.match(v),
    'IDREF': lambda v: idref_re.match(v),
    'IDREFS': lambda v: idrefs_re.match(v),
    'Length': lambda v: length_re.match(v),
    'MultiLength': lambda v: multilength_re.match(v),
    # NMTOKEN values are held to the same syntax as ids.
    'NMTOKEN': lambda v: id_re.match(v),
    'Number': lambda v: number_re.match(v),
    'Pixels': lambda v: number_re.match(v),
    }


# 3. CHECK XHTML
#
# The error_sets dictionary defines sets of error messages.
# 'accessibility' errors violate rules in [Chisholm 2000-11-06];
# 'ravenbrook' errors violate rules in [Rules]; 'xhtml-1.0' errors
# violate rules in [XHTML 1.0].

# Name of an error set -> the error codes belonging to it.
error_sets = {
    "accessibility": [26],
    "ravenbrook": [5, 6, 17, 18, 19, 20, 23, 21, 25],
    "xhtml-1.0": [
        1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 15, 16, 22, 27,
    ],
}

class handler(xml.sax.handler.ContentHandler):
    """SAX handler that checks a single XHTML document.

    An instance acts as both the content handler and the error handler
    for one parse (see check()).  Each problem found is written to the
    error stream as "DOC(LINE) [CODE] MESSAGE"; codes listed in
    noerrors are suppressed.  Parser errors are written without a code.
    """

    class FatalParseError(Exception):
        """Raised by fatalError() to abandon the parse; see check()."""

    locator = None
    current_heading_level = 0
    current_section = None
    # This used to be a string, but string exceptions cannot be raised
    # in Python 2.6 or later (raising one gives a TypeError, which
    # escaped from check()).  It is now an exception class.
    exception = FatalParseError
    error_stream = None

    def __init__(self, doc, noerrors = None, error_stream = sys.stderr):
        """Create a handler for one document.

        doc is the document's name as it should appear in messages.
        noerrors is a list of error codes to suppress (default: none).
        error_stream is an object with a write() method that receives
        the messages.
        """
        xml.sax.handler.ContentHandler.__init__(self)
        if noerrors is None:
            # A fresh list per instance, not a shared default.
            noerrors = []
        self.doc = doc
        self.noerrors = noerrors
        # stack[-1] collects the names of the children seen so far of
        # the innermost open element; stack[0] is the document level.
        self.stack = [[]]
        # stack2 holds (element, line) for each open element, outermost
        # first.
        self.stack2 = []
        self.ids = {}
        self.nocase_ids = {} # Lower-case keys; values are line numbers.
        self.cross_refs = []    # (target id, line) for each href="#...".
        self.refs = {}          # Reference anchor id -> its href.
        self.idrefs = []        # (id, line) for each IDREF/IDREFS value.
        self.error_stream = error_stream

    def err(self, code, msg):
        """Report error code with message msg at the current line."""
        self.write_error(code, msg, self.locator.getLineNumber())

    def write_error(self, code, msg, line):
        """Write an error for the given line unless code is suppressed."""
        if code not in self.noerrors:
            msg = "%s(%d) [%d] %s\n" % (self.doc, line, code, msg)
            self.error_stream.write(msg)

    def setDocumentLocator(self, locator):
        """Remember the parser's locator, used for line numbers."""
        self.locator = locator

    def startElement(self, element, attrs):
        """Check an element's position, attributes and document rules."""
        here = self.locator.getLineNumber()
        if self.stack2:
            parent = self.stack2[-1][0]
        else:
            parent = None
        self.stack2.append((element, here))
        self.stack[-1].append(element)
        self.stack.append([])

        # Check that element is legal in XHTML.  Nothing else can be
        # checked for an unknown element, so stop here; the stacks have
        # already been pushed, so endElement() still balances.
        if element not in legal_elements:
            self.err(1, "<%s> element is not legal in XHTML 1.0 "
                     "Transitional." % element)
            return

        # Check that the element may legally appear at this point in the
        # document.
        if (parent and parent in legal_elements
            and element not in legal_elements[parent]):
            self.err(2, "Element <%s> appears in <%s> (not allowed)."
                     % (element, parent))
        if not parent and element != 'html':
            self.err(3, "Top-level element is <%s> (should be <html>)."
                     % element)
        if element in illegal_ancestors:
            for ancestor, line in self.stack2[:-1]:
                if ancestor in illegal_ancestors[element]:
                    self.err(16, "Element <%s> appears below <%s> on "
                             "line %d (not allowed)."
                             % (element, ancestor, line))

        # Check attributes.
        element_attrs = legal_attributes[element]
        for attr, value in attrs.items():
            # Check that the attribute is legal for the element.
            if attr not in element_attrs:
                self.err(4, "Attribute '%s' is not allowed in <%s>."
                         % (attr, element))
                continue

            # Check that the value for the attribute is legal.  A list
            # type enumerates the legal values; a string type is checked
            # only if it has an entry in attribute_checkers.
            attr_type, disposition = element_attrs[attr]
            if isinstance(attr_type, list):
                if value not in attr_type:
                    self.err(10, "Attribute '%s' for element <%s> has "
                             "value '%s' (must be one of [%s])."
                             % (attr, element, value,
                                ', '.join(attr_type)))
            elif attr_type in attribute_checkers:
                if not attribute_checkers[attr_type](value):
                    self.err(11, "Attribute '%s' for element <%s> has "
                             "illegal value '%s' (not a %s)."
                             % (attr, element, value, attr_type))

            # Check that id is unique (ignoring case) within the
            # document.
            if attr == 'id':
                nocase_value = value.lower()
                orig_line = self.nocase_ids.get(nocase_value, None)
                if orig_line:
                    self.err(12, "Duplicate id '%s' (original on line "
                             "%d)." % (value, orig_line))
                self.ids[value] = 1
                self.nocase_ids[nocase_value] = here

            # Remember IDREF and IDREFS for checking later.
            if attr_type in ['IDREF', 'IDREFS']:
                for idref in re.split(" +", value):
                    self.idrefs.append((idref, here))

        # The names of the attributes present.  A plain list of names is
        # used for the membership tests below so that they do not depend
        # on which mapping methods the attributes object provides.
        present = attrs.keys()

        # Check that required attributes are present.
        for attr, (attr_type, disposition) in element_attrs.items():
            if disposition == 1 and attr not in present:
                self.err(13, "Attribute '%s' is required for <%s> but "
                         "not present." % (attr, element))

        # Check that recommended attributes are present.
        if element in recommended_attributes:
            for a in recommended_attributes[element]:
                if a not in present:
                    self.err(26, "Attribute '%s' is recommended for "
                             "<%s> but not present." % (a, element))

        # <td valign="..."> is deprecated: should set same valign for
        # all cells in the row using <tr valign="...">.
        if element in ['td','th'] and 'valign' in present:
            self.err(20, "<%s> has valign attribute: better in the "
                     "<tr>." % element)

        # Test against rule xhtml/id.
        has_id = 'id' in present
        has_name = 'name' in present
        if element == 'a' and has_id and not has_name:
            self.err(5, "<%s> element has 'id' attribute but no 'name' "
                     "attribute." % element)
        if element == 'a' and has_name and not has_id:
            self.err(6, "<%s> element has 'name' attribute but no 'id' "
                     "attribute." % element)
        if has_name and has_id and attrs['name'] != attrs['id']:
            self.err(7, "<%s> element has id '%s' but name '%s'."
                     % (element, attrs['id'], attrs['name']))

        # Test against rule xhtml/section.  Remember current section
        # number.
        if element == 'a' and parent in ['h2', 'h3', 'h4', 'h5', 'h6']:
            level = int(parent[1:])
            if not has_id:
                self.err(17, "Section anchor has no 'id' attribute.")
            elif re.match("section-", attrs['id']):
                # An <h2> anchor looks like section-N, an <h3> anchor
                # like section-N.M, and so on.
                ref_re = re.compile("section-(\\w+"
                                    + "\\.[0-9]+" * max(0,level-2)
                                    + ")$")
                ref_example = "section-1" + ".1" * max(0,level-2)
                match = ref_re.match(attrs['id'])
                if not match:
                    self.err(18, "Anchor for <%s> has id '%s': should "
                             "look like '%s'."
                             % (parent, attrs['id'], ref_example))
                else:
                    self.current_section = match.group(1)

        # Check (some of) rule xhtml/ref-id: applies to an <a> directly
        # inside table/tr/td while the current section is 'A'.
        if (self.current_section == 'A' and len(self.stack2) >= 4
            and element == 'a' and self.stack2[-2][0] == 'td'
            and self.stack2[-3][0] == 'tr'
            and self.stack2[-4][0] == 'table'):
            if has_id and not re.match("ref-", attrs['id']):
                self.err(21, "Reference anchor has id '%s': should "
                         "start with 'ref-'." % attrs['id'])
            # Remember the target of the reference so that we can check
            # the rule xhtml/ref-link in the endDocument() method.
            if has_id and 'href' in present:
                self.refs[attrs['id']] = attrs['href']

        # Remember cross-refs for later checking against the set of ids.
        if (element == 'a' and 'href' in present
            and attrs['href'][:1] == '#'):
            self.cross_refs.append((attrs['href'][1:], here))

        # Check ordering of sections (<h3> can't follow <h1>).
        if element in elt_heading:
            level = int(element[1:])
            if level > self.current_heading_level + 1:
                self.err(25, "<%s> follows <h%d>."
                         % (element, self.current_heading_level))
            self.current_heading_level = level

    def characters(self, content):
        """Report non-blank text in elements that may not contain it."""
        if self.stack2:
            element = self.stack2[-1][0]
            # Check that non-blank character data is part of an element
            # only when #PCDATA is legal contents.
            if (element in legal_elements
                and '#PCDATA' not in legal_elements[element]
                and not re.match('^\\s+$', content)):
                self.err(8, "<%s> element contains character data '%s'."
                         % (element, content))

    def endElement(self, element):
        """Check the collected contents of the element being closed."""
        contents = self.stack[-1]

        # Check that required contents are present.
        if not contents and element in nonempty_elements:
            self.err(9, "<%s> element is empty." % element)

        # Extra constraints.  See [2.4].
        if contents and element in constraints:
            if not re.match(constraints[element][1],
                            '<'.join(contents) + '<'):
                self.err(15, "Contents of <%s> element [%s] doesn't "
                         "match XHTML specification %s."
                         % (element, ', '.join(contents),
                            constraints[element][0]))

        # Section headings (except <h1>) must have anchors so they can
        # be referred to (rule xhtml/section).
        if (element in ['h2', 'h3', 'h4', 'h5', 'h6']
            and 'a' not in contents):
            self.err(19, "<%s> has no section anchor." % element)

        # Pop the stacks.  The element just closed must be the last
        # child recorded for its parent.
        del self.stack2[-1]
        del self.stack[-1]
        assert self.stack[-1][-1] == element

    def endDocument(self):
        """Check references that can only be resolved at the end."""
        # Check that cross-references have a target.
        for target, line in self.cross_refs:
            if target not in self.ids:
                self.write_error(22, "Cross-reference '#%s' has no "
                                 "target." % target, line)

            # Check the xhtml/ref-link rule.
            elif target in self.refs:
                self.write_error(23, "Cross-reference '#%s' to "
                                 "references section should link to "
                                 "target '%s' instead."
                                 % (target, self.refs[target]), line)

        # Check that IDREFs have a target.
        for idref, line in self.idrefs:
            if idref not in self.ids:
                self.write_error(27, "IDREF '%s' has no target."
                                 % (idref,), line)

    def error(self, exception):
        """Write a parser error to the error stream (no error code)."""
        line = exception.getLineNumber()
        message = exception.getMessage()
        # This unfortunately depends on the XML parser producing this
        # error.
        if message == 'mismatched tag' and self.stack2:
            message = ("Mismatched closing tag (opening tag was <%s> "
                       "at line %d)." % self.stack2[-1])
        self.error_stream.write("%s(%d) %s\n"
                                % (self.doc, line, message))

    def fatalError(self, exception):
        """Report a fatal parser error, then abandon the parse."""
        self.error(exception)
        raise self.exception("Fatal XML parsing error.")

    def warning(self, exception):
        """Report a parser warning in the same way as an error."""
        self.error(exception)

    def check(self, path_or_stream):
        """Parse and check a document given as a path or a stream.

        A fatal parsing error has already been reported by fatalError()
        by the time it reaches here, so it is swallowed.
        """
        try:
            xml.sax.parse(path_or_stream, self, self)
        except self.exception:
            pass


# 4. CHECK DIRECTORIES AND FILES

class checker:
    """Check XHTML files, or directory trees of them.

    Every file or stream to be checked is handed to a fresh handler
    object, together with noerrors and error_stream.

    skip is a list of directory entry names that check() ignores when
    it walks a directory.  noerrors is passed to each handler
    unchanged (the command-line interface fills it with integers).
    error_stream is passed to each handler unchanged and defaults to
    sys.stderr.
    """

    noerrors = []
    skip = []

    def __init__(self, skip = None, noerrors = None,
                 error_stream = sys.stderr):
        # None stands for "no entries".  A new list is made for each
        # instance so that instances built with the defaults never end
        # up sharing (and mutating) one list.
        if skip is None:
            skip = []
        if noerrors is None:
            noerrors = []
        self.skip = skip
        self.noerrors = noerrors
        self.error_stream = error_stream

    def _first_line(self, path):
        # Return the first line of the file at path, making sure the
        # file is closed again even if the read fails.
        stream = open(path)
        try:
            return stream.readline()
        finally:
            stream.close()

    def check(self, path):
        """Check the file at path, or everything below the directory.

        A directory is walked recursively in sorted order, ignoring
        entries whose name is in self.skip.  A regular file is checked
        only if its name ends in ".html" and its first line starts
        with "<?xml"; any other path is silently ignored.
        """
        if os.path.isdir(path):
            # os.listdir() replaces the deprecated dircache module; the
            # explicit sort keeps the order dircache.listdir() gave.
            names = os.listdir(path)
            names.sort()
            for f in names:
                if f not in self.skip:
                    self.check(os.path.join(path, f))
        elif (os.path.isfile(path) and re.search("\\.html$", path)
              and re.match("<\\?xml", self._first_line(path))):
            handler(path, self.noerrors, self.error_stream).check(path)

    def check_stream(self, name, stream):
        """Check the document read from stream.

        name is passed to the handler in the position where check()
        passes the file path.
        """
        handler(name, self.noerrors, self.error_stream).check(stream)


# 5. COMMAND-LINE INTERFACE

def run():
    """Command-line entry point: check every path named in sys.argv.

    Options (each may be repeated; the values accumulate):

      -s NAMES, --skip=NAMES     comma-separated directory entry names
                                 for the checker to skip
      -n NUMS, --noerror=NUMS    comma-separated integers, passed to
                                 the checker as noerrors

    Nothing is caught here: an unrecognised option raises the getopt
    module's error and a non-integer in NUMS raises ValueError.
    """
    opts, paths = getopt.getopt(sys.argv[1:], 's:n:',
                                ['skip=', 'noerror='])
    skip = []
    noerrors = []
    for o, a in opts:
        # String methods are used instead of the deprecated functions
        # of the string module (string.split is gone in Python 3).
        if o in ('-s', '--skip'):
            skip.extend(a.split(','))
        elif o in ('-n', '--noerror'):
            noerrors.extend(map(int, a.split(',')))
    c = checker(skip, noerrors)
    for p in paths:
        c.check(p)

# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module by another program.
if __name__ == "__main__":
    run()


# A. REFERENCES
#
# [Chisholm 2000-11-06] "HTML Techniques for Web Content Accessibility
# Guidelines 1.0"; Wendy Chisholm, Gregg Vanderheiden, Ian Jacobs;
# 2000-11-06; <http://www.w3.org/TR/WCAG10-TECHS/>.
#
# [HTML 4.01] "HTML 4.01 Specification"; World Wide Web Consortium;
# 1999-12-24; <http://www.w3.org/TR/html4/>.
#
# [Jacobs 2001-04-09] "User Agent Accessibility Guidelines 1.0" (W3C
# Working Draft); Ian Jacobs, Jon Gunderson, Eric Hansen; 2001-04-09;
# <http://www.w3.org/TR/2001/WD-UAAG10-20010409/>.
#
# [Rules] "Rules"; Gareth Rees; Ravenbrook Limited; 2001-04-22;
# <http://info.ravenbrook.com/rule/>.
#
# [SAX] "xml.sax -- Support for SAX2 parsers"; Python;
# <http://www.python.org/doc/current/lib/module-xml.sax.html>.
#
# [XHTML 1.0] "XHTML 1.0: The Extensible HyperText Markup Language";
# World Wide Web Consortium; 2000-01-26; <http://www.w3.org/TR/xhtml1/>.
#
# [XHTML 1.0 DTD] "XHTML 1.0 Transitional Document Type Definition";
# World Wide Web Consortium; 2000-01-26;
# <http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd>.
#
#
# B. DOCUMENT HISTORY
#
# 2001-04-29 GDR Created.
#
# 2001-04-30 GDR Added checking of attributes to the check_xhtml.py
# script. Added Ravenbrook document format checks for section headers
# (rule xhtml/section) and reference ids (rule xhtml/ref-id).
#
# 2001-05-01 GDR Added checks for attribute values, cross-references,
# links to the references section, ordering of sections, XHTML
# constraints for <head>, <html>, <map> and <table>, recommended
# attributes, IDREFs.
#
# 2001-05-03 GDR For mismatched tags, report the opening tag and its
# line number.
#
# 2001-05-07 GDR Use the 'handler' class as both ContentHandler and
# ErrorHandler.  New method check_stream takes a stream argument, not a
# path.
#
# 2001-07-25 GDR Handler and checker classes take error_stream object as
# parameter (defaults to sys.stderr) so that they can be used in other
# checking situations (e.g., under unittest.py).  Added section on use.
#
#
# C. COPYRIGHT AND LICENCE
#
# Copyright 2001 Gareth Rees.  All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the
#    distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDERS AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
# DAMAGE.
#
#
# $Id: //info.ravenbrook.com/project/p4dti/version/2.0/test/check_xhtml.py#1 $
#	Change	User	Description	Committed
#2	4326	Robert Cowham	Tidied up
#1	4187	Robert Cowham	Initial version of PVCS (now Merant) Tracker Integration.