# Perforce Defect Tracking Integration Project
# <http://www.ravenbrook.com/project/p4dti/>
#
# CHECK_XHTML.PY -- CHECK XHTML DOCUMENT
#
# Gareth Rees, Ravenbrook Limited, 2001-04-30
#
#
# 1. INTRODUCTION
#
# This Python script checks that XHTML documents conform to the XHTML
# 1.0 Transitional specification [XHTML 1.0] and to the Ravenbrook
# document rules [Rules].
#
# The intended readership is Ravenbrook staff.
#
# This document is not confidential.
#
#
# 1.1. Use
#
# This module is intended for use in two circumstances. It may be run
# from the command line, passing a list of paths. It then checks those
# paths and writes its output to stderr. This is convenient for use in
# Emacs under M-x compile, for then the errors can be browsed using
# `next-error' (C-x `).
#
# It may also be run from other Python programs, which should
# instantiate an object from the checker class, and call the check()
# method, passing appropriate paths, or the check_stream() method,
# passing a file stream. When constructing the checker object, you may
# supply an error stream object with a write() method: all error
# messages will be written to this object and you may then divert them
# for example to unittest's fail() method.
import dircache
import getopt
import os
import re
import string
import sys
import types
import xml.sax
# 2. XHTML DEFINITION
#
# This section defines a bunch of tables that describe the XHTML 1.0
# Transitional document type [XHTML 1.0 DTD]. It would be nice to
# generate these tables automatically by parsing the document type
# definition, but as far as I know there's no DTD parser for XML, and I
# didn't want to write one just for the purpose of checking one document
# type.
#
# Instead, these tables have been derived mechanically (typically using
# Emacs Lisp) from the XHTML DTD [XHTML 1.0 DTD].
#
# In any case, there are a number of constraints on XHTML documents that
# are not specified in the DTD; see [XHTML 1.0]. So parsing the DTD
# wouldn't be the whole story.
#
#
# 2.1. Useful element contents
#
# These variables contain lists of elements: they will be used in the
# element definitions [2.2] to define the set of elements that may
# legally appear in the content of each XHTML element.
# Leaf groups.  Each one mirrors a parameter entity of the same name in
# the DTD; the names are kept in DTD order.
elt_special = 'br span bdo object applet img map iframe'.split()
elt_fontstyle = 'tt i b big small u s strike font basefont'.split()
elt_phrase = ('em strong dfn code q sub sup samp kbd var cite abbr '
              'acronym').split()
elt_inline_forms = 'input select textarea label button'.split()
elt_misc = 'ins del script noscript'.split()
elt_heading = 'h1 h2 h3 h4 h5 h6'.split()
elt_lists = 'ul ol dl menu dir'.split()
elt_blocktext = 'pre hr blockquote address center noframes'.split()
elt_head_misc = 'script style meta link object isindex'.split()

# Inline-level content.  The lower-case name is the bare list of inline
# elements; the capitalised name adds character data and the "misc"
# elements.
elt_inline = (['a']
              + elt_special
              + elt_fontstyle
              + elt_phrase
              + elt_inline_forms)
elt_Inline = ['#PCDATA'] + elt_inline + elt_misc

# Block-level content, with the same lower-case/capitalised convention.
elt_block = (['p', 'div', 'isindex', 'fieldset', 'table']
             + elt_heading
             + elt_lists
             + elt_blocktext)
elt_Block = elt_block + ['form'] + elt_misc

# Mixed block and inline content.
elt_Flow = ['#PCDATA', 'form'] + elt_block + elt_inline + elt_misc

# Content models that are like one of the above but with specific
# elements left out (for example <a> may not directly contain <a>).
elt_a_content = (['#PCDATA']
                 + elt_special
                 + elt_fontstyle
                 + elt_phrase
                 + elt_inline_forms
                 + elt_misc)
elt_pre_content = (['#PCDATA', 'a', 'br', 'span', 'bdo', 'map', 'tt',
                    'i', 'b', 'u', 's']
                   + elt_phrase
                   + elt_inline_forms)
elt_form_content = ['#PCDATA'] + elt_block + elt_inline + elt_misc
elt_button_content = (['#PCDATA', 'p', 'div', 'table', 'br', 'span',
                       'bdo', 'object', 'applet', 'img', 'map']
                      + elt_heading
                      + elt_lists
                      + elt_blocktext
                      + elt_fontstyle
                      + elt_phrase
                      + elt_misc)
# 2.2. Elements definitions
#
# The legal_elements dictionary maps the name of an XHTML element to the
# list of elements that are legal members of that element.
# Elements whose content model is specific to them are spelled out one
# by one; the large families that share a content model (inline, flow,
# empty) are filled in by the loops that follow.
legal_elements = {
    'a': elt_a_content,
    'applet': (['#PCDATA', 'param', 'form'] + elt_block + elt_inline
               + elt_misc),
    'button': elt_button_content,
    'colgroup': ['col'],
    'dir': ['li'],
    'dl': ['dt', 'dd'],
    'fieldset': (['#PCDATA', 'legend', 'form'] + elt_block + elt_inline
                 + elt_misc),
    'form': elt_form_content,
    'head': elt_head_misc + ['title', 'base'],
    'html': ['head', 'body'],
    'map': ['form', 'area'] + elt_block + elt_misc,
    'menu': ['li'],
    'object': (['#PCDATA', 'param', 'form'] + elt_block + elt_inline
               + elt_misc),
    'ol': ['li'],
    'optgroup': ['option'],
    'option': ['#PCDATA'],
    'pre': elt_pre_content,
    'script': ['#PCDATA'],
    'select': ['optgroup', 'option'],
    'style': ['#PCDATA'],
    'table': ['caption', 'col', 'colgroup', 'thead', 'tfoot', 'tbody',
              'tr'],
    'tbody': ['tr'],
    'textarea': ['#PCDATA'],
    'tfoot': ['tr'],
    'thead': ['tr'],
    'title': ['#PCDATA'],
    'tr': ['th', 'td'],
    'ul': ['li'],
}

# Elements that hold inline content only.
for _element in ('abbr', 'acronym', 'address', 'b', 'bdo', 'big',
                 'caption', 'cite', 'code', 'dfn', 'dt', 'em', 'font',
                 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'i', 'kbd',
                 'label', 'legend', 'p', 'q', 's', 'samp', 'small',
                 'span', 'strike', 'strong', 'sub', 'sup', 'tt', 'u',
                 'var'):
    legal_elements[_element] = elt_Inline

# Elements that hold mixed block and inline ("flow") content.
for _element in ('blockquote', 'body', 'center', 'dd', 'del', 'div',
                 'iframe', 'ins', 'li', 'noframes', 'noscript', 'td',
                 'th'):
    legal_elements[_element] = elt_Flow

# Elements that must be empty.  Each gets its own fresh list.
for _element in ('area', 'base', 'basefont', 'br', 'col', 'hr', 'img',
                 'input', 'isindex', 'link', 'meta', 'param'):
    legal_elements[_element] = []

# Don't leave the loop variable behind in the module namespace.
del _element
# 2.3. Nonempty elements
#
# The nonempty_elements list names the elements that may not be empty
# (that is, they must have at least one element in their contents: for
# example <ul> must contain at least one <li>, and <optgroup> must
# contain at least one <option>).
# Kept in alphabetical order.  endElement() reports error 9 when one of
# these elements closes without having contained any child element.
nonempty_elements = ('dir dl head html map menu ol optgroup select '
                     'table tbody tfoot thead tr ul').split()
# 2.4. Other constraints
#
# There are constraints on <head>, <html>, <map> and <table> which are
# not specified in [2.2] and [2.3] (that is, the constraints can't be
# expressed as a combination of "these elements are legal contents" and
# "the contents must not be empty"). The constraints dictionary maps
# element name to a pair consisting of the element definition from
# [XHTML 1.0 DTD], and a regular expression that matches legal contents
# of the element, when the content elements have '<' appended and are
# joined together. The reason for using '<' as the terminator is that
# it may not appear in an element name.
# Building blocks for the regular expressions below.  Every child
# element name is followed by '<' in the string being matched.

# Zero or more of the %head.misc; elements.
_head_misc_re = "(script<|style<|meta<|link<|object<|isindex<)*"

# One or more of (%block; | form | %misc;).
_map_block_re = ("(p<|div<|isindex<|fieldset<|table<|h1<|h2<|h3<|h4<|"
                 "h5<|h6<|ul<|ol<|dl<|menu<|dir<|pre<|hr<|blockquote<|"
                 "address<|center<|noframes<|form<|ins<|del<|script<|"
                 "noscript<)+")

# Maps element name to a pair (DTD content model, regular expression).
# The first member is only used in error messages; the second is
# matched (with re.match) against the element's children joined and
# terminated with '<'.
constraints = {
    'html': ("(head, body)",
             "^head<body<$"),
    'head': ("(%head.misc;, "
             "((title, %head.misc;, (base, %head.misc;)?) "
             "| (base, %head.misc;, (title, %head.misc;))))",
             "^" + _head_misc_re
             + "(title<" + _head_misc_re + "(base<)?"
             + "|base<" + _head_misc_re + "title<)"
             + _head_misc_re + "$"),
    # The two alternatives must be grouped so that '^' and '$' apply to
    # both of them.  Without the outer group the expression reads as
    # "^(block)+" OR "(area<)+$", which accepts block content followed
    # by anything at all (for example a <p> followed by an <area>).
    'map': ("((%block; | form | %misc;)+ | area+)",
            "^(" + _map_block_re + "|(area<)+)$"),
    'table': ("(caption?, (col*|colgroup*), thead?, tfoot?, "
              "(tbody+|tr+))",
              "^(caption<)?((col<)*|(colgroup<)*)(thead<)?(tfoot<)?"
              "((tbody<)+|(tr<)+)$"),
}
# The illegal_ancestors dictionary is a map from element name to a list
# of elements that the element may not be found in (no matter how deep
# in the document tree). This list is derived from [XHTML 1.0, B].
# Built up by prohibition rather than written as one literal: first
# the elements that are only barred from <pre>, then those only barred
# from <button>, then the elements with their own rule.
illegal_ancestors = {}

for _element in ('big', 'img', 'object', 'small', 'sub', 'sup'):
    illegal_ancestors[_element] = ['pre']

for _element in ('fieldset', 'iframe', 'input', 'isindex', 'select',
                 'textarea'):
    illegal_ancestors[_element] = ['button']

# Elements that may not be nested inside themselves.
illegal_ancestors['a'] = ['a']
illegal_ancestors['button'] = ['button']
illegal_ancestors['form'] = ['button', 'form']
illegal_ancestors['label'] = ['button', 'label']

# Don't leave the loop variable behind in the module namespace.
del _element
# 2.5. Attribute definitions
#
# These variables define sets of attributes that are common to a number
# of elements. The variables will be used to help build the attributes
# table in [2.6] below.
# Every entry is a triple (NAME, TYPE, DISPOSITION) in the format
# expected by the attributes table; all the attributes defined here are
# optional (disposition 0).

# Identification and styling attributes.
attrs_coreattrs = [
    ('id', 'ID', 0),
    ('class', 'Class', 0),
    ('style', 'StyleSheet', 0),
    ('title', 'Text', 0),
]

# Language and text-direction attributes.
attrs_i18n = [
    ('lang', 'LanguageCode', 0),
    ('xml:lang', 'LanguageCode', 0),
    ('dir', ['ltr', 'rtl'], 0),
]

# Intrinsic event handlers: all of them take a script as their value.
attrs_events = []
for _event in ('onclick', 'ondblclick', 'onmousedown', 'onmouseup',
               'onmouseover', 'onmousemove', 'onmouseout',
               'onkeypress', 'onkeydown', 'onkeyup'):
    attrs_events.append((_event, 'Script', 0))
del _event

# Attributes for elements that can receive the input focus.
attrs_focus = [
    ('accesskey', 'Character', 0),
    ('tabindex', 'Number', 0),
    ('onfocus', 'Script', 0),
    ('onblur', 'Script', 0),
]

# Horizontal and vertical alignment inside table cells.
attrs_cellhalign = [
    ('align', ['left', 'center', 'right', 'justify', 'char'], 0),
    ('char', 'Character', 0),
    ('charoff', 'Length', 0),
]
attrs_cellvalign = [
    ('valign', ['top', 'middle', 'bottom', 'baseline'], 0),
]

# Text alignment for block elements such as <div>, <p> and headings.
attrs_TextAlign = [
    ('align', ['left', 'center', 'right'], 0),
]

# The combination used by most elements.
attrs_attrs = attrs_coreattrs + attrs_i18n + attrs_events
# 2.6. Attributes
#
# This dictionary maps element name to a list of legal attributes for
# that element. Each member of the list is a triple (NAME, TYPE,
# DISPOSITION). NAME is the name of the attribute. TYPE is either a
# string naming the type of the attribute value, or a list of strings
# which are the valid values for the attribute. DISPOSITION is either 0
# (meaning optional), 1 (meaning required), or a string which is the
# default value for the attribute.
# Enumerated attribute types.  Each list holds the legal values for one
# enumerated type from the DTD; using the list (rather than a type name
# string) as the TYPE of an attribute makes startElement() check the
# value against it.
type_CAlign = ['top', 'bottom', 'left', 'right']
type_ImgAlign = ['top', 'middle', 'bottom', 'left', 'right']
type_InputType = ['text', 'password', 'checkbox', 'radio', 'submit',
                  'reset', 'file', 'hidden', 'image', 'button']
type_LAlign = ['top', 'bottom', 'left', 'right']
type_Scope = ['row', 'col', 'rowgroup', 'colgroup']
type_Shape = ['rect', 'circle', 'poly', 'default']
type_TAlign = ['left', 'center', 'right']
type_TFrame = ['void', 'above', 'below', 'hsides', 'lhs', 'rhs',
               'vsides', 'box', 'border']
type_TRules = ['none', 'groups', 'rows', 'cols', 'all']

# Maps element name to its list of (NAME, TYPE, DISPOSITION) triples.
# A TYPE given as a string is only checked when attribute_checkers has
# an entry of that name; a TYPE given as a list is always checked.
attributes = {
    'a': attrs_attrs + attrs_focus + [
        ('charset', 'Charset', 0),
        ('type', 'ContentType', 0),
        ('name', 'NMTOKEN', 0),
        ('href', 'URI', 0),
        ('hreflang', 'LanguageCode', 0),
        ('rel', 'LinkTypes', 0),
        ('rev', 'LinkTypes', 0),
        ('shape', type_Shape, "rect"),
        ('coords', 'Coords', 0),
        ('target', 'FrameTarget', 0),
    ],
    'abbr': attrs_attrs,
    'acronym': attrs_attrs,
    'address': attrs_attrs,
    'applet': attrs_coreattrs + [
        ('codebase', 'URI', 0),
        ('archive', 'CDATA', 0),
        ('code', 'CDATA', 0),
        ('object', 'CDATA', 0),
        ('alt', 'Text', 0),
        ('name', 'NMTOKEN', 0),
        ('width', 'Length', 1),
        ('height', 'Length', 1),
        ('align', type_ImgAlign, 0),
        ('hspace', 'Pixels', 0),
        ('vspace', 'Pixels', 0),
    ],
    'area': attrs_attrs + attrs_focus + [
        ('shape', type_Shape, "rect"),
        ('coords', 'Coords', 0),
        ('href', 'URI', 0),
        ('nohref', ['nohref'], 0),
        ('alt', 'Text', 1),
        ('target', 'FrameTarget', 0),
    ],
    'b': attrs_attrs,
    'base': [
        ('href', 'URI', 0),
        ('target', 'FrameTarget', 0),
    ],
    'basefont': [
        ('id', 'ID', 0),
        ('size', 'CDATA', 1),
        ('color', 'Color', 0),
        ('face', 'CDATA', 0),
    ],
    'bdo': attrs_coreattrs + attrs_events + [
        ('lang', 'LanguageCode', 0),
        ('xml:lang', 'LanguageCode', 0),
        ('dir', ['ltr', 'rtl'], 1),
    ],
    'big': attrs_attrs,
    'blockquote': attrs_attrs + [
        ('cite', 'URI', 0),
    ],
    'body': attrs_attrs + [
        ('onload', 'Script', 0),
        ('onunload', 'Script', 0),
        ('background', 'URI', 0),
        ('bgcolor', 'Color', 0),
        ('text', 'Color', 0),
        ('link', 'Color', 0),
        ('vlink', 'Color', 0),
        ('alink', 'Color', 0),
    ],
    'br': attrs_coreattrs + [
        ('clear', ['left', 'all', 'right', 'none'], "none"),
    ],
    'button': attrs_attrs + attrs_focus + [
        ('name', 'CDATA', 0),
        ('value', 'CDATA', 0),
        ('type', ['button', 'submit', 'reset'], "submit"),
        ('disabled', ['disabled'], 0),
    ],
    'caption': attrs_attrs + [
        ('align', type_CAlign, 0),
    ],
    'center': attrs_attrs,
    'cite': attrs_attrs,
    'code': attrs_attrs,
    'col': attrs_attrs + attrs_cellhalign + attrs_cellvalign + [
        ('span', 'Number', "1"),
        ('width', 'MultiLength', 0),
    ],
    'colgroup': attrs_attrs + attrs_cellhalign + attrs_cellvalign + [
        ('span', 'Number', "1"),
        ('width', 'MultiLength', 0),
    ],
    'dd': attrs_attrs,
    'del': attrs_attrs + [
        ('cite', 'URI', 0),
        ('datetime', 'Datetime', 0),
    ],
    'dfn': attrs_attrs,
    'dir': attrs_attrs + [
        ('compact', ['compact'], 0),
    ],
    'div': attrs_attrs + attrs_TextAlign,
    'dl': attrs_attrs + [
        ('compact', ['compact'], 0),
    ],
    'dt': attrs_attrs,
    'em': attrs_attrs,
    'fieldset': attrs_attrs,
    'font': attrs_coreattrs + attrs_i18n + [
        ('size', 'CDATA', 0),
        ('color', 'Color', 0),
        ('face', 'CDATA', 0),
    ],
    'form': attrs_attrs + [
        ('action', 'URI', 1),
        ('method', ['get', 'post'], "get"),
        ('name', 'NMTOKEN', 0),
        ('enctype', 'ContentType', "application/x-www-form-urlencoded"),
        ('onsubmit', 'Script', 0),
        ('onreset', 'Script', 0),
        ('accept', 'ContentTypes', 0),
        ('accept-charset', 'Charsets', 0),
        ('target', 'FrameTarget', 0),
    ],
    'h1': attrs_attrs + attrs_TextAlign,
    'h2': attrs_attrs + attrs_TextAlign,
    'h3': attrs_attrs + attrs_TextAlign,
    'h4': attrs_attrs + attrs_TextAlign,
    'h5': attrs_attrs + attrs_TextAlign,
    'h6': attrs_attrs + attrs_TextAlign,
    'head': attrs_i18n + [
        ('profile', 'URI', 0),
    ],
    'hr': attrs_attrs + [
        ('align', ['left', 'center', 'right'], 0),
        ('noshade', ['noshade'], 0),
        ('size', 'Pixels', 0),
        ('width', 'Length', 0),
    ],
    'html': attrs_i18n + [
        ('xmlns', 'URI', 'http://www.w3.org/1999/xhtml'),
    ],
    'i': attrs_attrs,
    'iframe': attrs_coreattrs + [
        ('longdesc', 'URI', 0),
        ('name', 'NMTOKEN', 0),
        ('src', 'URI', 0),
        ('frameborder', ['1', '0'], "1"),
        ('marginwidth', 'Pixels', 0),
        ('marginheight', 'Pixels', 0),
        ('scrolling', ['yes', 'no', 'auto'], "auto"),
        ('align', type_ImgAlign, 0),
        ('height', 'Length', 0),
        ('width', 'Length', 0),
    ],
    'img': attrs_attrs + [
        ('src', 'URI', 1),
        ('alt', 'Text', 1),
        ('name', 'NMTOKEN', 0),
        ('longdesc', 'URI', 0),
        ('height', 'Length', 0),
        ('width', 'Length', 0),
        ('usemap', 'URI', 0),
        ('ismap', ['ismap'], 0),
        ('align', type_ImgAlign, 0),
        ('border', 'Length', 0),
        ('hspace', 'Pixels', 0),
        ('vspace', 'Pixels', 0),
    ],
    'input': attrs_attrs + attrs_focus + [
        # %InputType; is an enumeration in the DTD, so check the value.
        ('type', type_InputType, "text"),
        ('name', 'CDATA', 0),
        ('value', 'CDATA', 0),
        ('checked', ['checked'], 0),
        ('disabled', ['disabled'], 0),
        ('readonly', ['readonly'], 0),
        ('size', 'CDATA', 0),
        ('maxlength', 'Number', 0),
        ('src', 'URI', 0),
        ('alt', 'CDATA', 0),
        ('usemap', 'URI', 0),
        ('onselect', 'Script', 0),
        ('onchange', 'Script', 0),
        ('accept', 'ContentTypes', 0),
        ('align', type_ImgAlign, 0),
    ],
    'ins': attrs_attrs + [
        ('cite', 'URI', 0),
        ('datetime', 'Datetime', 0),
    ],
    'isindex': attrs_coreattrs + attrs_i18n + [
        ('prompt', 'Text', 0),
    ],
    'kbd': attrs_attrs,
    'label': attrs_attrs + [
        ('for', 'IDREF', 0),
        ('accesskey', 'Character', 0),
        ('onfocus', 'Script', 0),
        ('onblur', 'Script', 0),
    ],
    'legend': attrs_attrs + [
        ('accesskey', 'Character', 0),
        # %LAlign; is an enumeration in the DTD, so check the value.
        ('align', type_LAlign, 0),
    ],
    'li': attrs_attrs + [
        ('type', 'LIStyle', 0),
        ('value', 'Number', 0),
    ],
    'link': attrs_attrs + [
        ('charset', 'Charset', 0),
        ('href', 'URI', 0),
        ('hreflang', 'LanguageCode', 0),
        ('type', 'ContentType', 0),
        ('rel', 'LinkTypes', 0),
        ('rev', 'LinkTypes', 0),
        ('media', 'MediaDesc', 0),
        ('target', 'FrameTarget', 0),
    ],
    'map': attrs_i18n + attrs_events + [
        ('id', 'ID', 1),
        ('class', 'CDATA', 0),
        ('style', 'StyleSheet', 0),
        ('title', 'Text', 0),
        ('name', 'CDATA', 0),
    ],
    'menu': attrs_attrs + [
        ('compact', ['compact'], 0),
    ],
    'meta': attrs_i18n + [
        ('http-equiv', 'CDATA', 0),
        ('name', 'CDATA', 0),
        ('content', 'CDATA', 1),
        ('scheme', 'CDATA', 0),
    ],
    'noframes': attrs_attrs,
    'noscript': attrs_attrs,
    'object': attrs_attrs + [
        ('declare', ['declare'], 0),
        ('classid', 'URI', 0),
        ('codebase', 'URI', 0),
        ('data', 'URI', 0),
        ('type', 'ContentType', 0),
        ('codetype', 'ContentType', 0),
        ('archive', 'UriList', 0),
        ('standby', 'Text', 0),
        ('height', 'Length', 0),
        ('width', 'Length', 0),
        ('usemap', 'URI', 0),
        ('name', 'NMTOKEN', 0),
        ('tabindex', 'Number', 0),
        ('align', type_ImgAlign, 0),
        ('border', 'Pixels', 0),
        ('hspace', 'Pixels', 0),
        ('vspace', 'Pixels', 0),
    ],
    'ol': attrs_attrs + [
        ('type', ['1', 'a', 'A', 'i', 'I'], 0),
        ('compact', ['compact'], 0),
        ('start', 'Number', 0),
    ],
    'optgroup': attrs_attrs + [
        ('disabled', ['disabled'], 0),
        ('label', 'Text', 1),
    ],
    'option': attrs_attrs + [
        ('selected', ['selected'], 0),
        ('disabled', ['disabled'], 0),
        ('label', 'Text', 0),
        ('value', 'CDATA', 0),
    ],
    'p': attrs_attrs + attrs_TextAlign,
    'param': [
        ('id', 'ID', 0),
        ('name', 'CDATA', 1),
        ('value', 'CDATA', 0),
        ('valuetype', ['data', 'ref', 'object'], "data"),
        ('type', 'ContentType', 0),
    ],
    'pre': attrs_attrs + [
        ('width', 'Number', 0),
        ('xml:space', ['preserve'], 'preserve'),
    ],
    'q': attrs_attrs + [
        ('cite', 'URI', 0),
    ],
    's': attrs_attrs,
    'samp': attrs_attrs,
    'script': [
        ('charset', 'Charset', 0),
        ('type', 'ContentType', 1),
        ('language', 'CDATA', 0),
        ('src', 'URI', 0),
        ('defer', ['defer'], 0),
        ('xml:space', ['preserve'], 'preserve'),
    ],
    'select': attrs_attrs + [
        ('name', 'CDATA', 0),
        ('size', 'Number', 0),
        ('multiple', ['multiple'], 0),
        ('disabled', ['disabled'], 0),
        ('tabindex', 'Number', 0),
        ('onfocus', 'Script', 0),
        ('onblur', 'Script', 0),
        ('onchange', 'Script', 0),
    ],
    'small': attrs_attrs,
    'span': attrs_attrs,
    'strike': attrs_attrs,
    'strong': attrs_attrs,
    'style': attrs_i18n + [
        ('type', 'ContentType', 1),
        ('media', 'MediaDesc', 0),
        ('title', 'Text', 0),
        ('xml:space', ['preserve'], 'preserve'),
    ],
    'sub': attrs_attrs,
    'sup': attrs_attrs,
    'table': attrs_attrs + [
        ('summary', 'Text', 0),
        ('width', 'Length', 0),
        ('border', 'Pixels', 0),
        ('frame', type_TFrame, 0),
        ('rules', type_TRules, 0),
        ('cellspacing', 'Length', 0),
        ('cellpadding', 'Length', 0),
        # Use the value list, not the type name, so that the value is
        # actually checked.
        ('align', type_TAlign, 0),
        ('bgcolor', 'Color', 0),
    ],
    'tbody': attrs_attrs + attrs_cellhalign + attrs_cellvalign,
    'td': attrs_attrs + attrs_cellhalign + attrs_cellvalign + [
        ('abbr', 'Text', 0),
        ('axis', 'CDATA', 0),
        ('headers', 'IDREFS', 0),
        ('scope', type_Scope, 0),
        ('rowspan', 'Number', "1"),
        ('colspan', 'Number', "1"),
        ('nowrap', ['nowrap'], 0),
        ('bgcolor', 'Color', 0),
        ('width', 'Pixels', 0),
        ('height', 'Pixels', 0),
    ],
    'textarea': attrs_attrs + attrs_focus + [
        ('name', 'CDATA', 0),
        ('rows', 'Number', 1),
        ('cols', 'Number', 1),
        ('disabled', ['disabled'], 0),
        ('readonly', ['readonly'], 0),
        ('onselect', 'Script', 0),
        ('onchange', 'Script', 0),
    ],
    'tfoot': attrs_attrs + attrs_cellhalign + attrs_cellvalign,
    'th': attrs_attrs + attrs_cellhalign + attrs_cellvalign + [
        ('abbr', 'Text', 0),
        ('axis', 'CDATA', 0),
        ('headers', 'IDREFS', 0),
        ('scope', type_Scope, 0),
        ('rowspan', 'Number', "1"),
        ('colspan', 'Number', "1"),
        ('nowrap', ['nowrap'], 0),
        ('bgcolor', 'Color', 0),
        ('width', 'Pixels', 0),
        ('height', 'Pixels', 0),
    ],
    'thead': attrs_attrs + attrs_cellhalign + attrs_cellvalign,
    'title': attrs_i18n,
    'tr': attrs_attrs + attrs_cellhalign + attrs_cellvalign + [
        ('bgcolor', 'Color', 0),
    ],
    'tt': attrs_attrs,
    'u': attrs_attrs,
    'ul': attrs_attrs + [
        ('type', ['disc', 'square', 'circle'], 0),
        ('compact', ['compact'], 0),
    ],
    'var': attrs_attrs,
}
def make_attribute_map():
    """Index the attributes table by element name and attribute name.

    Returns a dictionary that maps each element name in the attributes
    table to a dictionary from attribute name to the pair (TYPE,
    DISPOSITION).  If an attribute is listed more than once for an
    element, the last entry wins.
    """
    attribute_map = {}
    for element in attributes.keys():
        by_name = {}
        for triple in attributes[element]:
            by_name[triple[0]] = (triple[1], triple[2])
        attribute_map[element] = by_name
    return attribute_map


# Lookup table used when checking the attributes of each start tag.
legal_attributes = make_attribute_map()
# The recommended_attributes dictionary maps element name to a list of
# attributes that are recommended by [Chisholm 2000-11-06] and [HTML
# 4.01].
# Built up by attribute: the image dimensions first, then the elements
# that should carry a title, then those that should carry alternative
# text.
recommended_attributes = {'img': ['height', 'width']}

for _element in ('abbr', 'acronym'):
    recommended_attributes[_element] = ['title']

for _element in ('applet', 'area', 'input'):
    recommended_attributes[_element] = ['alt']

# Don't leave the loop variable behind in the module namespace.
del _element
# 2.7. Attribute checkers
#
# The attribute_checkers dictionary maps attribute type to a function of
# one argument that checks that an attribute value is legal. See [HTML
# 4.01, 6].
# A single character.
character_re = re.compile("^.$")

# A colour is either one of the sixteen colour names below (compared
# without regard to case) or '#' followed by exactly six hexadecimal
# digits (#RRGGBB).
color_re = re.compile("^#[0-9A-Fa-f]{6}$")
colors = ['black', 'green', 'silver', 'lime', 'gray', 'olive', 'white',
          'yellow', 'maroon', 'navy', 'red', 'blue', 'purple', 'teal',
          'fuchsia', 'aqua']

# Date and time with a mandatory time zone: "Z" or an offset "+hh:mm" /
# "-hh:mm".
datetime_re = re.compile("^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}"
                         "(Z|[+-]\\d{2}:\\d{2})$")

# An identifier starts with a letter and continues with letters,
# digits, '_', ':', '.' and '-'.  The '-' is written last in the
# character class so that it can't be misread as part of a range.
id_re = re.compile("^[A-Za-z][A-Za-z0-9_:.-]*$")
idref_re = re.compile("^[A-Za-z][A-Za-z0-9_:.-]*$")

# One or more identifiers separated by runs of spaces.
idrefs_re = re.compile("^[A-Za-z]([A-Za-z0-9_:.-]| +[A-Za-z])*$")

# A length is a number of pixels or a percentage; a multi-length may
# also be a relative length ending in '*'.
length_re = re.compile("^\\d+%?$")
multilength_re = re.compile("^\\d+[%*]?$")
number_re = re.compile("^\\d+$")

# Maps attribute type name to a function of the attribute value that
# returns a true value if the value is legal for that type.  Types with
# no entry here are not checked.
attribute_checkers = {
    'Character': lambda v: character_re.match(v),
    # Use the string method rather than string.lower(): it behaves the
    # same and doesn't depend on the string module's function API.
    'Color': lambda v: color_re.match(v) or v.lower() in colors,
    'Datetime': lambda v: datetime_re.match(v),
    'ID': lambda v: id_re.match(v),
    'IDREF': lambda v: idref_re.match(v),
    'IDREFS': lambda v: idrefs_re.match(v),
    'Length': lambda v: length_re.match(v),
    'MultiLength': lambda v: multilength_re.match(v),
    # NOTE(review): NMTOKEN values are checked with the same pattern as
    # ID values, which is stricter than an XML name token -- presumably
    # because 'name' is expected to equal 'id'; confirm.
    'NMTOKEN': lambda v: id_re.match(v),
    'Number': lambda v: number_re.match(v),
    'Pixels': lambda v: number_re.match(v),
}
# 3. CHECK XHTML
#
# The error_sets dictionary defines sets of error messages.
# 'accessibility' errors violate rules in [Chisholm 2000-11-06];
# 'ravenbrook' errors violate rules in [Rules]; 'xhtml-1.0' errors
# violate rules in [XHTML 1.0].
# Maps the name of each error set to the list of error codes (the
# numbers passed to handler.err and handler.write_error) in that set.
error_sets = {}
error_sets['accessibility'] = [26]
error_sets['ravenbrook'] = [5, 6] + [17, 18, 19, 20] + [23, 21, 25]
error_sets['xhtml-1.0'] = ([1, 2, 3, 4]
                           + [7, 8, 9, 10, 11, 12, 13]
                           + [15, 16, 22, 27])
class handler(xml.sax.handler.ContentHandler):
    """Check one XHTML document as it is parsed.

    An instance acts as both the SAX content handler and the SAX error
    handler for a single document (see the check() method).  Each
    problem found is written to the error stream as a line of the form
    "DOC(LINE) [CODE] MESSAGE"; XML parsing errors are written without
    the "[CODE]" part.

    doc is the name of the document, used in error messages.  noerrors
    is a list of error codes to suppress.  error_stream is any object
    with a write() method.
    """

    class exception(Exception):
        """Raised by fatalError() to abandon parsing; caught by check().

        This is an exception class rather than a string because string
        exceptions can't be raised in Python 2.6 or later.
        """

    locator = None
    current_heading_level = 0
    current_section = None
    error_stream = None

    def __init__(self, doc, noerrors=None, error_stream=sys.stderr):
        xml.sax.handler.ContentHandler.__init__(self)
        self.doc = doc
        if noerrors is None:
            noerrors = []
        self.noerrors = noerrors
        # stack[-1] is the list of names of the child elements seen so
        # far in the element that is currently open (stack[0] belongs
        # to the document itself).
        self.stack = [[]]
        # stack2 holds an (element name, line number) pair for each
        # element that is currently open, outermost first.
        self.stack2 = []
        self.ids = {}
        self.nocase_ids = {}        # Lower-case keys.
        self.cross_refs = []
        self.refs = {}
        self.idrefs = []
        self.error_stream = error_stream

    def err(self, code, msg):
        """Report error 'code' at the parser's current line."""
        self.write_error(code, msg, self.locator.getLineNumber())

    def write_error(self, code, msg, line):
        """Write an error message unless 'code' is being suppressed."""
        if code not in self.noerrors:
            msg = "%s(%d) [%d] %s\n" % (self.doc, line, code, msg)
            self.error_stream.write(msg)

    def setDocumentLocator(self, locator):
        self.locator = locator

    def startElement(self, element, attrs):
        """Check an element's position, attributes and document rules."""
        line = self.locator.getLineNumber()
        if self.stack2:
            parent = self.stack2[-1][0]
        else:
            parent = None
        self.stack2.append((element, line))
        self.stack[-1].append(element)
        self.stack.append([])

        # Check that element is legal in XHTML.
        if element not in legal_elements:
            self.err(1, "<%s> element is not legal in XHTML 1.0 "
                     "Transitional." % element)
            return

        # Check that the element may legally appear at this point in the
        # document.
        if (parent and parent in legal_elements
            and element not in legal_elements[parent]):
            self.err(2, "Element <%s> appears in <%s> (not allowed)."
                     % (element, parent))
        if not parent and element != 'html':
            self.err(3, "Top-level element is <%s> (should be <html>)."
                     % element)
        if element in illegal_ancestors:
            for ancestor, ancestor_line in self.stack2[0:-1]:
                if ancestor in illegal_ancestors[element]:
                    self.err(16, "Element <%s> appears below <%s> on "
                             "line %d (not allowed)."
                             % (element, ancestor, ancestor_line))

        # Check attributes.
        element_attrs = legal_attributes[element]
        for attr, value in attrs.items():
            # Check that the attribute is legal for the element.
            if attr not in element_attrs:
                self.err(4, "Attribute '%s' is not allowed in <%s>."
                         % (attr, element))
                continue

            # Check that the value for the attribute is legal.  The
            # type is either a list of legal values or the name of a
            # type, which is checked only if it has a checker.
            attr_type, disposition = element_attrs[attr]
            if isinstance(attr_type, list):
                if value not in attr_type:
                    self.err(10, "Attribute '%s' for element <%s> has "
                             "value '%s' (must be one of [%s])."
                             % (attr, element, value,
                                ', '.join(attr_type)))
            elif attr_type in attribute_checkers:
                if not attribute_checkers[attr_type](value):
                    self.err(11, "Attribute '%s' for element <%s> has "
                             "illegal value '%s' (not a %s)."
                             % (attr, element, value, attr_type))

            # Check that id is unique (ignoring case) within the
            # document.
            if attr == 'id':
                nocase_value = value.lower()
                orig_line = self.nocase_ids.get(nocase_value, None)
                if orig_line:
                    self.err(12, "Duplicate id '%s' (original on line "
                             "%d)." % (value, orig_line))
                self.ids[value] = 1
                self.nocase_ids[nocase_value] = line

            # Remember IDREF and IDREFS for checking later.
            if attr_type in ['IDREF', 'IDREFS']:
                for idref in re.split(" +", value):
                    self.idrefs.append((idref, line))

        # Check that required attributes are present.
        for attr, (attr_type, disposition) in element_attrs.items():
            if disposition == 1 and attr not in attrs:
                self.err(13, "Attribute '%s' is required for <%s> but "
                         "not present." % (attr, element))

        # Check that recommended attributes are present.
        if element in recommended_attributes:
            for attr in recommended_attributes[element]:
                if attr not in attrs:
                    self.err(26, "Attribute '%s' is recommended for "
                             "<%s> but not present." % (attr, element))

        # <td valign="..."> is deprecated: should set same valign for
        # all cells in the row using <tr valign="...">.
        if element in ['td', 'th'] and 'valign' in attrs:
            self.err(20, "<%s> has valign attribute: better in the "
                     "<tr>." % element)

        # Test against rule xhtml/id.
        has_id = 'id' in attrs
        has_name = 'name' in attrs
        if element == 'a' and has_id and not has_name:
            self.err(5, "<%s> element has 'id' attribute but no 'name' "
                     "attribute." % element)
        if element == 'a' and has_name and not has_id:
            self.err(6, "<%s> element has 'name' attribute but no 'id' "
                     "attribute." % element)
        if has_name and has_id and attrs['name'] != attrs['id']:
            self.err(7, "<%s> element has id '%s' but name '%s'."
                     % (element, attrs['id'], attrs['name']))

        # Test against rule xhtml/section.  Remember current section
        # number.
        if element == 'a' and parent in ['h2', 'h3', 'h4', 'h5', 'h6']:
            level = int(re.match('h([0-9])$', parent).group(1))
            if not has_id:
                self.err(17, "Section anchor has no 'id' attribute.")
            elif re.match("section-", attrs['id']):
                # An <h2> anchor looks like "section-1", an <h3> anchor
                # like "section-1.1", and so on.
                ref_re = re.compile("section-(\\w+"
                                    + "\\.[0-9]+" * max(0, level - 2)
                                    + ")$")
                ref_example = "section-1" + ".1" * max(0, level - 2)
                match = ref_re.match(attrs['id'])
                if not match:
                    self.err(18, "Anchor for <%s> has id '%s': should "
                             "look like '%s'."
                             % (parent, attrs['id'], ref_example))
                else:
                    self.current_section = match.group(1)

        # Check (some of) rule xhtml/ref-id: this applies to anchors
        # directly inside a table cell while in section A.
        if (self.current_section == 'A' and len(self.stack2) >= 4
            and element == 'a' and self.stack2[-2][0] == 'td'
            and self.stack2[-3][0] == 'tr'
            and self.stack2[-4][0] == 'table'):
            if has_id and not re.match("ref-", attrs['id']):
                self.err(21, "Reference anchor has id '%s': should "
                         "start with 'ref-'." % attrs['id'])
            # Remember the target of the reference so that we can check
            # the rule xhtml/ref-link in the endDocument() method.
            if has_id and 'href' in attrs:
                self.refs[attrs['id']] = attrs['href']

        # Remember cross-refs for later checking against the set of ids.
        if (element == 'a' and 'href' in attrs and attrs['href']
            and attrs['href'][0] == '#'):
            self.cross_refs.append((attrs['href'][1:], line))

        # Check ordering of sections (<h3> can't follow <h1>).
        if element in elt_heading:
            level = int(re.match('h([0-9])$', element).group(1))
            if level > self.current_heading_level + 1:
                self.err(25, "<%s> follows <h%d>."
                         % (element, self.current_heading_level))
            self.current_heading_level = level

    def characters(self, content):
        """Check that character data is allowed in the open element."""
        if self.stack2:
            element = self.stack2[-1][0]
            # Check that non-blank character data is part of an element
            # only when #PCDATA is legal contents.
            if (element in legal_elements
                and '#PCDATA' not in legal_elements[element]
                and not re.match('^\\s+$', content)):
                self.err(8, "<%s> element contains character data '%s'."
                         % (element, content))

    def endElement(self, element):
        """Check the complete contents of the element being closed."""
        contents = self.stack[-1]

        # Check that required contents are present.
        if not contents and element in nonempty_elements:
            self.err(9, "<%s> element is empty." % element)

        # Extra constraints.  See [2.4].
        if contents and element in constraints:
            if not re.match(constraints[element][1],
                            '<'.join(contents) + '<'):
                self.err(15, "Contents of <%s> element [%s] doesn't "
                         "match XHTML specification %s."
                         % (element, ', '.join(contents),
                            constraints[element][0]))

        # Section headings (except <h1>) must have anchors so they can
        # be referred to (rule xhtml/section).
        if (element in ['h2', 'h3', 'h4', 'h5', 'h6']
            and 'a' not in contents):
            self.err(19, "<%s> has no section anchor." % element)

        # Pop the stacks.
        self.stack2 = self.stack2[0:-1]
        self.stack = self.stack[0:-1]
        assert self.stack[-1][-1] == element

    def endDocument(self):
        """Check the references that can only be resolved at the end."""
        # Check that cross-references have a target.
        for target, line in self.cross_refs:
            if target not in self.ids:
                self.write_error(22, "Cross-reference '#%s' has no "
                                 "target." % target, line)
            # Check the xhtml/ref-link rule.
            elif target in self.refs:
                self.write_error(23, "Cross-reference '#%s' to "
                                 "references section should link to "
                                 "target '%s' instead."
                                 % (target, self.refs[target]), line)

        # Check that IDREFs have a target.
        for idref, line in self.idrefs:
            if idref not in self.ids:
                self.write_error(27, "IDREF '%s' has no target."
                                 % (idref,), line)

    def error(self, exception):
        """Write an XML parsing error to the error stream.

        These messages have no error code and so can't be suppressed
        with noerrors.
        """
        line = exception.getLineNumber()
        message = exception.getMessage()
        # This unfortunately depends on the XML parser producing this
        # error.
        if message == 'mismatched tag' and self.stack2:
            message = ("Mismatched closing tag (opening tag was <%s> "
                       "at line %d)." % self.stack2[-1])
        self.error_stream.write("%s(%d) %s\n"
                                % (self.doc, line, message))

    def fatalError(self, exception):
        """Report the error, then abandon parsing of the document."""
        self.error(exception)
        raise self.exception("Fatal XML parsing error.")

    def warning(self, exception):
        self.error(exception)

    def check(self, path_or_stream):
        """Parse and check the document at the given path or stream.

        A fatal XML parsing error is reported on the error stream and
        ends the check; it is not propagated to the caller.
        """
        try:
            xml.sax.parse(path_or_stream, self, self)
        except self.exception:
            pass
# 4. CHECK DIRECTORIES AND FILES
class checker:
    """Check XHTML files, directories of them, or streams.

    skip is a list of file and directory names that are passed over
    when checking a directory.  noerrors is a list of error codes to
    suppress.  error_stream is any object with a write() method; error
    messages are written to it.
    """

    noerrors = []
    skip = []

    def __init__(self, skip=None, noerrors=None,
                 error_stream=sys.stderr):
        if skip is None:
            skip = []
        if noerrors is None:
            noerrors = []
        self.skip = skip
        self.noerrors = noerrors
        self.error_stream = error_stream

    def check(self, path):
        """Check the file at path, or every file below the directory.

        Directories are checked recursively, in sorted order, leaving
        out entries whose name is in the skip list.  A file is checked
        only if its name ends in ".html" and its first line starts with
        "<?xml"; anything else is silently ignored.
        """
        if os.path.isdir(path):
            # os.listdir is used in place of dircache.listdir: it can't
            # return a stale listing and is available in every version
            # of Python.  Sort so the order of checking is predictable.
            entries = os.listdir(path)
            entries.sort()
            for entry in entries:
                if entry not in self.skip:
                    self.check(os.path.join(path, entry))
        elif (os.path.isfile(path) and re.search("\\.html$", path)
              and self._has_xml_declaration(path)):
            handler(path, self.noerrors, self.error_stream).check(path)

    def _has_xml_declaration(self, path):
        """Return true if the first line of the file starts "<?xml"."""
        stream = open(path)
        try:
            first_line = stream.readline()
        finally:
            # Close explicitly so that checking a large tree doesn't
            # run out of file descriptors.
            stream.close()
        return re.match("<\\?xml", first_line)

    def check_stream(self, name, stream):
        """Check the document read from stream; name is used in messages."""
        handler(name, self.noerrors, self.error_stream).check(stream)
# 5. COMMAND-LINE INTERFACE
def run():
    """Check the paths named on the command line.

    Options: -s NAMES / --skip=NAMES gives a comma-separated list of
    file and directory names to skip; -n CODES / --noerror=CODES gives
    a comma-separated list of error codes to suppress.  Both options
    may be repeated.  Errors are written to the checker's default
    error stream.
    """
    options, paths = getopt.getopt(sys.argv[1:], 's:n:',
                                   ['skip=', 'noerror='])
    skip = []
    noerrors = []
    for option, argument in options:
        if option in ('-s', '--skip'):
            skip += argument.split(',')
        if option in ('-n', '--noerror'):
            noerrors += [int(code) for code in argument.split(',')]
    xhtml_checker = checker(skip, noerrors)
    for path in paths:
        xhtml_checker.check(path)


if __name__ == "__main__":
    run()
# A. REFERENCES
#
# [Chisholm 2000-11-06] "HTML Techniques for Web Content Accessibility
# Guidelines 1.0"; Wendy Chisholm, Gregg Vanderheiden, Ian Jacobs;
# 2000-11-06; <http://www.w3.org/TR/WCAG10-TECHS/>.
#
# [HTML 4.01] "HTML 4.01 Specification"; World Wide Web Consortium;
# 1999-12-24; <http://www.w3.org/TR/html4/>.
#
# [Jacobs 2001-04-09] "User Agent Accessibility Guidelines 1.0" (W3C
# Working Draft); Ian Jacobs, Jon Gunderson, Eric Hansen; 2001-04-09;
# <http://www.w3.org/TR/2001/WD-UAAG10-20010409/>.
#
# [Rules] "Rules"; Gareth Rees; Ravenbrook Limited; 2001-04-22;
# <http://info.ravenbrook.com/rule/>.
#
# [SAX] "xml.sax -- Support for SAX2 parsers"; Python;
# <http://www.python.org/doc/current/lib/module-xml.sax.html>.
#
# [XHTML 1.0] "XHTML 1.0: The Extensible HyperText Markup Language";
# World Wide Web Consortium; 2000-01-26; <http://www.w3.org/TR/xhtml1/>.
#
# [XHTML 1.0 DTD] "XHTML 1.0 Transitional Document Type Definition";
# World Wide Web Consortium; 2000-01-26;
# <http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd>.
#
#
# B. DOCUMENT HISTORY
#
# 2001-04-29 GDR Created.
#
# 2001-04-30 GDR Added checking of attributes to the check_xhtml.py
# script. Added Ravenbrook document format checks for section headers
# (rule xhtml/section) and reference ids (rule xhtml/ref-id).
#
# 2001-05-01 GDR Added checks for attribute values, cross-references,
# links to the references section, ordering of sections, XHTML
# constraints for <head>, <html>, <map> and <table>, recommended
# attributes, IDREFs.
#
# 2001-05-03 GDR For mismatched tags, report the opening tag and its
# line number.
#
# 2001-05-07 GDR Use the 'handler' class as both ContentHandler and
# ErrorHandler. New method check_stream takes a stream argument, not a
# path.
#
# 2001-07-25 GDR Handler and checker classes take error_stream object as
# parameter (defaults to sys.stderr) so that they can be used in other
# checking situations (e.g., under unittest.py). Added section on use.
#
#
# C. COPYRIGHT AND LICENCE
#
# Copyright 2001 Gareth Rees. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the
# distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDERS AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
# DAMAGE.
#
#
# $Id: //info.ravenbrook.com/project/p4dti/version/2.0/test/check_xhtml.py#1 $