# Author: David Goodger
# Contact: goodger@users.sourceforge.net
# Revision: $Revision: 4219 $
# Date: $Date: 2005-12-15 15:32:01 +0100 (Thu, 15 Dec 2005) $
# Copyright: This module has been placed in the public domain.
"""
Simple HyperText Markup Language document tree Writer.
The output conforms to the XHTML version 1.0 Transitional DTD
(*almost* strict). The output contains a minimum of formatting
information. The cascading style sheet "html4css1.css" is required
for proper viewing with a modern graphical browser.
"""
__docformat__ = 'reStructuredText'
import sys
import os
import os.path
import time
import re
from types import ListType
try:
import Image # check for the Python Imaging Library
except ImportError:
Image = None
import docutils
from docutils import frontend, nodes, utils, writers, languages
class Writer(writers.Writer):
supported = ('html', 'html4css1', 'xhtml')
"""Formats this writer supports."""
default_stylesheet = 'html4css1.css'
default_stylesheet_path = utils.relative_path(
os.path.join(os.getcwd(), 'dummy'),
os.path.join(os.path.dirname(__file__), default_stylesheet))
settings_spec = (
'HTML-Specific Options',
None,
(('Specify a stylesheet URL, used verbatim. Overrides '
'--stylesheet-path.',
['--stylesheet'],
{'metavar': '', 'overrides': 'stylesheet_path'}),
('Specify a stylesheet file, relative to the current working '
'directory. The path is adjusted relative to the output HTML '
'file. Overrides --stylesheet. Default: "%s"'
% default_stylesheet_path,
['--stylesheet-path'],
{'metavar': '', 'overrides': 'stylesheet',
'default': default_stylesheet_path}),
('Embed the stylesheet in the output HTML file. The stylesheet '
'file must be accessible during processing (--stylesheet-path is '
'recommended). This is the default.',
['--embed-stylesheet'],
{'default': 1, 'action': 'store_true',
'validator': frontend.validate_boolean}),
('Link to the stylesheet in the output HTML file. Default: '
'embed the stylesheet, do not link to it.',
['--link-stylesheet'],
{'dest': 'embed_stylesheet', 'action': 'store_false',
'validator': frontend.validate_boolean}),
('Specify the initial header level. Default is 1 for "
". '
'Does not affect document title & subtitle (see --no-doc-title).',
['--initial-header-level'],
{'choices': '1 2 3 4 5 6'.split(), 'default': '1',
'metavar': ''}),
('Specify the maximum width (in characters) for one-column field '
'names. Longer field names will span an entire row of the table '
'used to render the field list. Default is 14 characters. '
'Use 0 for "no limit".',
['--field-name-limit'],
{'default': 14, 'metavar': '',
'validator': frontend.validate_nonnegative_int}),
('Specify the maximum width (in characters) for options in option '
'lists. Longer options will span an entire row of the table used '
'to render the option list. Default is 14 characters. '
'Use 0 for "no limit".',
['--option-limit'],
{'default': 14, 'metavar': '',
'validator': frontend.validate_nonnegative_int}),
('Format for footnote references: one of "superscript" or '
'"brackets". Default is "brackets".',
['--footnote-references'],
{'choices': ['superscript', 'brackets'], 'default': 'brackets',
'metavar': '',
'overrides': 'trim_footnote_reference_space'}),
('Format for block quote attributions: one of "dash" (em-dash '
'prefix), "parentheses"/"parens", or "none". Default is "dash".',
['--attribution'],
{'choices': ['dash', 'parentheses', 'parens', 'none'],
'default': 'dash', 'metavar': ''}),
('Remove extra vertical whitespace between items of "simple" bullet '
'lists and enumerated lists. Default: enabled.',
['--compact-lists'],
{'default': 1, 'action': 'store_true',
'validator': frontend.validate_boolean}),
('Disable compact simple bullet and enumerated lists.',
['--no-compact-lists'],
{'dest': 'compact_lists', 'action': 'store_false'}),
('Remove extra vertical whitespace between items of simple field '
'lists. Default: enabled.',
['--compact-field-lists'],
{'default': 1, 'action': 'store_true',
'validator': frontend.validate_boolean}),
('Disable compact simple field lists.',
['--no-compact-field-lists'],
{'dest': 'compact_field_lists', 'action': 'store_false'}),
('Omit the XML declaration. Use with caution.',
['--no-xml-declaration'],
{'dest': 'xml_declaration', 'default': 1, 'action': 'store_false',
'validator': frontend.validate_boolean}),
('Obfuscate email addresses to confuse harvesters while still '
'keeping email links usable with standards-compliant browsers.',
['--cloak-email-addresses'],
{'action': 'store_true', 'validator': frontend.validate_boolean}),))
settings_defaults = {'output_encoding_error_handler': 'xmlcharrefreplace'}
relative_path_settings = ('stylesheet_path',)
config_section = 'html4css1 writer'
config_section_dependencies = ('writers',)
def __init__(self):
writers.Writer.__init__(self)
self.translator_class = HTMLTranslator
def translate(self):
self.visitor = visitor = self.translator_class(self.document)
self.document.walkabout(visitor)
self.output = visitor.astext()
for attr in ('head_prefix', 'stylesheet', 'head', 'body_prefix',
'body_pre_docinfo', 'docinfo', 'body', 'fragment',
'body_suffix'):
setattr(self, attr, getattr(visitor, attr))
def assemble_parts(self):
writers.Writer.assemble_parts(self)
for part in ('title', 'subtitle', 'docinfo', 'body', 'header',
'footer', 'meta', 'stylesheet', 'fragment',
'html_prolog', 'html_head', 'html_title', 'html_subtitle',
'html_body'):
self.parts[part] = ''.join(getattr(self.visitor, part))
class HTMLTranslator(nodes.NodeVisitor):
"""
This HTML writer has been optimized to produce visually compact
lists (less vertical whitespace). HTML's mixed content models
allow list items to contain "
body elements
" or
"
just text
" or even "
text
and body
elements
combined
", each with different effects. It would
be best to stick with strict body elements in list items, but they
affect vertical spacing in browsers (although they really
shouldn't).
Here is an outline of the optimization:
- Check for and omit
tags in "simple" lists: list items
contain either a single paragraph, a nested simple list, or a
paragraph followed by a nested simple list. This means that
this list can be compact:
- Item 1.
- Item 2.
But this list cannot be compact:
- Item 1.
This second paragraph forces space between list items.
- Item 2.
- In non-list contexts, omit
tags on a paragraph if that
paragraph is the only child of its parent (footnotes & citations
are allowed a label first).
- Regardless of the above, in definitions, table cells, field bodies,
option descriptions, and list items, mark the first child with
'class="first"' and the last child with 'class="last"'. The stylesheet
sets the margins (top & bottom respectively) to 0 for these elements.
The ``no_compact_lists`` setting (``--no-compact-lists`` command-line
option) disables list whitespace optimization.
"""
xml_declaration = '\n'
doctype = ('\n')
head_prefix_template = ('\n
\n')
content_type = ('\n')
generator = ('\n')
stylesheet_link = '\n'
embedded_stylesheet = '\n'
named_tags = ['a', 'applet', 'form', 'frame', 'iframe', 'img', 'map']
words_and_spaces = re.compile(r'\S+| +|\n')
def __init__(self, document):
nodes.NodeVisitor.__init__(self, document)
self.settings = settings = document.settings
lcode = settings.language_code
self.language = languages.get_language(lcode)
self.meta = [self.content_type % settings.output_encoding,
self.generator % docutils.__version__]
self.head_prefix = []
self.html_prolog = []
if settings.xml_declaration:
self.head_prefix.append(self.xml_declaration
% settings.output_encoding)
# encoding not interpolated:
self.html_prolog.append(self.xml_declaration)
self.head_prefix.extend([self.doctype,
self.head_prefix_template % (lcode, lcode)])
self.html_prolog.append(self.doctype)
self.head = self.meta[:]
stylesheet = utils.get_stylesheet_reference(settings)
self.stylesheet = []
if stylesheet:
if settings.embed_stylesheet:
stylesheet = utils.get_stylesheet_reference(
settings, os.path.join(os.getcwd(), 'dummy'))
settings.record_dependencies.add(stylesheet)
stylesheet_text = open(stylesheet).read()
self.stylesheet = [self.embedded_stylesheet % stylesheet_text]
else:
self.stylesheet = [self.stylesheet_link
% self.encode(stylesheet)]
self.body_prefix = ['\n\n']
# document title, subtitle display
self.body_pre_docinfo = []
# author, date, etc.
self.docinfo = []
self.body = []
self.fragment = []
self.body_suffix = ['\n\n']
self.section_level = 0
self.initial_header_level = int(settings.initial_header_level)
# A heterogenous stack used in conjunction with the tree traversal.
# Make sure that the pops correspond to the pushes:
self.context = []
self.topic_classes = []
self.colspecs = []
self.compact_p = 1
self.compact_simple = None
self.compact_field_list = None
self.in_docinfo = None
self.in_sidebar = None
self.title = []
self.subtitle = []
self.header = []
self.footer = []
self.html_head = [self.content_type] # charset not interpolated
self.html_title = []
self.html_subtitle = []
self.html_body = []
self.in_document_title = 0
self.in_mailto = 0
self.author_in_authors = None
def astext(self):
return ''.join(self.head_prefix + self.head
+ self.stylesheet + self.body_prefix
+ self.body_pre_docinfo + self.docinfo
+ self.body + self.body_suffix)
def encode(self, text):
"""Encode special characters in `text` & return."""
# @@@ A codec to do these and all other HTML entities would be nice.
text = text.replace("&", "&")
text = text.replace("<", "<")
text = text.replace('"', """)
text = text.replace(">", ">")
text = text.replace("@", "@") # may thwart some address harvesters
# Replace the non-breaking space character with the HTML entity:
text = text.replace(u'\u00a0', " ")
return text
def cloak_mailto(self, uri):
"""Try to hide a mailto: URL from harvesters."""
# Encode "@" using a URL octet reference (see RFC 1738).
# Further cloaking with HTML entities will be done in the
# `attval` function.
return uri.replace('@', '%40')
def cloak_email(self, addr):
"""Try to hide the link text of a email link from harversters."""
# Surround at-signs and periods with tags. ("@" has
# already been encoded to "@" by the `encode` method.)
addr = addr.replace('@', '@')
addr = addr.replace('.', '.')
return addr
def attval(self, text,
whitespace=re.compile('[\n\r\t\v\f]')):
"""Cleanse, HTML encode, and return attribute value text."""
encoded = self.encode(whitespace.sub(' ', text))
if self.in_mailto and self.settings.cloak_email_addresses:
# Cloak at-signs ("%40") and periods with HTML entities.
encoded = encoded.replace('%40', '%40')
encoded = encoded.replace('.', '.')
return encoded
def starttag(self, node, tagname, suffix='\n', empty=0, **attributes):
"""
Construct and return a start tag given a node (id & class attributes
are extracted), tag name, and optional attributes.
"""
tagname = tagname.lower()
prefix = []
atts = {}
ids = []
for (name, value) in attributes.items():
atts[name.lower()] = value
classes = node.get('classes', [])
if atts.has_key('class'):
classes.append(atts['class'])
if classes:
atts['class'] = ' '.join(classes)
assert not atts.has_key('id')
ids.extend(node.get('ids', []))
if atts.has_key('ids'):
ids.extend(atts['ids'])
del atts['ids']
if ids:
atts['id'] = ids[0]
for id in ids[1:]:
# Add empty "span" elements for additional IDs. Note
# that we cannot use empty "a" elements because there
# may be targets inside of references, but nested "a"
# elements aren't allowed in XHTML (even if they do
# not all have a "href" attribute).
if empty:
# Empty tag. Insert target right in front of element.
prefix.append('' % id)
else:
# Non-empty tag. Place the auxiliary tag
# *inside* the element, as the first child.
suffix += '' % id
# !!! next 2 lines to be removed in Docutils 0.5:
if atts.has_key('id') and tagname in self.named_tags:
atts['name'] = atts['id'] # for compatibility with old browsers
attlist = atts.items()
attlist.sort()
parts = [tagname]
for name, value in attlist:
# value=None was used for boolean attributes without
# value, but this isn't supported by XHTML.
assert value is not None
if isinstance(value, ListType):
values = [unicode(v) for v in value]
parts.append('%s="%s"' % (name.lower(),
self.attval(' '.join(values))))
else:
try:
uval = unicode(value)
except TypeError: # for Python 2.1 compatibility:
uval = unicode(str(value))
parts.append('%s="%s"' % (name.lower(), self.attval(uval)))
if empty:
infix = ' /'
else:
infix = ''
return ''.join(prefix) + '<%s%s>' % (' '.join(parts), infix) + suffix
def emptytag(self, node, tagname, suffix='\n', **attributes):
"""Construct and return an XML-compatible empty tag."""
return self.starttag(node, tagname, suffix, empty=1, **attributes)
# !!! to be removed in Docutils 0.5 (change calls to use "starttag"):
def start_tag_with_title(self, node, tagname, **atts):
"""ID and NAME attributes will be handled in the title."""
node = {'classes': node.get('classes', [])}
return self.starttag(node, tagname, **atts)
def set_class_on_child(self, node, class_, index=0):
"""
Set class `class_` on the visible child no. index of `node`.
Do nothing if node has fewer children than `index`.
"""
children = [n for n in node if not isinstance(n, nodes.Invisible)]
try:
child = children[index]
except IndexError:
return
child['classes'].append(class_)
def set_first_last(self, node):
self.set_class_on_child(node, 'first', 0)
self.set_class_on_child(node, 'last', -1)
def visit_Text(self, node):
text = node.astext()
encoded = self.encode(text)
if self.in_mailto and self.settings.cloak_email_addresses:
encoded = self.cloak_email(encoded)
self.body.append(encoded)
def depart_Text(self, node):
pass
def visit_abbreviation(self, node):
# @@@ implementation incomplete ("title" attribute)
self.body.append(self.starttag(node, 'abbr', ''))
def depart_abbreviation(self, node):
self.body.append('')
def visit_acronym(self, node):
# @@@ implementation incomplete ("title" attribute)
self.body.append(self.starttag(node, 'acronym', ''))
def depart_acronym(self, node):
self.body.append('')
def visit_address(self, node):
self.visit_docinfo_item(node, 'address', meta=None)
self.body.append(self.starttag(node, 'pre', CLASS='address'))
def depart_address(self, node):
self.body.append('\n\n')
self.depart_docinfo_item()
def visit_admonition(self, node, name=''):
self.body.append(self.start_tag_with_title(
node, 'div', CLASS=(name or 'admonition')))
if name:
node.insert(0, nodes.title(name, self.language.labels[name]))
self.set_first_last(node)
def depart_admonition(self, node=None):
self.body.append('\n')
def visit_attention(self, node):
self.visit_admonition(node, 'attention')
def depart_attention(self, node):
self.depart_admonition()
attribution_formats = {'dash': ('—', ''),
'parentheses': ('(', ')'),
'parens': ('(', ')'),
'none': ('', '')}
def visit_attribution(self, node):
prefix, suffix = self.attribution_formats[self.settings.attribution]
self.context.append(suffix)
self.body.append(
self.starttag(node, 'p', prefix, CLASS='attribution'))
def depart_attribution(self, node):
self.body.append(self.context.pop() + '
\n')
def visit_author(self, node):
if isinstance(node.parent, nodes.authors):
if self.author_in_authors:
self.body.append('\n ')
else:
self.visit_docinfo_item(node, 'author')
def depart_author(self, node):
if isinstance(node.parent, nodes.authors):
self.author_in_authors += 1
else:
self.depart_docinfo_item()
def visit_authors(self, node):
self.visit_docinfo_item(node, 'authors')
self.author_in_authors = 0 # initialize counter
def depart_authors(self, node):
self.depart_docinfo_item()
self.author_in_authors = None
def visit_block_quote(self, node):
self.body.append(self.starttag(node, 'blockquote'))
def depart_block_quote(self, node):
self.body.append('\n')
def check_simple_list(self, node):
"""Check for a simple list that can be rendered compactly."""
visitor = SimpleListChecker(self.document)
try:
node.walk(visitor)
except nodes.NodeFound:
return None
else:
return 1
def is_compactable(self, node):
return ('compact' in node['classes']
or (self.settings.compact_lists
and 'open' not in node['classes']
and (self.compact_simple
or self.topic_classes == ['contents']
or self.check_simple_list(node))))
def visit_bullet_list(self, node):
atts = {}
old_compact_simple = self.compact_simple
self.context.append((self.compact_simple, self.compact_p))
self.compact_p = None
self.compact_simple = self.is_compactable(node)
if self.compact_simple and not old_compact_simple:
atts['class'] = 'simple'
self.body.append(self.starttag(node, 'ul', **atts))
def depart_bullet_list(self, node):
self.compact_simple, self.compact_p = self.context.pop()
self.body.append('\n')
def visit_caption(self, node):
self.body.append(self.starttag(node, 'p', '', CLASS='caption'))
def depart_caption(self, node):
self.body.append('\n')
def visit_caution(self, node):
self.visit_admonition(node, 'caution')
def depart_caution(self, node):
self.depart_admonition()
def visit_citation(self, node):
self.body.append(self.starttag(node, 'table',
CLASS='docutils citation',
frame="void", rules="none"))
self.body.append('
'
% self.language.labels[name])
if len(node):
if isinstance(node[0], nodes.Element):
node[0]['classes'].append('first')
if isinstance(node[-1], nodes.Element):
node[-1]['classes'].append('last')
def depart_docinfo_item(self):
self.body.append('
\n')
def visit_doctest_block(self, node):
self.body.append(self.starttag(node, 'pre', CLASS='doctest-block'))
def depart_doctest_block(self, node):
self.body.append('\n\n')
def visit_document(self, node):
self.head.append('%s\n'
% self.encode(node.get('title', '')))
def depart_document(self, node):
self.fragment.extend(self.body)
self.body_prefix.append(self.starttag(node, 'div', CLASS='document'))
self.body_suffix.insert(0, '\n')
# skip content-type meta tag with interpolated charset value:
self.html_head.extend(self.head[1:])
self.html_body.extend(self.body_prefix[1:] + self.body_pre_docinfo
+ self.docinfo + self.body
+ self.body_suffix[:-1])
def visit_emphasis(self, node):
self.body.append('')
def depart_emphasis(self, node):
self.body.append('')
def visit_entry(self, node):
atts = {'class': []}
if isinstance(node.parent.parent, nodes.thead):
atts['class'].append('head')
if node.parent.parent.parent.stubs[node.parent.column]:
# "stubs" list is an attribute of the tgroup element
atts['class'].append('stub')
if atts['class']:
tagname = 'th'
atts['class'] = ' '.join(atts['class'])
else:
tagname = 'td'
del atts['class']
node.parent.column += 1
if node.has_key('morerows'):
atts['rowspan'] = node['morerows'] + 1
if node.has_key('morecols'):
atts['colspan'] = node['morecols'] + 1
node.parent.column += node['morecols']
self.body.append(self.starttag(node, tagname, '', **atts))
self.context.append('%s>\n' % tagname.lower())
if len(node) == 0: # empty cell
self.body.append(' ')
self.set_first_last(node)
def depart_entry(self, node):
self.body.append(self.context.pop())
def visit_enumerated_list(self, node):
"""
The 'start' attribute does not conform to HTML 4.01's strict.dtd, but
CSS1 doesn't help. CSS2 isn't widely enough supported yet to be
usable.
"""
atts = {}
if node.has_key('start'):
atts['start'] = node['start']
if node.has_key('enumtype'):
atts['class'] = node['enumtype']
# @@@ To do: prefix, suffix. How? Change prefix/suffix to a
# single "format" attribute? Use CSS2?
old_compact_simple = self.compact_simple
self.context.append((self.compact_simple, self.compact_p))
self.compact_p = None
self.compact_simple = self.is_compactable(node)
if self.compact_simple and not old_compact_simple:
atts['class'] = (atts.get('class', '') + ' simple').strip()
self.body.append(self.starttag(node, 'ol', **atts))
def depart_enumerated_list(self, node):
self.compact_simple, self.compact_p = self.context.pop()
self.body.append('\n')
def visit_error(self, node):
self.visit_admonition(node, 'error')
def depart_error(self, node):
self.depart_admonition()
def visit_field(self, node):
self.body.append(self.starttag(node, 'tr', '', CLASS='field'))
def depart_field(self, node):
self.body.append('\n')
def visit_field_body(self, node):
self.body.append(self.starttag(node, 'td', '', CLASS='field-body'))
self.set_class_on_child(node, 'first', 0)
field = node.parent
if (self.compact_field_list or
isinstance(field.parent, nodes.docinfo) or
field.parent.index(field) == len(field.parent) - 1):
# If we are in a compact list, the docinfo, or if this is
# the last field of the field list, do not add vertical
# space after last element.
self.set_class_on_child(node, 'last', -1)
def depart_field_body(self, node):
self.body.append('\n')
def visit_field_list(self, node):
self.context.append((self.compact_field_list, self.compact_p))
self.compact_p = None
if 'compact' in node['classes']:
self.compact_field_list = 1
elif (self.settings.compact_field_lists
and 'open' not in node['classes']):
self.compact_field_list = 1
if self.compact_field_list:
for field in node:
field_body = field[-1]
assert isinstance(field_body, nodes.field_body)
children = [n for n in field_body
if not isinstance(n, nodes.Invisible)]
if not (len(children) == 0 or
len(children) == 1 and
isinstance(children[0], nodes.paragraph)):
self.compact_field_list = 0
break
self.body.append(self.starttag(node, 'table', frame='void',
rules='none',
CLASS='docutils field-list'))
self.body.append('
tags around paragraph ``node`` can be omitted.
"""
if (isinstance(node.parent, nodes.document) or
isinstance(node.parent, nodes.compound)):
# Never compact paragraphs in document or compound.
return 0
for key, value in node.attlist():
if (node.is_not_default(key) and
not (key == 'classes' and value in
([], ['first'], ['last'], ['first', 'last']))):
# Attribute which needs to survive.
return 0
first = isinstance(node.parent[0], nodes.label) # skip label
for child in node.parent.children[first:]:
# only first paragraph can be compact
if isinstance(child, nodes.Invisible):
continue
if child is node:
break
return 0
if ( self.compact_simple
or self.compact_field_list
or (self.compact_p
and (len(node.parent) == 1
or len(node.parent) == 2
and isinstance(node.parent[0], nodes.label)))):
return 1
return 0
def visit_paragraph(self, node):
if self.should_be_compact_paragraph(node):
self.context.append('')
else:
self.body.append(self.starttag(node, 'p', ''))
self.context.append('