"""HTML form handling for web clients.
ClientForm is a Python module for handling HTML forms on the client
side, useful for parsing HTML forms, filling them in and returning the
completed forms to the server. It has developed from a port of Gisle
Aas' Perl module HTML::Form, from the libwww-perl library, but the
interface is not the same.
The most useful docstring is the one for HTMLForm.
RFC 1866: HTML 2.0
RFC 1867: Form-based File Upload in HTML
RFC 2388: Returning Values from Forms: multipart/form-data
HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX)
HTML 4.01 Specification, W3C Recommendation 24 December 1999
Copyright 2002-2006 John J. Lee
Copyright 2005 Gary Poster
Copyright 2005 Zope Corporation
Copyright 1998-2000 Gisle Aas.
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file COPYING included with
the distribution).
"""
# XXX
# Remove unescape_attr method
# Remove parser testing hack
# safeUrl()-ize action
# Really should to merge CC, CF, pp and mechanize as soon as mechanize
# goes to beta...
# Add url attribute to ParseError
# Switch to unicode throughout (would be 0.3.x)
# See Wichert Akkerman's 2004-01-22 message to c.l.py.
# Add charset parameter to Content-type headers? How to find value??
# Add some more functional tests
# Especially single and multiple file upload on the internet.
# Does file upload work when name is missing? Sourceforge tracker form
# doesn't like it. Check standards, and test with Apache. Test
# binary upload with Apache.
# Controls can have name=None (e.g. forms constructed partly with
# JavaScript), but find_control can't be told to find a control
# with that name, because None there means 'unspecified'. Can still
# get at by nr, but would be nice to be able to specify something
# equivalent to name=None, too.
# mailto submission & enctype text/plain
# I'm not going to fix this unless somebody tells me what real servers
# that want this encoding actually expect: If enctype is
# application/x-www-form-urlencoded and there's a FILE control present.
# Strictly, it should be 'name=data' (see HTML 4.01 spec., section
# 17.13.2), but I send "name=" ATM. What about multiple file upload??
# Would be nice, but I'm not going to do it myself:
# -------------------------------------------------
# Maybe a 0.4.x?
# Replace by_label etc. with moniker / selector concept. Allows, eg.,
# a choice between selection by value / id / label / element
# contents. Or choice between matching labels exactly or by
# substring. Etc.
# Remove deprecated methods.
# ...what else?
# Work on DOMForm.
# XForms? Don't know if there's a need here.
try: True
except NameError:
True = 1
False = 0
try: bool
except NameError:
def bool(expr):
if expr: return True
else: return False
try:
import logging
except ImportError:
def debug(msg, *args, **kwds):
pass
else:
_logger = logging.getLogger("ClientForm")
OPTIMIZATION_HACK = True
def debug(msg, *args, **kwds):
if OPTIMIZATION_HACK:
return
try:
raise Exception()
except:
caller_name = (
sys.exc_info()[2].tb_frame.f_back.f_back.f_code.co_name)
extended_msg = '%%s %s' % msg
extended_args = (caller_name,)+args
debug = _logger.debug(extended_msg, *extended_args, **kwds)
def _show_debug_messages():
global OPTIMIZATION_HACK
OPTIMIZATION_HACK = False
_logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG)
_logger.addHandler(handler)
import sys, urllib, urllib2, types, mimetools, copy, urlparse, \
htmlentitydefs, re, random
from urlparse import urljoin
from cStringIO import StringIO
try:
import warnings
except ImportError:
def deprecation(message):
pass
else:
def deprecation(message):
warnings.warn(message, DeprecationWarning, stacklevel=2)
VERSION = "0.2.2"
CHUNK = 1024 # size of chunks fed to parser, in bytes
DEFAULT_ENCODING = "latin-1"
_compress_re = re.compile(r"\s+")
def compress_text(text): return _compress_re.sub(" ", text.strip())
# This version of urlencode is from my Python 1.5.2 back-port of the
# Python 2.1 CVS maintenance branch of urllib. It will accept a sequence
# of pairs instead of a mapping -- the 2.0 version only accepts a mapping.
def urlencode(query,doseq=False,):
"""Encode a sequence of two-element tuples or dictionary into a URL query \
string.
If any values in the query arg are sequences and doseq is true, each
sequence element is converted to a separate parameter.
If the query arg is a sequence of two-element tuples, the order of the
parameters in the output will match the order of parameters in the
input.
"""
if hasattr(query,"items"):
# mapping objects
query = query.items()
else:
# it's a bother at times that strings and string-like objects are
# sequences...
try:
# non-sequence items should not work with len()
x = len(query)
# non-empty strings will fail this
if len(query) and type(query[0]) != types.TupleType:
raise TypeError()
# zero-length sequences of all types will get here and succeed,
# but that's a minor nit - since the original implementation
# allowed empty dicts that type of behavior probably should be
# preserved for consistency
except TypeError:
ty,va,tb = sys.exc_info()
raise TypeError("not a valid non-string sequence or mapping "
"object", tb)
l = []
if not doseq:
# preserve old behavior
for k, v in query:
k = urllib.quote_plus(str(k))
v = urllib.quote_plus(str(v))
l.append(k + '=' + v)
else:
for k, v in query:
k = urllib.quote_plus(str(k))
if type(v) == types.StringType:
v = urllib.quote_plus(v)
l.append(k + '=' + v)
elif type(v) == types.UnicodeType:
# is there a reasonable way to convert to ASCII?
# encode generates a string, but "replace" or "ignore"
# lose information and "strict" can raise UnicodeError
v = urllib.quote_plus(v.encode("ASCII","replace"))
l.append(k + '=' + v)
else:
try:
# is this a sufficient test for sequence-ness?
x = len(v)
except TypeError:
# not a sequence
v = urllib.quote_plus(str(v))
l.append(k + '=' + v)
else:
# loop over the sequence
for elt in v:
l.append(k + '=' + urllib.quote_plus(str(elt)))
return '&'.join(l)
def unescape(data, entities, encoding=DEFAULT_ENCODING):
if data is None or "&" not in data:
return data
def replace_entities(match, entities=entities, encoding=encoding):
ent = match.group()
if ent[1] == "#":
return unescape_charref(ent[2:-1], encoding)
repl = entities.get(ent)
if repl is not None:
if type(repl) != type(""):
try:
repl = repl.encode(encoding)
except UnicodeError:
repl = ent
else:
repl = ent
return repl
return re.sub(r"?[A-Za-z0-9]+?;", replace_entities, data)
def unescape_charref(data, encoding):
name, base = data, 10
if name.startswith("x"):
name, base= name[1:], 16
uc = unichr(int(name, base))
if encoding is None:
return uc
else:
try:
repl = uc.encode(encoding)
except UnicodeError:
repl = "%s;" % data
return repl
def get_entitydefs():
import htmlentitydefs
from codecs import latin_1_decode
entitydefs = {}
try:
htmlentitydefs.name2codepoint
except AttributeError:
entitydefs = {}
for name, char in htmlentitydefs.entitydefs.items():
uc = latin_1_decode(char)[0]
if uc.startswith("") and uc.endswith(";"):
uc = unescape_charref(uc[2:-1], None)
entitydefs["&%s;" % name] = uc
else:
for name, codepoint in htmlentitydefs.name2codepoint.items():
entitydefs["&%s;" % name] = unichr(codepoint)
return entitydefs
def issequence(x):
try:
x[0]
except (TypeError, KeyError):
return False
except IndexError:
pass
return True
def isstringlike(x):
try: x+""
except: return False
else: return True
def choose_boundary():
"""Return a string usable as a multipart boundary."""
# follow IE and firefox
nonce = "".join([str(random.randint(0, sys.maxint-1)) for i in 0,1,2])
return "-"*27 + nonce
# This cut-n-pasted MimeWriter from standard library is here so can add
# to HTTP headers rather than message body when appropriate. It also uses
# \r\n in place of \n. This is a bit nasty.
class MimeWriter:
"""Generic MIME writer.
Methods:
__init__()
addheader()
flushheaders()
startbody()
startmultipartbody()
nextpart()
lastpart()
A MIME writer is much more primitive than a MIME parser. It
doesn't seek around on the output file, and it doesn't use large
amounts of buffer space, so you have to write the parts in the
order they should occur on the output file. It does buffer the
headers you add, allowing you to rearrange their order.
General usage is:
f =
w = MimeWriter(f)
...call w.addheader(key, value) 0 or more times...
followed by either:
f = w.startbody(content_type)
...call f.write(data) for body data...
or:
w.startmultipartbody(subtype)
for each part:
subwriter = w.nextpart()
...use the subwriter's methods to create the subpart...
w.lastpart()
The subwriter is another MimeWriter instance, and should be
treated in the same way as the toplevel MimeWriter. This way,
writing recursive body parts is easy.
Warning: don't forget to call lastpart()!
XXX There should be more state so calls made in the wrong order
are detected.
Some special cases:
- startbody() just returns the file passed to the constructor;
but don't use this knowledge, as it may be changed.
- startmultipartbody() actually returns a file as well;
this can be used to write the initial 'if you can read this your
mailer is not MIME-aware' message.
- If you call flushheaders(), the headers accumulated so far are
written out (and forgotten); this is useful if you don't need a
body part at all, e.g. for a subpart of type message/rfc822
that's (mis)used to store some header-like information.
- Passing a keyword argument 'prefix=' to addheader(),
start*body() affects where the header is inserted; 0 means
append at the end, 1 means insert at the start; default is
append for addheader(), but insert for start*body(), which use
it to determine where the Content-type header goes.
"""
def __init__(self, fp, http_hdrs=None):
self._http_hdrs = http_hdrs
self._fp = fp
self._headers = []
self._boundary = []
self._first_part = True
def addheader(self, key, value, prefix=0,
add_to_http_hdrs=0):
"""
prefix is ignored if add_to_http_hdrs is true.
"""
lines = value.split("\r\n")
while lines and not lines[-1]: del lines[-1]
while lines and not lines[0]: del lines[0]
if add_to_http_hdrs:
value = "".join(lines)
self._http_hdrs.append((key, value))
else:
for i in range(1, len(lines)):
lines[i] = " " + lines[i].strip()
value = "\r\n".join(lines) + "\r\n"
line = key + ": " + value
if prefix:
self._headers.insert(0, line)
else:
self._headers.append(line)
def flushheaders(self):
self._fp.writelines(self._headers)
self._headers = []
def startbody(self, ctype=None, plist=[], prefix=1,
add_to_http_hdrs=0, content_type=1):
"""
prefix is ignored if add_to_http_hdrs is true.
"""
if content_type and ctype:
for name, value in plist:
ctype = ctype + ';\r\n %s=%s' % (name, value)
self.addheader("Content-type", ctype, prefix=prefix,
add_to_http_hdrs=add_to_http_hdrs)
self.flushheaders()
if not add_to_http_hdrs: self._fp.write("\r\n")
self._first_part = True
return self._fp
def startmultipartbody(self, subtype, boundary=None, plist=[], prefix=1,
add_to_http_hdrs=0, content_type=1):
boundary = boundary or choose_boundary()
self._boundary.append(boundary)
return self.startbody("multipart/" + subtype,
[("boundary", boundary)] + plist,
prefix=prefix,
add_to_http_hdrs=add_to_http_hdrs,
content_type=content_type)
def nextpart(self):
boundary = self._boundary[-1]
if self._first_part:
self._first_part = False
else:
self._fp.write("\r\n")
self._fp.write("--" + boundary + "\r\n")
return self.__class__(self._fp)
def lastpart(self):
if self._first_part:
self.nextpart()
boundary = self._boundary.pop()
self._fp.write("\r\n--" + boundary + "--\r\n")
class LocateError(ValueError): pass
class AmbiguityError(LocateError): pass
class ControlNotFoundError(LocateError): pass
class ItemNotFoundError(LocateError): pass
class ItemCountError(ValueError): pass
class ParseError(Exception): pass
class _AbstractFormParser:
"""forms attribute contains HTMLForm instances on completion."""
# thanks to Moshe Zadka for an example of sgmllib/htmllib usage
def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
if entitydefs is None:
entitydefs = get_entitydefs()
self._entitydefs = entitydefs
self._encoding = encoding
self.base = None
self.forms = []
self.labels = []
self._current_label = None
self._current_form = None
self._select = None
self._optgroup = None
self._option = None
self._textarea = None
def do_base(self, attrs):
debug("%s", attrs)
for key, value in attrs:
if key == "href":
self.base = value
def end_body(self):
debug("")
if self._current_label is not None:
self.end_label()
if self._current_form is not None:
self.end_form()
def start_form(self, attrs):
debug("%s", attrs)
if self._current_form is not None:
raise ParseError("nested FORMs")
name = None
action = None
enctype = "application/x-www-form-urlencoded"
method = "GET"
d = {}
for key, value in attrs:
if key == "name":
name = value
elif key == "action":
action = value
elif key == "method":
method = value.upper()
elif key == "enctype":
enctype = value.lower()
d[key] = value
controls = []
self._current_form = (name, action, method, enctype), d, controls
def end_form(self):
debug("")
if self._current_label is not None:
self.end_label()
if self._current_form is None:
raise ParseError("end of FORM before start")
self.forms.append(self._current_form)
self._current_form = None
def start_select(self, attrs):
debug("%s", attrs)
if self._current_form is None:
raise ParseError("start of SELECT before start of FORM")
if self._select is not None:
raise ParseError("nested SELECTs")
if self._textarea is not None:
raise ParseError("SELECT inside TEXTAREA")
d = {}
for key, val in attrs:
d[key] = val
self._select = d
self._add_label(d)
self._append_select_control({"__select": d})
def end_select(self):
debug("")
if self._current_form is None:
raise ParseError("end of SELECT before start of FORM")
if self._select is None:
raise ParseError("end of SELECT before start")
if self._option is not None:
self._end_option()
self._select = None
def start_optgroup(self, attrs):
debug("%s", attrs)
if self._select is None:
raise ParseError("OPTGROUP outside of SELECT")
d = {}
for key, val in attrs:
d[key] = val
self._optgroup = d
def end_optgroup(self):
debug("")
if self._optgroup is None:
raise ParseError("end of OPTGROUP before start")
self._optgroup = None
def _start_option(self, attrs):
debug("%s", attrs)
if self._select is None:
raise ParseError("OPTION outside of SELECT")
if self._option is not None:
self._end_option()
d = {}
for key, val in attrs:
d[key] = val
self._option = {}
self._option.update(d)
if (self._optgroup and self._optgroup.has_key("disabled") and
not self._option.has_key("disabled")):
self._option["disabled"] = None
def _end_option(self):
debug("")
if self._option is None:
raise ParseError("end of OPTION before start")
contents = self._option.get("contents", "").strip()
self._option["contents"] = contents
if not self._option.has_key("value"):
self._option["value"] = contents
if not self._option.has_key("label"):
self._option["label"] = contents
# stuff dict of SELECT HTML attrs into a special private key
# (gets deleted again later)
self._option["__select"] = self._select
self._append_select_control(self._option)
self._option = None
def _append_select_control(self, attrs):
debug("%s", attrs)
controls = self._current_form[2]
name = self._select.get("name")
controls.append(("select", name, attrs))
def start_textarea(self, attrs):
debug("%s", attrs)
if self._current_form is None:
raise ParseError("start of TEXTAREA before start of FORM")
if self._textarea is not None:
raise ParseError("nested TEXTAREAs")
if self._select is not None:
raise ParseError("TEXTAREA inside SELECT")
d = {}
for key, val in attrs:
d[key] = val
self._add_label(d)
self._textarea = d
def end_textarea(self):
debug("")
if self._current_form is None:
raise ParseError("end of TEXTAREA before start of FORM")
if self._textarea is None:
raise ParseError("end of TEXTAREA before start")
controls = self._current_form[2]
name = self._textarea.get("name")
controls.append(("textarea", name, self._textarea))
self._textarea = None
def start_label(self, attrs):
debug("%s", attrs)
if self._current_label:
self.end_label()
d = {}
for key, val in attrs:
d[key] = val
taken = bool(d.get("for")) # empty id is invalid
d["__text"] = ""
d["__taken"] = taken
if taken:
self.labels.append(d)
self._current_label = d
def end_label(self):
debug("")
label = self._current_label
if label is None:
# something is ugly in the HTML, but we're ignoring it
return
self._current_label = None
label["__text"] = label["__text"]
# if it is staying around, it is True in all cases
del label["__taken"]
def _add_label(self, d):
#debug("%s", d)
if self._current_label is not None:
if self._current_label["__taken"]:
self.end_label() # be fuzzy
else:
self._current_label["__taken"] = True
d["__label"] = self._current_label
def handle_data(self, data):
# according to http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.1
# line break immediately after start tags or immediately before end
# tags must be ignored, but real browsers only ignore a line break
# after a start tag, so we'll do that.
if data[0:1] == '\n':
data = data[1:]
debug("%s", data)
if self._option is not None:
# self._option is a dictionary of the OPTION element's HTML
# attributes, but it has two special keys, one of which is the
# special "contents" key contains text between OPTION tags (the
# other is the "__select" key: see the end_option method)
map = self._option
key = "contents"
elif self._textarea is not None:
map = self._textarea
key = "value"
# not if within option or textarea
elif self._current_label is not None:
map = self._current_label
key = "__text"
else:
return
if not map.has_key(key):
map[key] = data
else:
map[key] = map[key] + data
def do_button(self, attrs):
debug("%s", attrs)
if self._current_form is None:
raise ParseError("start of BUTTON before start of FORM")
d = {}
d["type"] = "submit" # default
for key, val in attrs:
d[key] = val
controls = self._current_form[2]
type = d["type"]
name = d.get("name")
# we don't want to lose information, so use a type string that
# doesn't clash with INPUT TYPE={SUBMIT,RESET,BUTTON}
# e.g. type for BUTTON/RESET is "resetbutton"
# (type for INPUT/RESET is "reset")
type = type+"button"
self._add_label(d)
controls.append((type, name, d))
def do_input(self, attrs):
debug("%s", attrs)
if self._current_form is None:
raise ParseError("start of INPUT before start of FORM")
d = {}
d["type"] = "text" # default
for key, val in attrs:
d[key] = val
controls = self._current_form[2]
type = d["type"]
name = d.get("name")
self._add_label(d)
controls.append((type, name, d))
def do_isindex(self, attrs):
debug("%s", attrs)
if self._current_form is None:
raise ParseError("start of ISINDEX before start of FORM")
d = {}
for key, val in attrs:
d[key] = val
controls = self._current_form[2]
self._add_label(d)
# isindex doesn't have type or name HTML attributes
controls.append(("isindex", None, d))
def handle_entityref(self, name):
#debug("%s", name)
self.handle_data(unescape(
'&%s;' % name, self._entitydefs, self._encoding))
def handle_charref(self, name):
#debug("%s", name)
self.handle_data(unescape_charref(name, self._encoding))
def unescape_attr(self, name):
#debug("%s", name)
return unescape(name, self._entitydefs, self._encoding)
def unescape_attrs(self, attrs):
#debug("%s", attrs)
escaped_attrs = {}
for key, val in attrs.items():
try:
val.items
except AttributeError:
escaped_attrs[key] = self.unescape_attr(val)
else:
# e.g. "__select" -- yuck!
escaped_attrs[key] = self.unescape_attrs(val)
return escaped_attrs
def unknown_entityref(self, ref): self.handle_data("&%s;" % ref)
def unknown_charref(self, ref): self.handle_data("%s;" % ref)
# HTMLParser.HTMLParser is recent, so live without it if it's not available
# (also, htmllib.HTMLParser is much more tolerant of bad HTML)
try:
import HTMLParser
except ImportError:
class XHTMLCompatibleFormParser:
def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
raise ValueError("HTMLParser could not be imported")
else:
class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser):
"""Good for XHTML, bad for tolerance of incorrect HTML."""
# thanks to Michael Howitz for this!
def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
HTMLParser.HTMLParser.__init__(self)
_AbstractFormParser.__init__(self, entitydefs, encoding)
def start_option(self, attrs):
_AbstractFormParser._start_option(self, attrs)
def end_option(self):
_AbstractFormParser._end_option(self)
def handle_starttag(self, tag, attrs):
try:
method = getattr(self, "start_" + tag)
except AttributeError:
try:
method = getattr(self, "do_" + tag)
except AttributeError:
pass # unknown tag
else:
method(attrs)
else:
method(attrs)
def handle_endtag(self, tag):
try:
method = getattr(self, "end_" + tag)
except AttributeError:
pass # unknown tag
else:
method()
def unescape(self, name):
# Use the entitydefs passed into constructor, not
# HTMLParser.HTMLParser's entitydefs.
return self.unescape_attr(name)
def unescape_attr_if_required(self, name):
return name # HTMLParser.HTMLParser already did it
def unescape_attrs_if_required(self, attrs):
return attrs # ditto
import sgmllib
# monkeypatch to fix http://www.python.org/sf/803422 :-(
sgmllib.charref = re.compile("(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
class _AbstractSgmllibParser(_AbstractFormParser):
def do_option(self, attrs):
_AbstractFormParser._start_option(self, attrs)
def unescape_attr_if_required(self, name):
return self.unescape_attr(name)
def unescape_attrs_if_required(self, attrs):
return self.unescape_attrs(attrs)
class FormParser(_AbstractSgmllibParser, sgmllib.SGMLParser):
"""Good for tolerance of incorrect HTML, bad for XHTML."""
def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
sgmllib.SGMLParser.__init__(self)
_AbstractFormParser.__init__(self, entitydefs, encoding)
try:
if sys.version_info[:2] < (2, 2):
raise ImportError # BeautifulSoup uses generators
import BeautifulSoup
except ImportError:
pass
else:
class _AbstractBSFormParser(_AbstractSgmllibParser):
bs_base_class = None
def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
_AbstractFormParser.__init__(self, entitydefs, encoding)
self.bs_base_class.__init__(self)
def handle_data(self, data):
_AbstractFormParser.handle_data(self, data)
self.bs_base_class.handle_data(self, data)
class RobustFormParser(_AbstractBSFormParser, BeautifulSoup.BeautifulSoup):
"""Tries to be highly tolerant of incorrect HTML."""
bs_base_class = BeautifulSoup.BeautifulSoup
class NestingRobustFormParser(_AbstractBSFormParser,
BeautifulSoup.ICantBelieveItsBeautifulSoup):
"""Tries to be highly tolerant of incorrect HTML.
Different from RobustFormParser in that it more often guesses nesting
above missing end tags (see BeautifulSoup docs).
"""
bs_base_class = BeautifulSoup.ICantBelieveItsBeautifulSoup
#FormParser = XHTMLCompatibleFormParser # testing hack
#FormParser = RobustFormParser # testing hack
def ParseResponse(response, select_default=False,
ignore_errors=False, # ignored!
form_parser_class=FormParser,
request_class=urllib2.Request,
entitydefs=None,
backwards_compat=True,
encoding=DEFAULT_ENCODING,
):
"""Parse HTTP response and return a list of HTMLForm instances.
The return value of urllib2.urlopen can be conveniently passed to this
function as the response parameter.
ClientForm.ParseError is raised on parse errors.
response: file-like object (supporting read() method) with a method
geturl(), returning the URI of the HTTP response
select_default: for multiple-selection SELECT controls and RADIO controls,
pick the first item as the default if none are selected in the HTML
form_parser_class: class to instantiate and use to pass
request_class: class to return from .click() method (default is
urllib2.Request)
entitydefs: mapping like {"&": "&", ...} containing HTML entity
definitions (a sensible default is used)
encoding: character encoding used for encoding numeric character references
when matching link text. ClientForm does not attempt to find the encoding
in a META HTTP-EQUIV attribute in the document itself (mechanize, for
example, does do that and will pass the correct value to ClientForm using
this parameter).
backwards_compat: boolean that determines whether the returned HTMLForm
objects are backwards-compatible with old code. If backwards_compat is
true:
- ClientForm 0.1 code will continue to work as before.
- Label searches that do not specify a nr (number or count) will always
get the first match, even if other controls match. If
backwards_compat is False, label searches that have ambiguous results
will raise an AmbiguityError.
- Item label matching is done by strict string comparison rather than
substring matching.
- De-selecting individual list items is allowed even if the Item is
disabled.
The backwards_compat argument will be deprecated in a future release.
Pass a true value for select_default if you want the behaviour specified by
RFC 1866 (the HTML 2.0 standard), which is to select the first item in a
RADIO or multiple-selection SELECT control if none were selected in the
HTML. Most browsers (including Microsoft Internet Explorer (IE) and
Netscape Navigator) instead leave all items unselected in these cases. The
W3C HTML 4.0 standard leaves this behaviour undefined in the case of
multiple-selection SELECT controls, but insists that at least one RADIO
button should be checked at all times, in contradiction to browser
behaviour.
There is a choice of parsers. ClientForm.XHTMLCompatibleFormParser (uses
HTMLParser.HTMLParser) works best for XHTML, ClientForm.FormParser (uses
sgmllib.SGMLParser) (the default) works better for ordinary grubby HTML.
Note that HTMLParser is only available in Python 2.2 and later. You can
pass your own class in here as a hack to work around bad HTML, but at your
own risk: there is no well-defined interface.
"""
return ParseFile(response, response.geturl(), select_default,
False,
form_parser_class,
request_class,
entitydefs,
backwards_compat,
encoding,
)
def ParseFile(file, base_uri, select_default=False,
ignore_errors=False, # ignored!
form_parser_class=FormParser,
request_class=urllib2.Request,
entitydefs=None,
backwards_compat=True,
encoding=DEFAULT_ENCODING,
):
"""Parse HTML and return a list of HTMLForm instances.
ClientForm.ParseError is raised on parse errors.
file: file-like object (supporting read() method) containing HTML with zero
or more forms to be parsed
base_uri: the URI of the document (note that the base URI used to submit
the form will be that given in the BASE element if present, not that of
the document)
For the other arguments and further details, see ParseResponse.__doc__.
"""
if backwards_compat:
deprecation("operating in backwards-compatibility mode")
fp = form_parser_class(entitydefs, encoding)
while 1:
data = file.read(CHUNK)
try:
fp.feed(data)
except ParseError, e:
e.base_uri = base_uri
raise
if len(data) != CHUNK: break
if fp.base is not None:
# HTML BASE element takes precedence over document URI
base_uri = fp.base
labels = [] # Label(label) for label in fp.labels]
id_to_labels = {}
for l in fp.labels:
label = Label(l)
labels.append(label)
for_id = l["for"]
coll = id_to_labels.get(for_id)
if coll is None:
id_to_labels[for_id] = [label]
else:
coll.append(label)
forms = []
for (name, action, method, enctype), attrs, controls in fp.forms:
if action is None:
action = base_uri
else:
action = urljoin(base_uri, action)
action = fp.unescape_attr_if_required(action)
name = fp.unescape_attr_if_required(name)
attrs = fp.unescape_attrs_if_required(attrs)
# would be nice to make HTMLForm class (form builder) pluggable
form = HTMLForm(
action, method, enctype, name, attrs, request_class,
forms, labels, id_to_labels, backwards_compat)
for ii in range(len(controls)):
type, name, attrs = controls[ii]
attrs = fp.unescape_attrs_if_required(attrs)
name = fp.unescape_attr_if_required(name)
# index=ii*10 allows ImageControl to return multiple ordered pairs
form.new_control(type, name, attrs, select_default=select_default,
index=ii*10)
forms.append(form)
for form in forms:
form.fixup()
return forms
class Label:
def __init__(self, attrs):
self.id = attrs.get("for")
self._text = attrs.get("__text").strip()
self._ctext = compress_text(self._text)
self.attrs = attrs
self._backwards_compat = False # maintained by HTMLForm
def __getattr__(self, name):
if name == "text":
if self._backwards_compat:
return self._text
else:
return self._ctext
return getattr(Label, name)
def __setattr__(self, name, value):
if name == "text":
# don't see any need for this, so make it read-only
raise AttributeError("text attribute is read-only")
self.__dict__[name] = value
def __str__(self):
return "