"""HTML handling.
Copyright 2003-2006 John J. Lee
This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).
"""
import re, copy, urllib, htmlentitydefs
from urlparse import urljoin
import _request
from _headersutil import split_header_words, is_html as _is_html
## # XXXX miserable hack
## def urljoin(base, url):
## if url.startswith("?"):
## return base+url
## else:
## return urlparse.urljoin(base, url)
## def chr_range(a, b):
## return "".join(map(chr, range(ord(a), ord(b)+1)))
## RESERVED_URL_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
## "abcdefghijklmnopqrstuvwxyz"
## "-_.~")
## UNRESERVED_URL_CHARS = "!*'();:@&=+$,/?%#[]"
# Characters passed as the "safe" argument of urllib.quote() by clean_url().
# we want (RESERVED_URL_CHARS+UNRESERVED_URL_CHARS), minus those
# 'safe'-by-default characters that urllib.quote never quotes
URLQUOTE_SAFE_URL_CHARS = "!*'();:@&=+$,/?%#[]~"
# Fallback character encoding; Factory.__init__ uses it (through
# encoding_finder) as the default when a response declares no charset.
DEFAULT_ENCODING = "latin-1"
class CachingGeneratorFunction(object):
    """Caching wrapper around a no-arguments iterable.

    Calling an instance returns a new iterator over the wrapped iterable.
    Items are pulled from the underlying iterable at most once and
    remembered, so every iterator sees the complete sequence, even when
    several iterators are advanced in an interleaved fashion.

    >>> i = [1]
    >>> func = CachingGeneratorFunction(i)
    >>> list(func())
    [1]
    >>> list(func())
    [1]

    >>> i = [1, 2, 3]
    >>> func = CachingGeneratorFunction(i)
    >>> list(func())
    [1, 2, 3]

    >>> i = func()
    >>> i.next()
    1
    >>> i.next()
    2
    >>> i.next()
    3

    >>> i = func()
    >>> j = func()
    >>> i.next()
    1
    >>> j.next()
    1
    >>> i.next()
    2
    >>> j.next()
    2
    >>> j.next()
    3
    >>> i.next()
    3
    >>> i.next()
    Traceback (most recent call last):
    ...
    StopIteration
    >>> j.next()
    Traceback (most recent call last):
    ...
    StopIteration
    """

    def __init__(self, iterable):
        # Wrap the iterable in a generator so that it is consumed exactly
        # once, no matter how many times the instance is called.
        def make_gen():
            for item in iterable:
                yield item
        self._cache = []
        self._generator = make_gen()

    def __call__(self):
        """Return an iterator over all items of the wrapped iterable."""
        cache = self._cache
        source = self._generator
        pos = 0
        while True:
            # Replay everything cached so far, including items that other
            # iterators have pulled from the source since we last looked.
            while pos < len(cache):
                yield cache[pos]
                pos += 1
            # Pull exactly one new item from the shared source.  The
            # for/break/else form works on every Python version.
            for item in source:
                cache.append(item)
                break
            else:
                # Source exhausted and the cache is fully replayed.
                return
def encoding_finder(default_encoding):
    """Return a function mapping a response to its character encoding.

    The returned function gives the value of the first "charset" parameter
    found in the response's Content-Type headers, or default_encoding when
    no header carries one.
    """
    def encoding(response):
        # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
        # headers may be in the response.  HTTP-EQUIV headers come last,
        # so try in order from first to last.
        content_types = response.info().getheaders("content-type")
        for header in content_types:
            params = split_header_words([header])[0]
            for name, value in params:
                if name == "charset":
                    return value
        return default_encoding
    return encoding
def make_is_html(allow_xhtml):
    """Return a predicate is_html(response, encoding).

    The predicate hands the response's Content-Type headers and URL to
    _is_html, together with the allow_xhtml flag given here.
    """
    def is_html(response, encoding):
        # XXX encoding (accepted but currently unused)
        return _is_html(
            response.info().getheaders("content-type"),
            response.geturl(),
            allow_xhtml)
    return is_html
# idea for this argument-processing trick is from Peter Otten
class Args:
    """Attribute-style view of a mapping of argument names to values.

    The mapping is copied into the .dictionary attribute.  Reading an
    attribute looks the name up in .dictionary (falling back to the class);
    assigning an attribute updates .dictionary, so the change is visible to
    code that later expands it with **args.dictionary.
    """

    def __init__(self, args_map):
        # Stored through __dict__ so that __setattr__ below is bypassed.
        self.__dict__["dictionary"] = dict(args_map)

    def __getattr__(self, key):
        try:
            # Go through __dict__ so a missing "dictionary" cannot recurse
            # back into __getattr__.
            return self.__dict__["dictionary"][key]
        except KeyError:
            return getattr(self.__class__, key)

    def __setattr__(self, key, value):
        if key == "dictionary":
            # Replacing the whole mapping is still allowed.
            self.__dict__[key] = value
        else:
            # Keep the mapping authoritative: previously the value landed
            # in the instance __dict__ and .dictionary went stale.
            self.__dict__["dictionary"][key] = value
def form_parser_args(
    select_default=False,
    form_parser_class=None,
    request_class=None,
    backwards_compat=False,
    ):
    """Bundle the form-parser keyword arguments into an Args object.

    The resulting Args maps each parameter name to the value passed (or
    its default).
    """
    collected = {
        "select_default": select_default,
        "form_parser_class": form_parser_class,
        "request_class": request_class,
        "backwards_compat": backwards_compat,
        }
    return Args(collected)
class Link:
    """A link found in a document.

    Attributes:
     base_url: URL the link is relative to
     url: the link target as given to the constructor
     absolute_url: url joined onto base_url
     text, tag, attrs: stored as given to the constructor
    """

    def __init__(self, base_url, url, text, tag, attrs):
        assert None not in [url, tag, attrs]
        self.base_url = base_url
        self.absolute_url = urljoin(base_url, url)
        self.url = url
        self.text = text
        self.tag = tag
        self.attrs = attrs

    def __cmp__(self, other):
        # Only reports equality (0) or inequality (-1); base_url is not
        # part of the comparison.
        try:
            for field in ("url", "text", "tag", "attrs"):
                mine, theirs = getattr(self, field), getattr(other, field)
                if mine != theirs:
                    return -1
        except AttributeError:
            return -1
        else:
            return 0

    def __repr__(self):
        fields = (self.base_url, self.url, self.text, self.tag, self.attrs)
        return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % fields
def clean_url(url, encoding):
    """Percent-encode illegal URL characters in url.

    A byte string is first decoded using encoding (undecodable bytes are
    replaced).  Surrounding whitespace is stripped, the text is encoded
    back to encoding, and the result is quoted with urllib.quote, leaving
    the characters in URLQUOTE_SAFE_URL_CHARS untouched.
    """
    # Trying to come up with test cases for this gave me a headache, revisit
    # when do switch to unicode.
    # Somebody else's comments (lost the attribution):
    ## - IE will return you the url in the encoding you send it
    ## - Mozilla/Firefox will send you latin-1 if there's no non latin-1
    ## characters in your link. It will send you utf-8 however if there are...
    text = url
    if type(text) is str:
        text = text.decode(encoding, "replace")
    encoded = text.strip().encode(encoding)
    return urllib.quote(encoded, URLQUOTE_SAFE_URL_CHARS)
class LinksFactory:
    """Extract links from a response using a pull parser.

    Constructor arguments:
     link_parser_class: parser class, called as
      link_parser_class(response, encoding=encoding); defaults to
      _pullparser.TolerantPullParser
     link_class: class used to build each link object, called as
      link_class(base_url, url, text, tag, attrs); defaults to Link
     urltags: mapping from tag name to the attribute holding the URL
    """

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        import _pullparser
        if link_parser_class is None:
            link_parser_class = _pullparser.TolerantPullParser
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._response = None
        self._encoding = None
        self._base_url = None

    def set_response(self, response, base_url, encoding):
        """Set the response to parse, its base URL and its encoding."""
        self._response = response
        self._encoding = encoding
        self._base_url = base_url

    def links(self):
        """Return an iterator that provides links of the document."""
        response = self._response
        encoding = self._encoding
        base_url = self._base_url
        p = self.link_parser_class(response, encoding=encoding)
        tag_names = list(self.urltags.keys()) + ["base"]
        for token in p.tags(*tag_names):
            # Skip end tags before anything else: an end tag must neither
            # change the base URL nor have its attributes inspected (end-tag
            # tokens may not carry an attribute list at all).
            if token.type == "endtag":
                continue
            if token.data == "base":
                base_href = dict(token.attrs).get("href")
                # A <base> element without href (e.g. one that only sets
                # target) must not discard the current base URL.
                if base_href is not None:
                    base_url = base_href
                continue
            attrs = dict(token.attrs)
            tag = token.data
            text = None
            # XXX use attr_encoding for ref'd doc if that doc does not provide
            # one by other means
            #attr_encoding = attrs.get("charset")
            url = attrs.get(self.urltags[tag])  # XXX is "" a valid URL?
            if not url:
                # Probably a named anchor (<a name=...>) or an <area> with
                # no href.  For our purposes a link is something with a
                # URL, so ignore this.
                continue
            url = clean_url(url, encoding)
            if tag == "a":
                if token.type != "startendtag":
                    # hmm, this'd break if end tag is missing
                    text = p.get_compressed_text(("endtag", tag))
                # but an unqualified p.get_compressed_text() doesn't work
                # when the link text is wrapped in further markup
            # Honour the configured link class rather than hard-coding Link.
            yield self.link_class(base_url, url, text, tag, token.attrs)
class FormsFactory:
    """Makes a sequence of objects satisfying ClientForm.HTMLForm interface.

    For constructor argument docs, see ClientForm.ParseResponse
    argument docs.
    """

    def __init__(self,
                 select_default=False,
                 form_parser_class=None,
                 request_class=None,
                 backwards_compat=False,
                 ):
        import ClientForm
        # Fill in the defaults that could not be named in the signature.
        if form_parser_class is None:
            form_parser_class = ClientForm.FormParser
        if request_class is None:
            request_class = _request.Request
        self.select_default = select_default
        self.form_parser_class = form_parser_class
        self.request_class = request_class
        self.backwards_compat = backwards_compat
        self._response = None
        self.encoding = None

    def set_response(self, response, encoding):
        """Remember the response to parse and its encoding."""
        self._response, self.encoding = response, encoding

    def forms(self):
        """Parse the current response with ClientForm.ParseResponse."""
        import ClientForm
        options = {
            "select_default": self.select_default,
            "form_parser_class": self.form_parser_class,
            "request_class": self.request_class,
            "backwards_compat": self.backwards_compat,
            "encoding": self.encoding,
            }
        return ClientForm.ParseResponse(self._response, **options)
class TitleFactory:
    """Find the document title using _pullparser.TolerantPullParser."""

    def __init__(self):
        self._response = None
        self._encoding = None

    def set_response(self, response, encoding):
        """Remember the response to parse and its encoding."""
        self._response, self._encoding = response, encoding

    def title(self):
        """Return the text after the first title tag, or None if none."""
        import _pullparser
        parser = _pullparser.TolerantPullParser(
            self._response, encoding=self._encoding)
        try:
            parser.get_tag("title")
        except _pullparser.NoMoreTokensError:
            # Ran out of tokens without meeting a title tag.
            return None
        return parser.get_text()
def unescape(data, entities, encoding):
    """Replace entity and character references in data.

    data: string to process; None, or a string without "&", is returned
     unchanged
    entities: mapping from entity name to code point
    encoding: encoding used to turn each replacement character into a
     byte string

    Named references (&name;) are looked up in entities; numeric references
    (&#N; / &#xN;) are handed to unescape_charref.  A reference that is
    unknown, or whose character cannot be encoded, is left as it was.
    """
    if data is None or "&" not in data:
        return data

    def replace_entities(match):
        ent = match.group()
        if ent[1] == "#":
            # ent is "&#...;": strip the "&#" prefix and ";" suffix
            return unescape_charref(ent[2:-1], encoding)

        repl = entities.get(ent[1:-1])
        if repl is None:
            # unknown entity name: keep the original text
            return ent
        repl = unichr(repl)
        if type(repl) != type(""):
            try:
                repl = repl.encode(encoding)
            except UnicodeError:
                repl = ent
        return repl

    # "&", an optional "#", then the shortest alphanumeric run ending in ";"
    return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
def unescape_charref(data, encoding):
    """Return the character named by a numeric character reference.

    data: the reference body without the leading "&#" and trailing ";",
     e.g. "65" or "x41" (a leading "x" marks hexadecimal)
    encoding: if None the unicode character is returned, otherwise the
     character encoded with this encoding

    If the body is not a valid number, or the character cannot be encoded,
    the original reference text ("&#" + data + ";") is returned instead.
    """
    name, base = data, 10
    if name.startswith("x"):
        name, base = name[1:], 16
    original = "&#%s;" % data
    try:
        uc = unichr(int(name, base))
    except (ValueError, OverflowError):
        # malformed or out-of-range reference: leave the markup alone
        return original
    if encoding is None:
        return uc
    try:
        return uc.encode(encoding)
    except UnicodeError:
        return original
try:
    import BeautifulSoup
except ImportError:
    # BeautifulSoup is optional; without it MechanizeBs is not defined.
    pass
else:
    import sgmllib
    # monkeypatch to fix http://www.python.org/sf/803422 :-(
    # (lets sgmllib recognise hexadecimal as well as decimal "&#...;"
    # character references)
    sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")

    class MechanizeBs(BeautifulSoup.BeautifulSoup):
        """BeautifulSoup parser that unescapes references via unescape().

        The first constructor argument is the encoding used when
        unescaping; the remaining arguments are passed on to
        BeautifulSoup.BeautifulSoup.
        """

        _entitydefs = htmlentitydefs.name2codepoint
        # don't want the magic Microsoft-char workaround
        PARSER_MASSAGE = [
            # "<br/>" -> "<br />"
            (re.compile('(<[^<>]*)/>'),
             lambda x: x.group(1) + ' />'),
            # "<! DOCTYPE ...>" -> "<!DOCTYPE ...>"
            (re.compile(r'<!\s+([^<>]*)>'),
             lambda x: '<!' + x.group(1) + '>'),
            ]

        def __init__(self, encoding, text=None, avoidParserProblems=True,
                     initialTextIsEverything=True):
            self._encoding = encoding
            BeautifulSoup.BeautifulSoup.__init__(
                self, text, avoidParserProblems, initialTextIsEverything)

        def handle_charref(self, ref):
            # ref is the reference body only; rebuild "&#...;" so that
            # unescape() recognises it as a numeric character reference.
            t = unescape("&#%s;" % ref, self._entitydefs, self._encoding)
            self.handle_data(t)

        def handle_entityref(self, ref):
            t = unescape("&%s;" % ref, self._entitydefs, self._encoding)
            self.handle_data(t)

        def unescape_attrs(self, attrs):
            """Return attrs with every attribute value unescaped."""
            escaped_attrs = []
            for key, val in attrs:
                val = unescape(val, self._entitydefs, self._encoding)
                escaped_attrs.append((key, val))
            return escaped_attrs
class RobustLinksFactory:
    """Extract links from a BeautifulSoup parse tree.

    Constructor arguments:
     link_parser_class: soup class; defaults to MechanizeBs
     link_class: class used to build each link object, called as
      link_class(base_url, url, text, tag, attrs); defaults to Link
     urltags: mapping from tag name to the attribute holding the URL
    """

    compress_re = re.compile(r"\s+")

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        import BeautifulSoup
        if link_parser_class is None:
            link_parser_class = MechanizeBs
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._bs = None
        self._encoding = None
        self._base_url = None

    def set_soup(self, soup, base_url, encoding):
        """Set the parsed document, its base URL and its encoding."""
        self._bs = soup
        self._base_url = base_url
        self._encoding = encoding

    def links(self):
        """Return an iterator that provides links of the document."""
        import BeautifulSoup
        bs = self._bs
        base_url = self._base_url
        encoding = self._encoding
        # Built once instead of on every node visited.
        tag_names = list(self.urltags.keys()) + ["base"]
        for ch in bs.recursiveChildGenerator():
            if not (isinstance(ch, BeautifulSoup.Tag) and
                    ch.name in tag_names):
                continue
            link = ch
            attrs = bs.unescape_attrs(link.attrs)
            attrs_dict = dict(attrs)
            if link.name == "base":
                base_href = attrs_dict.get("href")
                # A <base> element without href must not discard the
                # current base URL.
                if base_href is not None:
                    base_url = base_href
                continue
            url_attr = self.urltags[link.name]
            url = attrs_dict.get(url_attr)
            if not url:
                continue
            url = clean_url(url, encoding)
            text = link.firstText(lambda t: True)
            if text is BeautifulSoup.Null:
                # follow _pullparser's weird behaviour rigidly
                if link.name == "a":
                    text = ""
                else:
                    text = None
            else:
                text = self.compress_re.sub(" ", text.strip())
            # Honour the configured link class rather than hard-coding Link.
            yield self.link_class(base_url, url, text, link.name, attrs)
class RobustFormsFactory(FormsFactory):
    """FormsFactory whose default form parser is ClientForm.RobustFormParser.

    Accepts the same constructor arguments as form_parser_args.
    """

    def __init__(self, *args, **kwds):
        import ClientForm
        parsed = form_parser_args(*args, **kwds)
        # Work on the keyword mapping itself: it is what gets passed on
        # below, so the default has to be stored there to take effect.
        options = dict(parsed.dictionary)
        if options["form_parser_class"] is None:
            options["form_parser_class"] = ClientForm.RobustFormParser
        FormsFactory.__init__(self, **options)

    def set_response(self, response, encoding):
        """Remember the response to parse and its encoding."""
        self._response = response
        self.encoding = encoding
class RobustTitleFactory:
    """Find the document title in a BeautifulSoup parse tree."""

    def __init__(self):
        self._bs = self._encoding = None

    def set_soup(self, soup, encoding):
        """Set the parsed document and its encoding."""
        self._bs = soup
        self._encoding = encoding

    def title(self):
        """Return the text of the first title element, or None if none."""
        import BeautifulSoup
        title = self._bs.first("title")
        if title == BeautifulSoup.Null:
            return None
        else:
            return title.firstText(lambda t: True)
class Factory:
    """Factory for forms, links, etc.

    This interface may expand in future.

    Public methods:

    set_request_class(request_class)
    set_response(response)
    forms()
    links()

    Public attributes:

    encoding: string specifying the encoding of response if it contains a text
     document (this value is left unspecified for documents that do not have
     an encoding, e.g. an image file)
    is_html: true if response contains an HTML document (XHTML may be
     regarded as HTML too)
    title: page title, or None if no title or not HTML

    The public attributes are computed on first access and then cached until
    the next call to set_response().
    """

    def __init__(self, forms_factory, links_factory, title_factory,
                 get_encoding=encoding_finder(DEFAULT_ENCODING),
                 is_html_p=make_is_html(allow_xhtml=False),
                 ):
        """
        Pass keyword arguments only.

        forms_factory: object whose forms() method supplies the forms
        links_factory: object whose links() method supplies the links
        title_factory: object whose title() method supplies the title
        get_encoding: callable taking the response and returning the name
         of its character encoding.  The default looks for a charset in
         the Content-Type headers and falls back to DEFAULT_ENCODING
         (currently latin-1; this may change in future).  You should turn
         on HTTP-EQUIV handling if you want the best chance of getting the
         encoding right without resorting to that fallback.
        is_html_p: callable taking (response, encoding) and returning true
         if the response is HTML
        """
        self._forms_factory = forms_factory
        self._links_factory = links_factory
        self._title_factory = title_factory
        self._get_encoding = get_encoding
        self._is_html_p = is_html_p

        self.set_response(None)

    def set_request_class(self, request_class):
        """Set urllib2.Request class.

        ClientForm.HTMLForm instances returned by .forms() will return
        instances of this class when .click()ed.
        """
        self._forms_factory.request_class = request_class

    def set_response(self, response):
        """Set response.

        The response must implement the same interface as objects returned by
        urllib2.urlopen().
        """
        self._response = response
        self._forms_genf = self._links_genf = None
        self._get_title = None
        # Drop the cached lazy attributes so that __getattr__ recomputes
        # them for the new response.
        for name in ["encoding", "is_html", "title"]:
            try:
                delattr(self, name)
            except AttributeError:
                pass

    def __getattr__(self, name):
        # Only reached when normal lookup fails, i.e. when a lazy attribute
        # has not been computed yet for the current response.
        if name not in ["encoding", "is_html", "title"]:
            return getattr(self.__class__, name)

        try:
            if name == "encoding":
                self.encoding = self._get_encoding(self._response)
                return self.encoding
            elif name == "is_html":
                self.is_html = self._is_html_p(self._response, self.encoding)
                return self.is_html
            elif name == "title":
                if self.is_html:
                    self.title = self._title_factory.title()
                else:
                    self.title = None
                return self.title
        finally:
            # Rewind so later consumers read the response from the start.
            # With no response set there is nothing to rewind, and failing
            # here would only hide the error raised above.
            if self._response is not None:
                self._response.seek(0)

    def forms(self):
        """Return iterable over ClientForm.HTMLForm-like objects."""
        if self._forms_genf is None:
            self._forms_genf = CachingGeneratorFunction(
                self._forms_factory.forms())
        return self._forms_genf()

    def links(self):
        """Return iterable over mechanize.Link-like objects."""
        if self._links_genf is None:
            self._links_genf = CachingGeneratorFunction(
                self._links_factory.links())
        return self._links_genf()
class DefaultFactory(Factory):
    """Based on sgmllib."""

    def __init__(self, i_want_broken_xhtml_support=False):
        html_test = make_is_html(allow_xhtml=i_want_broken_xhtml_support)
        Factory.__init__(
            self,
            forms_factory=FormsFactory(),
            links_factory=LinksFactory(),
            title_factory=TitleFactory(),
            is_html_p=html_test,
            )

    def set_response(self, response):
        """Set response, giving each sub-factory its own copy of it."""
        Factory.set_response(self, response)
        if response is None:
            return
        forms_copy = copy.copy(response)
        self._forms_factory.set_response(forms_copy, self.encoding)
        links_copy = copy.copy(response)
        base_url = self._response.geturl()
        self._links_factory.set_response(links_copy, base_url, self.encoding)
        title_copy = copy.copy(response)
        self._title_factory.set_response(title_copy, self.encoding)
class RobustFactory(Factory):
    """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
    DefaultFactory.
    """

    def __init__(self, i_want_broken_xhtml_support=False,
                 soup_class=None):
        html_test = make_is_html(allow_xhtml=i_want_broken_xhtml_support)
        Factory.__init__(
            self,
            forms_factory=RobustFormsFactory(),
            links_factory=RobustLinksFactory(),
            title_factory=RobustTitleFactory(),
            is_html_p=html_test,
            )
        if soup_class is None:
            soup_class = MechanizeBs
        self._soup_class = soup_class

    def set_response(self, response):
        """Set response and build the soup shared by links and title."""
        import BeautifulSoup
        Factory.set_response(self, response)
        if response is None:
            return
        data = response.read()
        encoding = self.encoding
        soup = self._soup_class(encoding, data)
        self._forms_factory.set_response(response, encoding)
        self._links_factory.set_soup(soup, response.geturl(), encoding)
        self._title_factory.set_soup(soup, encoding)