"""Integration with Python standard library module urllib2: OpenerDirector class. Copyright 2004-2006 John J Lee This code is free software; you can redistribute it and/or modify it under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt included with the distribution). """ import urllib2, string, bisect, urlparse from _util import startswith, isstringlike from _request import Request try: set except NameError: import sets set = sets.Set def methnames(obj): """Return method names of class instance. dir(obj) doesn't work across Python versions, this does. """ return methnames_of_instance_as_dict(obj).keys() def methnames_of_instance_as_dict(inst): """ It is possible for an attribute to be present in the results of dir(inst), but for getattr(inst, attr_name) to raise an Attribute error, that should be handled gracefully. >>> class BadAttr(object): ... def error(self): ... raise AttributeError ... error = property(error) >>> inst = BadAttr() >>> 'error' in dir(inst) True >>> inst.error Traceback (most recent call last): ... AttributeError >>> result = methnames_of_instance_as_dict(inst) # no exception """ names = {} names.update(methnames_of_class_as_dict(inst.__class__)) for methname in dir(inst): try: candidate = getattr(inst, methname) except AttributeError: continue if callable(candidate): names[methname] = None return names def methnames_of_class_as_dict(klass): """ It is possible for an attribute to be present in the results of dir(inst), but for getattr(inst, attr_name) to raise an Attribute error, that should be handled gracefully. >>> class BadClass(object): ... def error(self): ... raise AttributeError ... error = property(error) ... __bases__ = [] >>> klass = BadClass() >>> 'error' in dir(klass) True >>> klass.error Traceback (most recent call last): ... AttributeError >>> result = methnames_of_class_as_dict(klass) # no exception """ names = {} for methname in dir(klass): try: candidate = getattr(klass, methname) except AttributeError: continue if callable(candidate): names[methname] = None for baseclass in klass.__bases__: names.update(methnames_of_class_as_dict(baseclass)) return names class OpenerDirector(urllib2.OpenerDirector): def __init__(self): urllib2.OpenerDirector.__init__(self) # really none of these are (sanely) public -- the lack of initial # underscore on some is just due to following urllib2 self.process_response = {} self.process_request = {} self._any_request = {} self._any_response = {} self._handler_index_valid = True def add_handler(self, handler): if handler in self.handlers: return # XXX why does self.handlers need to be sorted? bisect.insort(self.handlers, handler) handler.add_parent(self) self._handler_index_valid = False def _maybe_reindex_handlers(self): if self._handler_index_valid: return handle_error = {} handle_open = {} process_request = {} process_response = {} any_request = set() any_response = set() unwanted = [] for handler in self.handlers: added = False for meth in methnames(handler): if meth in ["redirect_request", "do_open", "proxy_open"]: # oops, coincidental match continue if meth == "any_request": any_request.add(handler) added = True continue elif meth == "any_response": any_response.add(handler) added = True continue ii = meth.find("_") scheme = meth[:ii] condition = meth[ii+1:] if startswith(condition, "error"): jj = string.find(meth[ii+1:], "_") + ii + 1 kind = meth[jj+1:] try: kind = int(kind) except ValueError: pass lookup = handle_error.setdefault(scheme, {}) elif condition == "open": kind = scheme lookup = handle_open elif condition == "request": kind = scheme lookup = process_request elif condition == "response": kind = scheme lookup = process_response else: continue lookup.setdefault(kind, set()).add(handler) added = True if not added: unwanted.append(handler) for handler in unwanted: self.handlers.remove(handler) # sort indexed methods # XXX could be cleaned up for lookup in [process_request, process_response]: for scheme, handlers in lookup.iteritems(): lookup[scheme] = handlers for scheme, lookup in handle_error.iteritems(): for code, handlers in lookup.iteritems(): handlers = list(handlers) handlers.sort() lookup[code] = handlers for scheme, handlers in handle_open.iteritems(): handlers = list(handlers) handlers.sort() handle_open[scheme] = handlers # cache the indexes self.handle_error = handle_error self.handle_open = handle_open self.process_request = process_request self.process_response = process_response self._any_request = any_request self._any_response = any_response def _request(self, url_or_req, data): if isstringlike(url_or_req): req = Request(url_or_req, data) else: # already a urllib2.Request or mechanize.Request instance req = url_or_req if data is not None: req.add_data(data) return req def open(self, fullurl, data=None): req = self._request(fullurl, data) req_scheme = req.get_type() self._maybe_reindex_handlers() # pre-process request # XXX should we allow a Processor to change the URL scheme # of the request? request_processors = set(self.process_request.get(req_scheme, [])) request_processors.update(self._any_request) request_processors = list(request_processors) request_processors.sort() for processor in request_processors: for meth_name in ["any_request", req_scheme+"_request"]: meth = getattr(processor, meth_name, None) if meth: req = meth(req) # In Python >= 2.4, .open() supports processors already, so we must # call ._open() instead. urlopen = getattr(urllib2.OpenerDirector, "_open", urllib2.OpenerDirector.open) response = urlopen(self, req, data) # post-process response response_processors = set(self.process_response.get(req_scheme, [])) response_processors.update(self._any_response) response_processors = list(response_processors) response_processors.sort() for processor in response_processors: for meth_name in ["any_response", req_scheme+"_response"]: meth = getattr(processor, meth_name, None) if meth: response = meth(req, response) return response def error(self, proto, *args): if proto in ['http', 'https']: # XXX http[s] protocols are special-cased dict = self.handle_error['http'] # https is not different than http proto = args[2] # YUCK! meth_name = 'http_error_%s' % proto http_err = 1 orig_args = args else: dict = self.handle_error meth_name = proto + '_error' http_err = 0 args = (dict, proto, meth_name) + args result = apply(self._call_chain, args) if result: return result if http_err: args = (dict, 'default', 'http_error_default') + orig_args return apply(self._call_chain, args) def retrieve(self, fullurl, filename=None, reporthook=None, data=None): """Returns (filename, headers). For remote objects, the default filename will refer to a temporary file. """ req = self._request(fullurl, data) type_ = req.get_type() fp = self.open(req) headers = fp.info() if filename is None and type == 'file': return url2pathname(req.get_selector()), headers if filename: tfp = open(filename, 'wb') else: path = urlparse(fullurl)[2] suffix = os.path.splitext(path)[1] tfp = tempfile.TemporaryFile("wb", suffix=suffix) result = filename, headers bs = 1024*8 size = -1 read = 0 blocknum = 1 if reporthook: if headers.has_key("content-length"): size = int(headers["Content-Length"]) reporthook(0, bs, size) while 1: block = fp.read(bs) read += len(block) if reporthook: reporthook(blocknum, bs, size) blocknum = blocknum + 1 if not block: break tfp.write(block) fp.close() tfp.close() del fp del tfp if size>=0 and read