##############################################################################
#
# Copyright (c) 2005 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Retrieve Static APIDOC

$Id$
"""
__docformat__ = "reStructuredText"

import base64
import os
import sys
import time
import optparse
import urllib2
import warnings
import HTMLParser

import mechanize
import zope.testbrowser.testing

from zope.app.testing import functional
from zope.app.apidoc import classregistry

VERBOSITY_MAP = {1: 'ERROR', 2: 'WARNING', 3: 'INFO'}

# A mapping of HTML elements that can contain links to the attribute that
# actually contains the link.
urltags = {
    "a": "href",
    "area": "href",
    "frame": "src",
    "iframe": "src",
    "link": "href",
    "img": "src",
    "script": "src",
}


def getMaxWidth():
    """Return the width of the terminal in columns, falling back to 80."""
    try:
        import curses
    except ImportError:
        pass
    else:
        try:
            curses.setupterm()
            cols = curses.tigetnum('cols')
            if cols > 0:
                return cols
        except curses.error:
            pass
    return 80


def cleanURL(url):
    """Strip the query string and fragment from a URL."""
    if '?' in url:
        url = url.split('?')[0]
    if '#' in url:
        url = url.split('#')[0]
    return url


def completeURL(url):
    """Add a filename to the URL, if one is not provided."""
    if url.endswith('/'):
        url += 'index.html'
    if '.' not in url.split('/')[-1]:
        url += '/index.html'
    filename = url.split('/')[-1]
    if filename.startswith('@@'):
        url = url.replace(filename, filename[2:])
    return url
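
# Illustrative expectations for the two helpers above (a sketch, not
# executed; the URLs are hypothetical):
#
#   cleanURL('http://localhost:8080/a.html?x=1#top')
#       -> 'http://localhost:8080/a.html'
#   completeURL('http://localhost:8080/++apidoc++/')
#       -> 'http://localhost:8080/++apidoc++/index.html'
#   completeURL('http://localhost:8080/@@static.html')
#       -> 'http://localhost:8080/static.html'
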

class Link(object):
    """A link in a retrieved page."""

    def __init__(self, mechLink, rootURL, referenceURL='None'):
        self.rootURL = rootURL
        self.referenceURL = referenceURL
        self.originalURL = mechLink.url
        self.callableURL = mechLink.absolute_url
        self.url = completeURL(cleanURL(mechLink.url))
        self.absoluteURL = completeURL(cleanURL(mechLink.absolute_url))

    def isLocalURL(self):
        """Determine whether the passed-in URL is local and accessible."""
        # Javascript function call
        if self.url.startswith('javascript:'):
            return False
        # Mail link
        if self.url.startswith('mailto:'):
            return False
        # External link
        if self.url.startswith('http://') and \
               not self.url.startswith(self.rootURL):
            return False
        return True

    def isApidocLink(self):
        """Make sure that only apidoc links are loaded."""
        if self.absoluteURL.startswith(self.rootURL + '++apidoc++/'):
            return True
        if self.absoluteURL.startswith(self.rootURL + '@@/'):
            return True
        return False


class OnlineBrowser(mechanize.Browser, object):
    """Browser that retrieves pages from an external Web server."""

    def setUserAndPassword(self, user, pw):
        """Specify the username and password to use for the retrieval."""
        hash = base64.encodestring(user + ':' + pw).strip()
        self.addheaders.append(('Authorization', 'Basic ' + hash))

    @property
    def contents(self):
        """Get the content of the returned data."""
        response = self.response()
        old_location = response.tell()
        response.seek(0)
        contents = response.read()
        response.seek(old_location)
        return contents


class PublisherBrowser(zope.testbrowser.testing.PublisherMechanizeBrowser,
                       object):
    """Browser that talks to the publisher directly."""

    def __init__(self, *args, **kw):
        functional.FunctionalTestSetup().setUp()
        super(PublisherBrowser, self).__init__(*args, **kw)

    def setUserAndPassword(self, user, pw):
        """Specify the username and password to use for the retrieval."""
        # The test publisher accepts the credentials unencoded.
        self.addheaders.append(('Authorization', 'Basic %s:%s' % (user, pw)))

    @property
    def contents(self):
        """Get the content of the returned data."""
        response = self.response()
        old_location = response.tell()
        response.seek(0)
        # Skip the HTTP headers that precede the body.
        for line in iter(lambda: response.readline().strip(), ''):
            pass
        contents = response.read()
        response.seek(old_location)
        return contents


class StaticAPIDocGenerator(object):
    """Static API doc maker."""

    def __init__(self, options):
        self.options = options
        self.linkQueue = []
        for url in self.options.additional_urls + [self.options.startpage]:
            link = Link(mechanize.Link(self.options.url, url, '', '', ()),
                        self.options.url)
            self.linkQueue.append(link)
        self.rootDir = os.path.join(os.path.dirname(__file__),
                                    self.options.target_dir)
        self.maxWidth = getMaxWidth() - 13
        self.needNewLine = False

    def start(self):
        """Start the retrieval of the apidoc."""
        t0 = time.time()

        self.visited = []
        self.counter = 0
        self.linkErrors = 0
        self.htmlErrors = 0

        # Turn off deprecation warnings.
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        if not os.path.exists(self.rootDir):
            os.mkdir(self.rootDir)

        if self.options.use_publisher:
            self.browser = PublisherBrowser()

        if self.options.use_webserver:
            self.browser = OnlineBrowser()

        self.browser.setUserAndPassword(self.options.username,
                                        self.options.password)
        self.browser._links_factory.urltags = urltags

        if self.options.debug:
            self.browser.addheaders.append(('X-zope-handle-errors', False))

        classregistry.IGNORE_MODULES = self.options.ignore_modules
        if self.options.import_unknown_modules:
            classregistry.__import_unknown_modules__ = True

        # Work through all links until there are no more to work on.
        self.sendMessage('Starting retrieval.')
        while self.linkQueue:
            link = self.linkQueue.pop()
            # Sometimes things are placed many times into the queue, for
            # example if the same link appears twice in a page. In those
            # cases we can check at this point whether the URL has already
            # been handled.
            if link.absoluteURL not in self.visited:
                self.showProgress(link)
                self.processLink(link)

        t1 = time.time()

        self.sendMessage("Run time: %.3f sec" % (t1 - t0))
        self.sendMessage("Links: %i" % self.counter)
        self.sendMessage("Link Retrieval Errors: %i" % self.linkErrors)
        self.sendMessage("HTML Parsing Errors: %i" % self.htmlErrors)

    def showProgress(self, link):
        self.counter += 1
        if self.options.progress:
            url = link.absoluteURL[-self.maxWidth:]
            sys.stdout.write('\r' + ' ' * (self.maxWidth + 13))
            sys.stdout.write('\rLink %5d: %s' % (self.counter, url))
            sys.stdout.flush()
            self.needNewLine = True

    def sendMessage(self, msg, verbosity=4):
        if self.options.verbosity >= verbosity:
            if self.needNewLine:
                sys.stdout.write('\n')
            sys.stdout.write(VERBOSITY_MAP.get(verbosity, 'INFO') + ': ')
            sys.stdout.write(msg)
            sys.stdout.write('\n')
            sys.stdout.flush()
            self.needNewLine = False
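
    # Illustration of the verbosity handling above (a sketch, not executed):
    # a message is printed when ``options.verbosity >= verbosity``, and the
    # level is mapped to a label via VERBOSITY_MAP, with unknown levels
    # falling back to 'INFO'. For example, when running with ``--verbosity 2``:
    #
    #   self.sendMessage('Bad URL: ...', 2)  # prints "WARNING: Bad URL: ..."
    #   self.sendMessage('Links: 42')        # level 4 > 2, so suppressed
    #
    # The default setup below uses verbosity 5, so every message is shown.
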
    def processLink(self, link):
        """Process a link: retrieve it, harvest its links, save the page."""
        url = link.absoluteURL

        # Whatever happens, we have now looked at this URL.
        self.visited.append(url)

        # Retrieve the content.
        try:
            self.browser.open(link.callableURL)
        except urllib2.HTTPError, error:
            # Something went wrong with retrieving the page.
            self.linkErrors += 1
            self.sendMessage(
                '%s (%i): %s' % (error.msg, error.code, link.callableURL), 2)
            self.sendMessage('+-> Reference: ' + link.referenceURL, 2)
            # Now set the error page as the response.
            from ClientCookie._Util import response_seek_wrapper
            self.browser._response = response_seek_wrapper(error)
        except (urllib2.URLError, ValueError):
            # We had a bad URL while running the publisher browser.
            self.linkErrors += 1
            self.sendMessage('Bad URL: ' + link.callableURL, 2)
            self.sendMessage('+-> Reference: ' + link.referenceURL, 2)
            return
        except Exception, error:
            # This should never happen outside debug mode. We really want to
            # catch all exceptions, so that we can investigate them.
            if self.options.debug:
                import pdb; pdb.set_trace()
            return

        # Get the response content.
        contents = self.browser.contents

        # Make sure the directory exists and build the file path.
        relativeURL = url.replace(self.options.url, '')
        dir = self.rootDir
        segments = relativeURL.split('/')
        filename = segments.pop()

        for segment in segments:
            dir = os.path.join(dir, segment)
            if not os.path.exists(dir):
                os.mkdir(dir)

        filepath = os.path.join(dir, filename)

        # Now retrieve all links.
        if self.browser.viewing_html():

            try:
                links = self.browser.links()
            except HTMLParser.HTMLParseError, error:
                self.htmlErrors += 1
                self.sendMessage('Failed to parse HTML: ' + url, 1)
                self.sendMessage('+-> %s: line %i, column %i' % (
                    error.msg, error.lineno, error.offset), 1)
                links = []

            links = [Link(mech_link, self.options.url, url)
                     for mech_link in links]

            for link in links:
                # Make sure we do not handle unwanted links.
                if not (link.isLocalURL() and link.isApidocLink()):
                    continue

                # Add the link to the queue.
                if link.absoluteURL not in self.visited:
                    self.linkQueue.insert(0, link)

                # Rewrite the URL to a relative path within the mirror.
                parts = ['..'] * len(segments)
                parts.append(link.absoluteURL.replace(self.options.url, ''))
                contents = contents.replace(link.originalURL, '/'.join(parts))

        # Write the data into the file.
        try:
            file = open(filepath, 'w')
            file.write(contents)
            file.close()
        except IOError:
            # The file already exists, so it is a duplicate and a bad one,
            # since the URL misses `index.html`. ReST can produce strange
            # URLs that cause this problem, and we have little control over
            # it.
            pass

        # Cleanup; this is very important, otherwise we are opening too many
        # files.
        self.browser.close()
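
# A worked example of the URL rewriting in processLink() (illustration only,
# with hypothetical paths): for a page stored at `Code/zope/app/index.html`,
# the path segments are ['Code', 'zope', 'app'], so a link pointing at
# `Interface/index.html` under the root is rewritten to
# `../../../Interface/index.html`. Every page thus refers to its neighbors
# relatively, and the mirror stays browsable straight from the file system.
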

###############################################################################
# Command-line UI

parser = optparse.OptionParser("%prog [options] TARGET_DIR")

######################################################################
# Retrieval

retrieval = optparse.OptionGroup(
    parser, "Retrieval", "Options that deal with setting up the generator")

retrieval.add_option(
    '--publisher', '-p',
    action="store_true", dest='use_publisher',
    help="""\
Use the publisher directly to retrieve the data. The program will bring up
Zope 3 for you.
""")

retrieval.add_option(
    '--webserver', '-w',
    action="store_true", dest='use_webserver',
    help="""\
Use an external Web server that is connected to Zope 3.
""")

retrieval.add_option(
    '--url', '-u',
    action="store", dest='url',
    help="""\
The URL that will be used to retrieve the HTML pages. This option is
meaningless if you are using the publisher as the backend. Also, the value of
this option should *not* include the `++apidoc++` namespace.
""")

retrieval.add_option(
    '--startpage', '-s',
    action="store", dest='startpage',
    help="""\
The startpage specifies the path (after the URL) that is used as the starting
point for retrieving the contents. The default is `++apidoc++/static.html`.
This option can be very useful for debugging, since it allows you to select
specific pages.
""")

retrieval.add_option(
    '--username', '--user',
    action="store", dest='username',
    help="""\
Username to access the Web site.
""")

retrieval.add_option(
    '--password', '--pwd',
    action="store", dest='password',
    help="""\
Password to access the Web site.
""")

retrieval.add_option(
    '--add', '-a',
    action="append", dest='additional_urls',
    help="""\
Add an additional URL to the list of URLs to retrieve. Specifying those is
sometimes necessary if the links are hidden in cryptic JavaScript code.
""")

retrieval.add_option(
    '--ignore', '-i',
    action="append", dest='ignore_modules',
    help="""\
Add modules that should be ignored during retrieval. This allows you to limit
the scope of the generated API documentation.
""")

retrieval.add_option(
    '--load-all', '-l',
    action="store_true", dest='import_unknown_modules',
    help="""\
Retrieve all referenced modules, even if they have not been imported during
the startup process.
""")

parser.add_option_group(retrieval)

######################################################################
# Reporting

reporting = optparse.OptionGroup(
    parser, "Reporting", "Options that configure the user output information.")

reporting.add_option(
    '--verbosity', '-v',
    type="int", dest='verbosity',
    help="""\
Specifies the reporting detail level.
""")

reporting.add_option(
    '--progress', '-b',
    action="store_true", dest='progress',
    help="""\
Output progress status.
""")

reporting.add_option(
    '--debug', '-d',
    action="store_true", dest='debug',
    help="""\
Run in debug mode. This will allow you to use the debugger if the publisher
experienced an error.
""")

parser.add_option_group(reporting)

######################################################################
# Command-line processing

# Default setup
default_setup_args = [
    '--verbosity', 5,
    '--publisher',
    '--url', 'http://localhost:8080/',
    '--startpage', '++apidoc++/static.html',
    '--username', 'mgr',
    '--password', 'mgrpw',
    '--progress',
    '--add', '@@/varrow.png',
    '--add', '@@/harrow.png',
    '--add', '@@/tree_images/minus.png',
    '--add', '@@/tree_images/plus.png',
    '--add', '@@/tree_images/minus_vline.png',
    '--add', '@@/tree_images/plus_vline.png',
    '--ignore', 'twisted',
    '--ignore', 'zope.app.twisted.ftp.test',
    '--load-all',
    ]


def merge_options(options, defaults):
    """Fill in every unset option from the defaults."""
    odict = options.__dict__
    for name, value in defaults.__dict__.items():
        if (value is not None) and (odict[name] is None):
            odict[name] = value


def get_options(args=None, defaults=None):

    default_setup, _ = parser.parse_args(default_setup_args)
    assert not _
    if defaults:
        defaults, _ = parser.parse_args(defaults)
        assert not _
        merge_options(defaults, default_setup)
    else:
        defaults = default_setup

    if args is None:
        args = sys.argv
    original_testrunner_args = args
    args = args[1:]
    options, positional = parser.parse_args(args)
    merge_options(options, defaults)
    options.original_testrunner_args = original_testrunner_args

    if not positional:
        parser.error("No target directory specified.")
    options.target_dir = positional.pop()

    return options

# Command-line UI
###############################################################################


def main():
    options = get_options()
    maker = StaticAPIDocGenerator(options)
    maker.start()
    sys.exit(0)


if __name__ == '__main__':
    main()
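
# Example invocations (a sketch; the script name and target directory are
# hypothetical). With the default setup above, the publisher backend and the
# mgr/mgrpw credentials are used automatically:
#
#   $ python static.py apidoc-static
#
# Against an external Web server that is already running Zope 3:
#
#   $ python static.py -w -u http://localhost:8080/ \
#         --user mgr --pwd mgrpw apidoc-static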