#!/usr/local/bin/python

""" HTML - tag a HTML string

    (Version 0.6)

    Copyright (c) 2000, Marc-Andre Lemburg; mailto:mal@lemburg.com
    Copyright (c) 2000-2001, eGenix.com Software GmbH; mailto:info@egenix.com
    See the documentation for further information on copyrights,
    or contact the author. All Rights Reserved.
"""
import sys
import string

# constants + engine
from mx.TextTools import *

# ErrorTag
error = '*syntax error' # error tag obj

# Character sets shared by the tables below.
#
# NOTE: `set` is expected to come from the mx.TextTools star import, not
# the builtin -- it is called with a second positional argument below,
# which the builtin set() does not accept.
tagname_set = set(alpha+'-'+number)
tagattrname_set = set(alpha+'-'+number)
# NOTE(review): the second argument 0 presumably selects the complement,
# i.e. every character that is NOT a quote, '>' or blank -- confirm
# against the mxTextTools documentation.
tagvalue_set = set('"\'> ',0)
white_set = set(' \r\n\t')

# Tagging tables.  Each entry is written as
#   (tagobj, command, argument[, jump_no_match[, jump_match]])
# NOTE(review): per the mxTextTools documentation a missing jump_no_match
# means "fail the table" and a missing jump_match means "continue with the
# next entry" -- confirm against the installed version.  The jump targets
# are relative offsets, so the order of the entries must not be changed.

# A single attribute inside a tag: name, optionally followed by
# '=' and an unquoted, double quoted or single quoted value.
tagattr = (
    # name
    ('name',AllInSet,tagattrname_set),
    # with value ?
    (None,Is,'=',MatchOk),
    # skip junk
    (None,AllInSet,white_set,+1),
    # unquoted value
    ('value',AllInSet,tagvalue_set,+1,MatchOk),
    # double quoted value
    (None,Is,'"',+5),
      ('value',AllNotIn,'"',+1,+2),
      ('value',Skip,0),
      (None,Is,'"'),
      (None,Jump,To,MatchOk),
    # single quoted value
    (None,Is,'\''),
      ('value',AllNotIn,'\'',+1,+2),
      ('value',Skip,0),
      (None,Is,'\'')
    )

# Just the value part of an attribute (not referenced by the other
# tables in this file).
valuetable = (
    # ignore whitespace + '='
    (None,AllInSet,set(' \r\n\t='),+1),
    # unquoted value
    ('value',AllInSet,tagvalue_set,+1,MatchOk),
    # double quoted value
    (None,Is,'"',+5),
      ('value',AllNotIn,'"',+1,+2),
      ('value',Skip,0),
      (None,Is,'"'),
      (None,Jump,To,MatchOk),
    # single quoted value
    (None,Is,'\''),
      ('value',AllNotIn,'\'',+1,+2),
      ('value',Skip,0),
      (None,Is,'\'')
    )

# All attributes of a tag up to and including the closing '>' (not
# referenced by the other tables in this file; htmltag repeats the same
# entries inline).
allattrs = (
    # look for attributes
    (None,AllInSet,white_set,+4),
      (None,Is,'>',+1,MatchOk),
      ('tagattr',Table,tagattr),
      (None,Jump,To,-3),
    (None,Is,'>',+1,MatchOk),
    # handle incorrect attributes
    (error,AllNotIn,'> \r\n\t'),
    (None,Jump,To,-6)
    )

# One complete tag, starting at '<': closing tag marker, comment or
# other '<!...>' construct, XMP section, or a regular tag with attributes.
htmltag = (
    (None,Is,'<'),
    # is this a closing tag ?
    ('closetag',Is,'/',+1),
    # a comment ?
    ('comment',Is,'!',+8),
      (None,Word,'--',+4),
      ('text',sWordStart,FS('-->'),+1),
      (None,Skip,3),
      (None,Jump,To,MatchOk),
      # a SGML-Tag ?
      ('other',AllNotIn,'>',+1),
      (None,Is,'>'),
      (None,Jump,To,MatchOk),
    # XMP-Tag ?
    # NOTE(review): the end-tag literal was an empty string in the reviewed
    # copy (which would search for nothing and skip 0 characters); it looks
    # like the '</xmp>' markup was stripped.  Restored here in lower case to
    # go with the to_lower translation -- confirm against the upstream file.
    ('tagname',Word,'xmp',+5),
      (None,Is,'>'),
      ('text',sWordStart,FS('</xmp>',to_lower)),
      (None,Skip,len('</xmp>')),
      (None,Jump,To,MatchOk),
    # get the tag name
    ('tagname',AllInSet,tagname_set),
    # look for attributes
    (None,AllInSet,white_set,+4),
      (None,Is,'>',+1,MatchOk),
      ('tagattr',Table,tagattr),
      (None,Jump,To,-3),
    (None,Is,'>',+1,MatchOk),
    # handle incorrect attributes
    (error,AllNotIn,'> \n\r\t'),
    (None,Jump,To,-6)
    )

# Top-level table: alternates between tags and plain text until EOF.
htmltable = (
    # HTML-Tag
    ('htmltag',Table,htmltag,+1,+4),
    # not HTML, but still using this syntax: error or inside XMP-tag !
    (error,Is,'<',+3),
      (error,AllNotIn,'>',+1),
      (error,Is,'>'),
    # normal text
    ('text',AllNotIn,'<',+1),
    # end of file
    ('eof',EOF,Here,-5),
    )

if __name__ == '__main__':

    # Usage: HTML.py <htmlfile> [count]
    # Parses the file `count` times (default 1000), prints timing figures
    # and then, after the user hits return, the tag list.
    # The single-argument print(...) form is used throughout; it produces
    # the same output as the equivalent Python 2 print statement.

    if len(sys.argv) < 2:
        sys.stderr.write('usage: %s <htmlfile> [count]\n' % sys.argv[0])
        sys.exit(1)

    timer = TextTools._timer()

    # read file (and make sure the handle is closed again)
    f = open(sys.argv[1])
    try:
        text = f.read()
    finally:
        f.close()

    # optional repeat count; missing, malformed or non-positive values
    # fall back to the default
    try:
        count = int(sys.argv[2])
    except (IndexError, ValueError):
        count = 1000
    if count < 1:
        count = 1000

    print('Starting to parse the file %i times...' % count)

    # parse file
    timer.start()
    for i in range(count):
        # the case conversion stays inside the timed loop, as in the
        # original benchmark, so the reported figures remain comparable
        utext = upper(text)
        result, taglist, nextindex = tag(utext,htmltable)
        if not result:
            print(' parsing failed; aborting')
            break
    elapsed = timer.stop()[0]
    mean = elapsed/count

    # a timer reading of 0 would otherwise raise ZeroDivisionError
    if mean:
        rate = nextindex/mean
    else:
        rate = 'n/a'
    print('%s %s %s msec %s bytes/sec.' % (result, nextindex, mean*1000, rate))

    print('')
    print('Hit return to see the tags...')
    raw_input()
    print('')
    print_tags(text,taglist)