#! /usr/bin/env python -- # -*- python -*- ############################################################################## # # Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved. # # This software is subject to the provisions of the Zope Public License, # Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution. # THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS # FOR A PARTICULAR PURPOSE # ############################################################################## '''Structured Text Manipulation Parse a structured text string into a form that can be used with structured formats, like html. Structured text is text that uses indentation and simple symbology to indicate the structure of a document. A structured string consists of a sequence of paragraphs separated by one or more blank lines. Each paragraph has a level which is defined as the minimum indentation of the paragraph. A paragraph is a sub-paragraph of another paragraph if the other paragraph is the last preceding paragraph that has a lower level. Special symbology is used to indicate special constructs: - A single-line paragraph whose immediately succeeding paragraphs are lower level is treated as a header. - A paragraph that begins with a '-', '*', or 'o' is treated as an unordered list (bullet) element. - A paragraph that begins with a sequence of digits followed by a white-space character is treated as an ordered list element. - A paragraph that begins with a sequence of sequences, where each sequence is a sequence of digits or a sequence of letters followed by a period, is treated as an ordered list element. - A paragraph with a first line that contains some text, followed by some white-space and '--' is treated as a descriptive list element. The leading text is treated as the element title. 
- Sub-paragraphs of a paragraph that ends in the word 'example' or the word 'examples', or '::' are treated as example code and are output as is. - Text enclosed in single quotes (with white-space to the left of the first quote and whitespace or punctuation to the right of the second quote) is treated as example code. - Text surrounded by '*' characters (with white-space to the left of the first '*' and whitespace or punctuation to the right of the second '*') is emphasized. - Text surrounded by '**' characters (with white-space to the left of the first '**' and whitespace or punctuation to the right of the second '**') is made strong. - Text surrounded by '_' underscore characters (with whitespace to the left and whitespace or punctuation to the right) is made underlined. - Text enclosed by double quotes followed by a colon, a URL, and concluded by punctuation plus white space, *or* just white space, is treated as a hyper link. For example: "Zope":http://www.zope.org/ is ... Is interpreted as '<a href="http://www.zope.org/">Zope</a> is ....' Note: This works for relative as well as absolute URLs. - Text enclosed by double quotes followed by a comma, one or more spaces, an absolute URL and concluded by punctuation plus white space, or just white space, is treated as a hyper link. For example: "mail me", mailto:amos@digicool.com. Is interpreted as '<a href="mailto:amos@digicool.com">mail me</a>.' - Text enclosed in brackets which consists only of letters, digits, underscores and dashes is treated as a hyper link within the document. For example: As demonstrated by Smith [12] this technique is quite effective. Is interpreted as '... by Smith <a href="#12">[12]</a> this ...'. Together with the next rule this allows easy coding of references or end notes. - Text enclosed in brackets which is preceded by the start of a line, two periods and a space is treated as a named link. For example: .. [12] "Effective Techniques" Smith, Joe ... Is interpreted as '<a name="12">[12]</a> "Effective Techniques" ...'. 
Together with the previous rule this allows easy coding of references or end notes. - A paragraph that has blocks of text enclosed in '||' is treated as a table. The text blocks correspond to table cells and table rows are denoted by newlines. By default the cells are center aligned. A cell can span more than one column by preceding a block of text with an equivalent number of cell separators '||'. Newlines and '|' cannot be a part of the cell text. For example: |||| **Ingredients** || || *Name* || *Amount* || ||Spam||10|| ||Eggs||3|| is interpreted as::
Ingredients | |
Name | Amount |
Spam | 10 |
Eggs | 3 |
\2
\3',s)
s=em.sub( r'\1\2\3',s)
return s
class HTML(StructuredText):
'''\
An HTML structured text formatter.
'''\
def __str__(self,
extra_dl=re.compile("\n%s
" % ctag(p).strip() return ('%s%s
" % ctag(p).strip() return ('%s%s
\n%s\n%s
" % ctag(t).strip() return ('%s%s
\n%s\n' % (before,ctag(p),after) def pre(self,structure,tagged=0): if not structure: return '' if tagged: r='' else: r='\n' for s in structure: r="%s%s\n\n%s" % (r,html_quote(s[0]),self.pre(s[1],1)) if not tagged: r=r.rstrip()+'\n\n' return r def table(self,before,table,after): return '%s
%s
\n%s\n' % (before,ctag(table),after) def _str(self,structure,level, # Static bullet=ts_regex.compile('[ \t\n]*[o*-][ \t\n]+\([^\0]*\)' ).match_group, example=ts_regex.compile('[\0- ]examples?:[\0- ]*$' ).search, dl=ts_regex.compile('\([^\n]+\)[ \t]+--[ \t\n]+\([^\0]*\)' ).match_group, nl=ts_regex.compile('\n').search, ol=ts_regex.compile( '[ \t]*\(\([0-9]+\|[%s]+\)[.)]\)+[ \t\n]+\([^\0]*\|$\)' % string.letters ).match_group, olp=ts_regex.compile('[ \t]*([0-9]+)[ \t\n]+\([^\0]*\|$\)' ).match_group, ): r='' for s in structure: ts_results = bullet(s[0], (1,)) if ts_results: p = ts_results[1] if s[0][-2:]=='::' and s[1]: ps=self.pre(s[1]) else: ps=self._str(s[1],level) r=self.ul(r,p,ps) continue ts_results = ol(s[0], (3,)) if ts_results: p = ts_results[1] if s[0][-2:]=='::' and s[1]: ps=self.pre(s[1]) else: ps=self._str(s[1],level) r=self.ol(r,p,ps) continue ts_results = olp(s[0], (1,)) if ts_results: p = ts_results[1] if s[0][-2:]=='::' and s[1]: ps=self.pre(s[1]) else: ps=self._str(s[1],level) r=self.ol(r,p,ps) continue ts_results = dl(s[0], (1,2)) if ts_results: t,d = ts_results[1] r=self.dl(r,t,d,self._str(s[1],level)) continue if example(s[0]) >= 0 and s[1]: # Introduce an example, using pre tags: r=self.normal(r,s[0],self.pre(s[1])) continue if s[0][-2:]=='::' and s[1]: # Introduce an example, using pre tags: r=self.normal(r,s[0][:-1],self.pre(s[1])) continue if table.create(s[0]): ## table support. r=self.table(r,table.html(),self._str(s[1],level)) continue else: if nl(s[0]) < 0 and s[1] and s[0][-1:] != ':': # Treat as a heading t=s[0] r=self.head(r,t,level, self._str(s[1],level and level+1)) else: r=self.normal(r,s[0],self._str(s[1],level)) return r def html_quote(v, character_entities=( (re.compile('&'), '&'), (re.compile("<"), '<' ), (re.compile(">"), '>' ), (re.compile('"'), '"') )): #" text=str(v) for re,name in character_entities: text=re.sub(name,text) return text def html_with_references(text, level=1): text = re.sub( r'[\0\n]\.\. 
\[([0-9_%s-]+)\]' % string.letters, r'\n [\1]', text) text = re.sub( r'([\x00- ,])\[(?P[0-9_%s-]+)\]([\x00- ,.:])' % string.letters, r'\1[\2]\3', text) text = re.sub( r'([\0- ,])\[([^]]+)\.html\]([\0- ,.:])', r'\1[\2]\3', text) return HTML(text,level=level) def main(): import sys, getopt opts,args=getopt.getopt(sys.argv[1:],'twl') if args: [infile]=args s=open(infile,'r').read() else: s=sys.stdin.read() if opts: if filter(lambda o: o[0]=='-w', opts): print 'Content-Type: text/html\n' if filter(lambda o: o[0]=='-l', opts): import locale locale.setlocale(locale.LC_ALL,"") if s[:2]=='#!': s=re.sub('^#![^\n]+','',s) mo = re.compile('([\0-\n]*\n)').match(s) if mo is not None: s = s[len(mo.group(0)) :] s=str(html_with_references(s)) if s[:4]=='