##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
""" Some helper methods

$Id: ZopePageTemplate.py 71579 2006-12-17 20:26:10Z andreasjung $
"""

import re

# Matches an XML declaration carrying an explicit, double-quoted encoding,
# e.g. <?xml version="1.0" encoding="iso-8859-15"?>; group 1 is the encoding.
xml_preamble_reg = re.compile(r'^<\?xml.*?encoding="(.*?)".*?\?>', re.M)

# Matches a complete <meta http-equiv="content-type" ...> tag; group 1 is
# the whole tag, from '<meta' up to the first following '>'.
http_equiv_reg = re.compile(r'(<meta.*?http\-equiv.*?content-type.*?>)',
                            re.I|re.M|re.S)

# Applied to the text of such a meta tag; the 'charset' group (group 1) is
# the run of word characters and dashes directly following 'charset='.
http_equiv_reg2 = re.compile(r'charset.*?=.*?(?P<charset>[\w\-]*)',
                             re.I|re.M|re.S)


def encodingFromXMLPreamble(xml):
    """ Extract the encoding from a xml preamble.

        The preamble is only recognised at the very beginning of 'xml'
        (the pattern is applied with match()) and only when the encoding
        value is enclosed in double quotes.

        Return the lower-cased encoding name, or 'utf-8' if not available.
    """
    mo = xml_preamble_reg.match(xml)
    if not mo:
        return 'utf-8'
    return mo.group(1).lower()


def charsetFromMetaEquiv(html):
    """ Return the value of the 'charset' from a html document containing
        a tag like
        <meta http-equiv="content-type" content="text/html; charset=utf-8">

        The value is returned lower-cased. It may be an empty string if
        the characters directly following 'charset=' are neither word
        characters nor dashes.

        If no such meta tag or no 'charset=' is found, return None.
    """
    # first check for the <meta ...> tag
    mo = http_equiv_reg.search(html)
    if mo:
        # extract the meta tag
        meta = mo.group(1)

        # search for the charset value
        mo = http_equiv_reg2.search(meta)
        if mo:
            # return charset
            return mo.group(1).lower()

    return None


def convertToUnicode(source, content_type, preferred_encodings):
    """ Convert 'source' to unicode.

        source              -- the text to decode
        content_type        -- content type string; only the prefixes
                               'text/xml' and 'text/html' get special
                               treatment
        preferred_encodings -- encoding names to try, in order, when the
                               document does not declare its encoding

        Returns (unicode_str, source_encoding). 'source_encoding' is None
        if 'source' was converted by the final fallback without an
        explicit encoding.

        Errors raised by unicode() are propagated, except for the
        UnicodeDecodeError of a failing preferred encoding, which just
        moves on to the next candidate.
    """
    if content_type.startswith('text/xml'):
        # XML: trust the preamble; encodingFromXMLPreamble() falls back
        # to 'utf-8' itself, so this branch always returns.
        encoding = encodingFromXMLPreamble(source)
        return unicode(source, encoding), encoding

    elif content_type.startswith('text/html'):
        # HTML: use the charset of the http-equiv meta tag if one is
        # given; None or an empty value falls through to the guessing
        # below.
        encoding = charsetFromMetaEquiv(source)
        if encoding:
            return unicode(source, encoding), encoding

    # Try to detect the encoding by converting it to unicode without raising
    # exceptions. There are some smarter Python-based sniffer methods
    # available however we have to check their licenses first before
    # including them into the Zope 2 core

    for enc in preferred_encodings:
        try:
            return unicode(source, enc), enc
        except UnicodeDecodeError:
            continue

    # Last resort: convert without naming an encoding.
    return unicode(source), None