############################################################################## # # Copyright (c) 2002 Zope Corporation and Contributors. # All Rights Reserved. # # This software is subject to the provisions of the Zope Public License, # Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution. # THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS # FOR A PARTICULAR PURPOSE. # ############################################################################## """Widcode A byte-aligned encoding for lists of non-negative ints, using fewer bytes for smaller ints. This is intended for lists of word ids (wids). The ordinary string .find() method can be used to find the encoded form of a desired wid-string in an encoded wid-string. As in UTF-8, the initial byte of an encoding can't appear in the interior of an encoding, so find() can't be fooled into starting a match "in the middle" of an encoding. Unlike UTF-8, the initial byte does not tell you how many continuation bytes follow; and there's no ASCII superset property. Details: + Only the first byte of an encoding has the sign bit set. + The first byte has 7 bits of data. + Bytes beyond the first in an encoding have the sign bit clear, followed by 7 bits of data. + The first byte doesn't tell you how many continuation bytes are following. You can tell by searching for the next byte with the high bit set (or the end of the string). The int to be encoded can contain no more than 28 bits. If it contains no more than 7 bits, 0abcdefg, the encoding is 1abcdefg If it contains 8 thru 14 bits, 00abcdef ghijkLmn the encoding is 1abcdefg 0hijkLmn Static tables _encoding and _decoding capture all encodes and decodes for 14 or fewer bits. If it contains 15 thru 21 bits, 000abcde fghijkLm nopqrstu the encoding is 1abcdefg 0hijkLmn 0opqrstu If it contains 22 thru 28 bits, 0000abcd efghijkL mnopqrst uvwxyzAB the encoding is 1abcdefg 0hijkLmn 0opqrstu 0vwxyzAB $Id$ """ assert 0x80**2 == 0x4000 assert 0x80**4 == 0x10000000 import re def encode(wids): # Encode a list of wids as a string. wid2enc = _encoding n = len(wid2enc) return "".join([w < n and wid2enc[w] or _encode(w) for w in wids]) _encoding = [None] * 0x4000 # Filled later, and converted to a tuple def _encode(w): assert 0x4000 <= w < 0x10000000 b, c = divmod(w, 0x80) a, b = divmod(b, 0x80) s = chr(b) + chr(c) if a < 0x80: # no more than 21 data bits return chr(a + 0x80) + s a, b = divmod(a, 0x80) assert a < 0x80, (w, a, b, s) # else more than 28 data bits return (chr(a + 0x80) + chr(b)) + s _prog = re.compile(r"[\x80-\xFF][\x00-\x7F]*") def decode(code): # Decode a string into a list of wids. get = _decoding.get # Obscure: while _decoding does have the key '\x80', its value is 0, # so the "or" here calls _decode('\x80') anyway. return [get(p) or _decode(p) for p in _prog.findall(code)] _decoding = {} # Filled later def _decode(s): if s == '\x80': # See comment in decode(). This is here to allow a trick to work. return 0 if len(s) == 3: a, b, c = map(ord, s) assert a & 0x80 == 0x80 and not b & 0x80 and not c & 0x80 return ((a & 0x7F) << 14) | (b << 7) | c assert len(s) == 4, `s` a, b, c, d = map(ord, s) assert a & 0x80 == 0x80 and not b & 0x80 and not c & 0x80 and not d & 0x80 return ((a & 0x7F) << 21) | (b << 14) | (c << 7) | d def _fill(): global _encoding for i in range(0x80): s = chr(i + 0x80) _encoding[i] = s _decoding[s] = i for i in range(0x80, 0x4000): hi, lo = divmod(i, 0x80) s = chr(hi + 0x80) + chr(lo) _encoding[i] = s _decoding[s] = i _encoding = tuple(_encoding) _fill() def test(): for i in range(2**20): if i % 1000 == 0: print i wids = [i] code = encode(wids) assert decode(code) == wids, (wids, code, decode(code)) if __name__ == "__main__": test()