##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Widcode

A byte-aligned encoding for lists of non-negative ints, using fewer bytes
for smaller ints.  This is intended for lists of word ids (wids).  The
ordinary string .find() method can be used to find the encoded form of a
desired wid-string in an encoded wid-string.  As in UTF-8, the initial byte
of an encoding can't appear in the interior of an encoding, so find() can't
be fooled into starting a match "in the middle" of an encoding. Unlike
UTF-8, the initial byte does not tell you how many continuation bytes
follow; and there's no ASCII superset property.

Details:

+ Only the first byte of an encoding has the sign bit set.

+ The first byte has 7 bits of data.

+ Bytes beyond the first in an encoding have the sign bit clear, followed
  by 7 bits of data.

+ The first byte doesn't tell you how many continuation bytes are
  following.  You can tell by searching for the next byte with the
  high bit set (or the end of the string).

The int to be encoded can contain no more than 28 bits.

If it contains no more than 7 bits, 0abcdefg, the encoding is
    1abcdefg

If it contains 8 thru 14 bits,
    00abcdef ghijkLmn
the encoding is
    1abcdefg 0hijkLmn

Static tables _encoding and _decoding capture all encodes and decodes for
14 or fewer bits.

If it contains 15 thru 21 bits,
   000abcde fghijkLm nopqrstu
the encoding is
   1abcdefg 0hijkLmn 0opqrstu

If it contains 22 thru 28 bits,
   0000abcd efghijkL mnopqrst uvwxyzAB
the encoding is
   1abcdefg 0hijkLmn 0opqrstu 0vwxyzAB

$Id$
"""
assert 0x80**2 == 0x4000
assert 0x80**4 == 0x10000000

import re

def encode(wids):
    # Encode a list of wids as a string.
    wid2enc = _encoding
    n = len(wid2enc)
    return "".join([w < n and wid2enc[w] or _encode(w) for w in wids])

_encoding = [None] * 0x4000 # Filled later, and converted to a tuple

def _encode(w):
    assert 0x4000 <= w < 0x10000000
    b, c = divmod(w, 0x80)
    a, b = divmod(b, 0x80)
    s = chr(b) + chr(c)
    if a < 0x80:    # no more than 21 data bits
        return chr(a + 0x80) + s
    a, b = divmod(a, 0x80)
    assert a < 0x80, (w, a, b, s)  # else more than 28 data bits
    return (chr(a + 0x80) + chr(b)) + s

_prog = re.compile(r"[\x80-\xFF][\x00-\x7F]*")

def decode(code):
    # Decode a string into a list of wids.
    get = _decoding.get
    # Obscure:  while _decoding does have the key '\x80', its value is 0,
    # so the "or" here calls _decode('\x80') anyway.
    return [get(p) or _decode(p) for p in _prog.findall(code)]

_decoding = {} # Filled later

def _decode(s):
    if s == '\x80':
        # See comment in decode().  This is here to allow a trick to work.
        return 0
    if len(s) == 3:
        a, b, c = map(ord, s)
        assert a & 0x80 == 0x80 and not b & 0x80 and not c & 0x80
        return ((a & 0x7F) << 14) | (b << 7) | c
    assert len(s) == 4, `s`
    a, b, c, d = map(ord, s)
    assert a & 0x80 == 0x80 and not b & 0x80 and not c & 0x80 and not d & 0x80
    return ((a & 0x7F) << 21) | (b << 14) | (c << 7) | d

def _fill():
    global _encoding
    for i in range(0x80):
        s = chr(i + 0x80)
        _encoding[i] = s
        _decoding[s] = i
    for i in range(0x80, 0x4000):
        hi, lo = divmod(i, 0x80)
        s = chr(hi + 0x80) + chr(lo)
        _encoding[i] = s
        _decoding[s] = i
    _encoding = tuple(_encoding)

_fill()

def test():
    for i in range(2**20):
        if i % 1000 == 0: print i
        wids = [i]
        code = encode(wids)
        assert decode(code) == wids, (wids, code, decode(code))

if __name__ == "__main__":
    test()