##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################

# A byte-aligned encoding for lists of non-negative ints, using fewer bytes
# for smaller ints.  This is intended for lists of word ids (wids).  The
# ordinary string .find() method can be used to find the encoded form of a
# desired wid-string in an encoded wid-string.  As in UTF-8, the initial byte
# of an encoding can't appear in the interior of an encoding, so find() can't
# be fooled into starting a match "in the middle" of an encoding. Unlike
# UTF-8, the initial byte does not tell you how many continuation bytes
# follow; and there's no ASCII superset property.

# Details:
#
# + Only the first byte of an encoding has the sign bit set.
#
# + The first byte has 7 bits of data.
#
# + Bytes beyond the first in an encoding have the sign bit clear, followed
#   by 7 bits of data.
#
# + The first byte doesn't tell you how many continuation bytes are
#   following.  You can tell by searching for the next byte with the
#   high bit set (or the end of the string).
#
# The int to be encoded can contain no more than 28 bits.
#
# If it contains no more than 7 bits, 0abcdefg, the encoding is
#     1abcdefg
#
# If it contains 8 thru 14 bits,
#     00abcdef ghijkLmn
# the encoding is
#     1abcdefg 0hijkLmn
#
# Static tables _encoding and _decoding capture all encodes and decodes for
# 14 or fewer bits.
#
# If it contains 15 thru 21 bits,
#    000abcde fghijkLm nopqrstu
# the encoding is
#    1abcdefg 0hijkLmn 0opqrstu
#
# If it contains 22 thru 28 bits,
#    0000abcd efghijkL mnopqrst uvwxyzAB
# the encoding is
#    1abcdefg 0hijkLmn 0opqrstu 0vwxyzAB

assert 0x80**2 == 0x4000
assert 0x80**4 == 0x10000000

import re

def encode(wids):
    # Encode a list of wids as a string.
    wid2enc = _encoding
    n = len(wid2enc)
    return "".join([w < n and wid2enc[w] or _encode(w) for w in wids])

_encoding = [None] * 0x4000 # Filled later, and converted to a tuple

def _encode(w):
    assert 0x4000 <= w < 0x10000000
    b, c = divmod(w, 0x80)
    a, b = divmod(b, 0x80)
    s = chr(b) + chr(c)
    if a < 0x80:    # no more than 21 data bits
        return chr(a + 0x80) + s
    a, b = divmod(a, 0x80)
    assert a < 0x80, (w, a, b, s)  # else more than 28 data bits
    return (chr(a + 0x80) + chr(b)) + s

_prog = re.compile(r"[\x80-\xFF][\x00-\x7F]*")

def decode(code):
    # Decode a string into a list of wids.
    get = _decoding.get
    # Obscure:  while _decoding does have the key '\x80', its value is 0,
    # so the "or" here calls _decode('\x80') anyway.
    return [get(p) or _decode(p) for p in _prog.findall(code)]

_decoding = {} # Filled later

def _decode(s):
    if s == '\x80':
        # See comment in decode().  This is here to allow a trick to work.
        return 0
    if len(s) == 3:
        a, b, c = map(ord, s)
        assert a & 0x80 == 0x80 and not b & 0x80 and not c & 0x80
        return ((a & 0x7F) << 14) | (b << 7) | c
    assert len(s) == 4, `s`
    a, b, c, d = map(ord, s)
    assert a & 0x80 == 0x80 and not b & 0x80 and not c & 0x80 and not d & 0x80
    return ((a & 0x7F) << 21) | (b << 14) | (c << 7) | d

def _fill():
    global _encoding
    for i in range(0x80):
        s = chr(i + 0x80)
        _encoding[i] = s
        _decoding[s] = i
    for i in range(0x80, 0x4000):
        hi, lo = divmod(i, 0x80)
        s = chr(hi + 0x80) + chr(lo)
        _encoding[i] = s
        _decoding[s] = i
    _encoding = tuple(_encoding)

_fill()

def test():
    for i in range(2**20):
        if i % 1000 == 0: print i
        wids = [i]
        code = encode(wids)
        assert decode(code) == wids, (wids, code, decode(code))

if __name__ == "__main__":
    test()