#! /usr/bin/env python
##############################################################################
#
# Copyright (c) 2003 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Index a collection of HTML files on the filesystem.

usage: indexhtml.py [options] dir

Will create an index of all files in dir or its subdirectories.

options:
-f data.fs  -- the path to the filestorage datafile

$Id$
"""
import os
from time import clock

from ZODB.Storage.FileStorage import FileStorage
from ZODB.BTrees.IOBTree import IOBTree
import transaction

from zope.index.text.htmlsplitter import HTMLWordSplitter
from zope.index.text.lexicon import Lexicon, StopWordRemover

def make_zc_index():
    # there's an elaborate dance necessary to construct an index
    class Struct(object):
        pass
    extra = Struct()
    extra.doc_attr = "read"
    extra.lexicon_id = "lexicon"
    caller = Struct()
    caller.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())
    return ZCTextIndex("read", extra, caller)

# TODO: make a splitter more like the HTMLSplitter for TextIndex
# signature is
# Splitter(string, stop_words, encoding,
#          singlechar, indexnumbers, casefolding)

class MySplitter(object):
    def __init__(self):
        self._v_splitter = HTMLWordSplitter()
    def __call__(self, text, stopdict, *args, **kwargs):
        words = self._v_splitter._split(text)
        def lookup(w):
            return stopdict.get(w, w)
        return filter(None, map(lookup, words))

def make_old_index():
    from Products.PluginIndexes.TextIndex.TextIndex import TextIndex
    from Products.PluginIndexes.TextIndex.Lexicon  import Lexicon
    from zope.index.text.stopdict import get_stopdict

    l = Lexicon(get_stopdict())
    l.SplitterFunc = MySplitter()
    return TextIndex("read", lexicon=l)

def main(db, root, dir):
    rt["index"] = index = INDEX()
    rt["files"] = paths = IOBTree()
    transaction.commit()

    zodb_time = 0.0
    pack_time = 0.0

    files = [os.path.join(dir, file) for file in os.listdir(dir)]
    docid = 0
    t0 = clock()
    for file in files:
        if os.path.isdir(file):
            files += [os.path.join(file, sub) for sub in os.listdir(file)]
        else:
            if not file.endswith(".html"):
                continue
            docid += 1
            if LIMIT is not None and docid > LIMIT:
                break
            if VERBOSE:
                print "%5d" % docid, file
            f = open(file, "rb")
            paths[docid] = file
            index.index_object(docid, f)
            f.close()
            if docid % TXN_INTERVAL == 0:
                z0 = clock()
                transaction.commit()
                z1 = clock()
                zodb_time += z1 - z0
                if VERBOSE:
                    print "commit took", z1 - z0, zodb_time
            if docid % PACK_INTERVAL == 0:
                p0 = clock()
                db.pack()
                p1 = clock()
                zodb_time += p1 - p0
                pack_time += p1 - p0
                if VERBOSE:
                    print "pack took", p1 - p0, pack_time
    z0 = clock()
    transaction.commit()
    z1 = t1 = clock()
    total_time = t1 - t0
    zodb_time += z1 - z0
    if VERBOSE:
        print "Total index time", total_time
        print "Non-pack time", total_time - pack_time
        print "Non-ZODB time", total_time - zodb_time

if __name__ == "__main__":
    import sys
    import getopt

    VERBOSE = 0
    FSPATH = "Data.fs"
    TXN_INTERVAL = 100
    PACK_INTERVAL = 500
    LIMIT = None
    INDEX = make_zc_index
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'vf:t:p:n:T')
    except getopt.error, msg:
        print msg
        print __doc__
        sys.exit(2)

    for o, v in opts:
        if o == '-v':
            VERBOSE += 1
        if o == '-f':
            FSPATH = v
        if o == '-t':
            TXN_INTERVAL = int(v)
        if o == '-p':
            PACK_INTERVAL = int(v)
        if o == '-n':
            LIMIT = int(v)
        if o == '-T':
            INDEX = make_old_index

    if len(args) != 1:
        print "Expected on argument"
        print __doc__
        sys.exit(2)
    dir = args[0]

    fs = FileStorage(FSPATH)
    db = ZODB.DB(fs)
    cn = db.open()
    rt = cn.root()
    dir = os.path.join(os.getcwd(), dir)
    print dir
    main(db, rt, dir)
    cn.close()
    fs.close()