#! /usr/bin/env python
##############################################################################
#
# Copyright (c) 2003 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Index a collection of HTML files on the filesystem.

usage: indexhtml.py [options] dir

Will create an index of all .html files in dir or its subdirectories.

options:
-f data.fs  -- the path to the filestorage datafile (default: Data.fs)
-v          -- print progress and timing information
-t N        -- commit the transaction every N documents (default: 100)
-p N        -- pack the database every N documents (default: 500)
-n N        -- stop after indexing N documents (default: no limit)
-T          -- build the old TextIndex instead of the ZCTextIndex

$Id$
"""
import os
try:
    from time import clock
except ImportError:
    # time.clock was removed in Python 3.8; perf_counter is the documented
    # replacement for interval timing.
    from time import perf_counter as clock

# Needed for ZODB.DB() in the script section below; the from-import alone
# does not bind the name ``ZODB``.
import ZODB
from ZODB.Storage.FileStorage import FileStorage
from ZODB.BTrees.IOBTree import IOBTree
import transaction

from zope.index.text.htmlsplitter import HTMLWordSplitter
from zope.index.text.lexicon import Lexicon, StopWordRemover

# NOTE: every print below passes a single parenthesized string, which parses
# and prints identically as a Python 2 statement and a Python 3 call.


def make_zc_index():
    """Return a new ZCTextIndex that indexes the "read" attribute.

    The constructor is handed two plain attribute holders: ``extra`` carries
    ``doc_attr`` and ``lexicon_id``, and ``caller`` carries the lexicon
    (presumably looked up on ``caller`` under the name given by
    ``lexicon_id`` -- confirm against the ZCTextIndex constructor).
    """
    # there's an elaborate dance necessary to construct an index
    class Struct(object):
        pass

    extra = Struct()
    extra.doc_attr = "read"
    extra.lexicon_id = "lexicon"
    caller = Struct()
    caller.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())
    # NOTE(review): ZCTextIndex is neither imported nor defined in this
    # file, so this call raises NameError as written.  Identify the module
    # that provides it and add the import.
    return ZCTextIndex("read", extra, caller)


# TODO: make a splitter more like the HTMLSplitter for TextIndex
# signature is
# Splitter(string, stop_words, encoding,
#          singlechar, indexnumbers, casefolding)

class MySplitter(object):
    """Splitter callable for the old TextIndex lexicon.

    Wraps an HTMLWordSplitter and applies the stop-word mapping passed in
    by the caller.
    """

    def __init__(self):
        self._v_splitter = HTMLWordSplitter()

    def __call__(self, text, stopdict, *args, **kwargs):
        """Split *text* into words and filter them through *stopdict*.

        Each word is replaced by ``stopdict[word]`` when present; words that
        map to a false value (e.g. None) are dropped.  Extra positional and
        keyword arguments are accepted and ignored.
        """
        words = self._v_splitter._split(text)
        mapped = [stopdict.get(word, word) for word in words]
        return [word for word in mapped if word]


def make_old_index():
    """Return an old-style TextIndex on "read" that splits with MySplitter."""
    from Products.PluginIndexes.TextIndex.TextIndex import TextIndex
    from Products.PluginIndexes.TextIndex.Lexicon import Lexicon
    from zope.index.text.stopdict import get_stopdict

    lexicon = Lexicon(get_stopdict())
    lexicon.SplitterFunc = MySplitter()
    return TextIndex("read", lexicon=lexicon)


def main(db, root, dir):
    """Index every ``.html`` file found under *dir*.

    db   -- database object; only its ``pack()`` method is used here.
    root -- mapping that receives the new index under "index" and the
            docid -> file path table under "files".
    dir  -- directory to scan; subdirectories are scanned as well.

    Reads the module globals INDEX, LIMIT, VERBOSE, TXN_INTERVAL and
    PACK_INTERVAL, which the ``__main__`` section sets from the command line.
    """
    root["index"] = index = INDEX()
    root["files"] = paths = IOBTree()
    transaction.commit()

    zodb_time = 0.0
    pack_time = 0.0

    # ``files`` doubles as a work list: directories met during the loop are
    # expanded and appended, so the same loop later visits their contents.
    files = [os.path.join(dir, name) for name in os.listdir(dir)]
    docid = 0
    t0 = clock()
    for filename in files:
        if os.path.isdir(filename):
            files += [os.path.join(filename, sub)
                      for sub in os.listdir(filename)]
            continue
        if not filename.endswith(".html"):
            continue
        docid += 1
        if LIMIT is not None and docid > LIMIT:
            break
        if VERBOSE:
            print("%5d %s" % (docid, filename))
        # The context manager closes the file even if indexing raises.
        with open(filename, "rb") as f:
            paths[docid] = filename
            index.index_object(docid, f)
        if docid % TXN_INTERVAL == 0:
            z0 = clock()
            transaction.commit()
            z1 = clock()
            zodb_time += z1 - z0
            if VERBOSE:
                print("commit took %s %s" % (z1 - z0, zodb_time))
        if docid % PACK_INTERVAL == 0:
            p0 = clock()
            db.pack()
            p1 = clock()
            # Pack time is counted as ZODB time and also tracked separately.
            zodb_time += p1 - p0
            pack_time += p1 - p0
            if VERBOSE:
                print("pack took %s %s" % (p1 - p0, pack_time))

    # Commit whatever was indexed since the last periodic commit.
    z0 = clock()
    transaction.commit()
    z1 = t1 = clock()
    total_time = t1 - t0
    zodb_time += z1 - z0
    if VERBOSE:
        print("Total index time %s" % (total_time,))
        print("Non-pack time %s" % (total_time - pack_time,))
        print("Non-ZODB time %s" % (total_time - zodb_time,))


if __name__ == "__main__":
    import sys
    import getopt

    # Defaults; main() reads these as module globals.
    VERBOSE = 0
    FSPATH = "Data.fs"
    TXN_INTERVAL = 100
    PACK_INTERVAL = 500
    LIMIT = None
    INDEX = make_zc_index
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'vf:t:p:n:T')
    except getopt.error as msg:
        print(msg)
        print(__doc__)
        sys.exit(2)

    for o, v in opts:
        if o == '-v':
            VERBOSE += 1
        if o == '-f':
            FSPATH = v
        if o == '-t':
            TXN_INTERVAL = int(v)
        if o == '-p':
            PACK_INTERVAL = int(v)
        if o == '-n':
            LIMIT = int(v)
        if o == '-T':
            INDEX = make_old_index

    if len(args) != 1:
        print("Expected one argument")
        print(__doc__)
        sys.exit(2)
    dir = args[0]

    fs = FileStorage(FSPATH)
    db = ZODB.DB(fs)
    cn = db.open()
    rt = cn.root()
    # Resolve the directory against the current working directory.
    dir = os.path.join(os.getcwd(), dir)
    print(dir)
    main(db, rt, dir)
    cn.close()
    fs.close()