############################################################################## # # Copyright (c) 2002 Zope Corporation and Contributors. # All Rights Reserved. # # This software is subject to the provisions of the Zope Public License, # Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution. # THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS # FOR A PARTICULAR PURPOSE. # ############################################################################## """Plug in text index for ZCatalog with relevance ranking. $Id: ZCTextIndex.py 40218 2005-11-18 14:39:19Z andreasjung $ """ from cgi import escape from Persistence import Persistent import Acquisition from Acquisition import aq_base, aq_inner, aq_parent from OFS.SimpleItem import SimpleItem from Globals import DTMLFile, InitializeClass from AccessControl.SecurityInfo import ClassSecurityInfo from AccessControl.Permissions import manage_zcatalog_indexes, search_zcatalog from zope.interface import implements from Products.PluginIndexes.common.PluggableIndex import \ PluggableIndexInterface from Products.PluginIndexes.common.util import parseIndexRequest from Products.PluginIndexes.common import safe_callable from Products.PluginIndexes.interfaces import IPluggableIndex from Products.ZCTextIndex.Lexicon import \ Lexicon, Splitter, CaseNormalizer, StopWordRemover from Products.ZCTextIndex.NBest import NBest from Products.ZCTextIndex.QueryParser import QueryParser from CosineIndex import CosineIndex from ILexicon import ILexicon as z2ILexicon from interfaces import ILexicon from interfaces import IZCLexicon from interfaces import IZCTextIndex from OkapiIndex import OkapiIndex from PipelineFactory import element_factory index_types = {'Okapi BM25 Rank':OkapiIndex, 'Cosine Measure':CosineIndex} class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem): """Persistent text index. """ __implements__ = PluggableIndexInterface implements(IZCTextIndex, IPluggableIndex) ## Magic class attributes ## meta_type = 'ZCTextIndex' manage_options = ( {'label': 'Overview', 'action': 'manage_main'}, ) query_options = ['query'] security = ClassSecurityInfo() security.declareObjectProtected(manage_zcatalog_indexes) ## Constructor ## def __init__(self, id, extra=None, caller=None, index_factory=None, field_name=None, lexicon_id=None): self.id = id # Arguments can be passed directly to the constructor or # via the silly "extra" record. self._fieldname = field_name or getattr(extra, 'doc_attr', '') or id self._indexed_attrs = self._fieldname.split(',') self._indexed_attrs = [ attr.strip() for attr in self._indexed_attrs if attr ] lexicon_id = lexicon_id or getattr(extra, 'lexicon_id', '') lexicon = getattr(caller, lexicon_id, None) if lexicon is None: raise LookupError, 'Lexicon "%s" not found' % escape(lexicon_id) if not (ILexicon.providedBy(lexicon) or z2ILexicon.isImplementedBy(lexicon)): raise ValueError('Object "%s" does not implement ' 'ZCTextIndex Lexicon interface' % lexicon.getId()) self.lexicon_id = lexicon.getId() self._v_lexicon = lexicon if index_factory is None: if extra.index_type not in index_types.keys(): raise ValueError, 'Invalid index type "%s"' % escape( extra.index_type) self._index_factory = index_types[extra.index_type] self._index_type = extra.index_type else: self._index_factory = index_factory self.index = self._index_factory(aq_base(self.getLexicon())) ## Private Methods ## security.declarePrivate('getLexicon') def getLexicon(self): """Get the lexicon for this index """ if hasattr(aq_base(self), 'lexicon'): # Fix up old ZCTextIndexes by removing direct lexicon ref # and changing it to an ID lexicon = getattr(aq_parent(aq_inner(self)), self.lexicon.getId()) self.lexicon_id = lexicon.getId() del self.lexicon if getattr(aq_base(self), 'lexicon_path', None): # Fix up slightly less old ZCTextIndexes by removing # the physical path and changing it to an ID. # There's no need to use a physical path, which otherwise # makes it difficult to move or rename ZCatalogs. self.lexicon_id = self.lexicon_path[-1] del self.lexicon_path try: return self._v_lexicon except AttributeError: lexicon = getattr(aq_parent(aq_inner(self)), self.lexicon_id) if not (ILexicon.providedBy(lexicon) or z2ILexicon.isImplementedBy(lexicon)): raise TypeError('Object "%s" is not a ZCTextIndex Lexicon' % repr(lexicon)) self._v_lexicon = lexicon return lexicon ## External methods not in the Pluggable Index API ## security.declareProtected(search_zcatalog, 'query') def query(self, query, nbest=10): """Return pair (mapping from docids to scores, num results). The num results is the total number of results before trimming to the nbest results. """ tree = QueryParser(self.getLexicon()).parseQuery(query) results = tree.executeQuery(self.index) if results is None: return [], 0 chooser = NBest(nbest) chooser.addmany(results.items()) return chooser.getbest(), len(results) ## Pluggable Index APIs ## def index_object(self, documentId, obj, threshold=None): """Wrapper for index_doc() handling indexing of multiple attributes. Enter the document with the specified documentId in the index under the terms extracted from the indexed text attributes, each of which should yield either a string or a list of strings (Unicode or otherwise) to be passed to index_doc(). """ # XXX We currently ignore subtransaction threshold # needed for backward compatibility try: fields = self._indexed_attrs except: fields = [ self._fieldname ] res = 0 all_texts = [] for attr in fields: text = getattr(obj, attr, None) if text is None: continue if safe_callable(text): text = text() if text is None: continue if text: if isinstance(text, (list, tuple, )): all_texts.extend(text) else: all_texts.append(text) # Check that we're sending only strings all_texts = filter(lambda text: isinstance(text, basestring), \ all_texts) if all_texts: return self.index.index_doc(documentId, all_texts) return res def unindex_object(self, docid): if self.index.has_doc(docid): self.index.unindex_doc(docid) def _apply_index(self, request, cid=''): """Apply query specified by request, a mapping containing the query. Returns two object on success, the resultSet containing the matching record numbers and a tuple containing the names of the fields used Returns None if request is not valid for this index. """ record = parseIndexRequest(request, self.id, self.query_options) if record.keys is None: return None query_str = ' '.join(record.keys) if not query_str: return None tree = QueryParser(self.getLexicon()).parseQuery(query_str) results = tree.executeQuery(self.index) return results, (self.id,) def getEntryForObject(self, documentId, default=None): """Return the list of words indexed for documentId""" try: word_ids = self.index.get_words(documentId) except KeyError: return default get_word = self.getLexicon().get_word return [get_word(wid) for wid in word_ids] def uniqueValues(self, name=None, withLengths=0): raise NotImplementedError ## The ZCatalog Index management screen uses these methods ## def numObjects(self): """Return number of unique words in the index""" return self.index.length() def indexSize(self): """Return the number of indexes objects """ return self.index.document_count() def clear(self): """reinitialize the index (but not the lexicon)""" try: # Remove the cached reference to the lexicon # So that it is refreshed del self._v_lexicon except (AttributeError, KeyError): pass self.index = self._index_factory(self.getLexicon()) ## User Interface Methods ## manage_main = DTMLFile('dtml/manageZCTextIndex', globals()) def getIndexSourceNames(self): """Return sequence of names of indexed attributes""" try: return self._indexed_attrs except: return [self._fieldname] def getIndexType(self): """Return index type string""" return getattr(self, '_index_type', self._index_factory.__name__) def getLexiconURL(self): """Return the url of the lexicon used by the index""" try: lex = self.getLexicon() except (KeyError, AttributeError): return None else: return lex.absolute_url() InitializeClass(ZCTextIndex) def manage_addZCTextIndex(self, id, extra=None, REQUEST=None, RESPONSE=None): """Add a text index""" if REQUEST is None: URL3 = None else: URL3 = REQUEST.URL3 return self.manage_addIndex(id, 'ZCTextIndex', extra, REQUEST, RESPONSE, URL3) manage_addZCTextIndexForm = DTMLFile('dtml/addZCTextIndex', globals()) manage_addLexiconForm = DTMLFile('dtml/addLexicon', globals()) def manage_addLexicon(self, id, title='', elements=[], REQUEST=None): """Add ZCTextIndex Lexicon""" pipeline = [] for el_record in elements: if not hasattr(el_record, 'name'): continue # Skip over records that only specify element group element = element_factory.instantiate(el_record.group, el_record.name) if element is not None: if el_record.group == 'Word Splitter': # I don't like hardcoding this, but its a simple solution # to get the splitter element first in the pipeline pipeline.insert(0, element) else: pipeline.append(element) lexicon = PLexicon(id, title, *pipeline) self._setObject(id, lexicon) if REQUEST is not None: return self.manage_main(self, REQUEST, update_menu=1) # I am borrowing the existing vocabulary permissions for now to avoid # adding new permissions. This may change when old style Vocabs go away LexiconQueryPerm = 'Query Vocabulary' LexiconMgmtPerm = 'Manage Vocabulary' class PLexicon(Lexicon, Acquisition.Implicit, SimpleItem): """Lexicon for ZCTextIndex. """ implements(IZCLexicon) meta_type = 'ZCTextIndex Lexicon' manage_options = ({'label':'Overview', 'action':'manage_main'}, {'label':'Query', 'action':'queryLexicon'}, ) + SimpleItem.manage_options security = ClassSecurityInfo() security.declareObjectProtected(LexiconQueryPerm) def __init__(self, id, title='', *pipeline): self.id = str(id) self.title = str(title) PLexicon.inheritedAttribute('__init__')(self, *pipeline) ## User Interface Methods ## def getPipelineNames(self): """Return list of names of pipeline element classes""" return [element.__class__.__name__ for element in self._pipeline] _queryLexicon = DTMLFile('dtml/queryLexicon', globals()) security.declareProtected(LexiconQueryPerm, 'queryLexicon') def queryLexicon(self, REQUEST, words=None, page=0, rows=20, cols=4): """Lexicon browser/query user interface """ if words: wids = [] for word in words: wids.extend(self.globToWordIds(word)) words = [self.get_word(wid) for wid in wids] else: words = self.words() word_count = len(words) rows = max(min(rows, 500), 1) cols = max(min(cols, 12), 1) page_count = word_count / (rows * cols) + \ (word_count % (rows * cols) > 0) page = max(min(page, page_count - 1), 0) start = rows * cols * page end = min(rows * cols * (page + 1), word_count) if word_count: words = list(words[start:end]) else: words = [] columns = [] i = 0 while i < len(words): columns.append(words[i:i + rows]) i += rows return self._queryLexicon(self, REQUEST, page=page, rows=rows, cols=cols, start_word=start+1, end_word=end, word_count=word_count, page_count=page_count, page_range=xrange(page_count), page_columns=columns) security.declareProtected(LexiconMgmtPerm, 'manage_main') manage_main = DTMLFile('dtml/manageLexicon', globals()) InitializeClass(PLexicon)