import re
import string
import sys

import convert
import language


class Query:
    """Recursive-descent parser for a small fielded boolean query language.

    A query is made of terms (bare words or "double quoted" phrases)
    combined with the operators ``&``, ``|`` and ``^``; a blank between two
    terms is treated as ``&``.  A term list may be prefixed by ``name:`` to
    select a field and parts may be grouped with parentheses.

    The parse result is a tree of ``Query.Node`` objects whose leaves are
    plain strings.  ``analyse()`` returns the raw tree, ``parse()`` returns
    the tree after per-field clean-up of the leaves (see ``edbm``).
    """

    # Field names accepted in front of ':' mapped to the canonical
    # two-letter field code stored in '=' nodes.  Several names may share
    # one code (e.g. 'ab', 'abstract', 're' and 'review' all map to 'bi').
    ids = {
        'an': 'an', 'accno': 'an',
        'db': 'db', 'database': 'db',
        'zn': 'an', 'zblno': 'an',
        'bi': 'bi', 'basic': 'bi',
        'au': 'au', 'author': 'au',
        'ai': 'ai', 'authorid': 'ai',
        'ar': 'au', 'authorref': 'au',
        'ti': 'ti', 'title': 'ti',
        'la': 'la', 'language': 'la',
        'so': 'so', 'source': 'so',
        'is': 'is', 'issn': 'is',
        'py': 'py', 'year': 'py',
        'dt': 'dt', 'doctype': 'dt',
        'cc': 'cc', 'msc': 'cc', 'class': 'cc',
        'ut': 'ut', 'kw': 'ut', 'keyword': 'ut',
        'ab': 'bi', 'abstract': 'bi',
        're': 'bi', 'review': 'bi',
        'rv': 'rv', 'reviewer': 'rv',
        'ci': 'ci', 'citation': 'ci',
        'se': 'se', 'serial': 'se',
        'vn': 'vn', 'volume': 'vn',
        'in': 'in', 'issue': 'in',
        'ps': 'ps', 'spage': 'ps',
        'pe': 'pe', 'epage': 'pe',
        'co': 'co', 'conference': 'co',
    }

    class Node:
        """One inner node of the query tree.

        ``type`` is one of:
          '='            -- field assignment; ``left`` is the field code,
                            ``right`` the sub-tree or term it applies to
          '('            -- parenthesised group; only ``left`` is used
          '&', '|', '^'  -- binary operator over ``left`` and ``right``
        Children are either further nodes or plain strings.
        """

        def __init__(self, type=None, left=None, right=None):
            self.type = type
            self.left = left
            self.right = right

        def __str__(self):
            """Return the node as a one-line query string.

            Children are formatted with ``%s``, so nested nodes are
            rendered recursively.  Unknown node types render as ''.
            """
            if self.type == '=':
                # The 'py' field is written without parentheses around
                # its value; every other field gets them.
                if self.left == 'py':
                    text = '%s = %s' % (self.left, self.right)
                else:
                    text = '%s = (%s)' % (self.left, self.right)
            elif self.type == '(':
                text = '( %s )' % (self.left)
            elif self.type == '&':
                text = '%s & %s' % (self.left, self.right)
            elif self.type == '|':
                text = '%s | %s' % (self.left, self.right)
            elif self.type == '^':
                text = '%s ^ %s' % (self.left, self.right)
            else:
                text = ''
            return text

        def __repr__(self):
            """Return an indented, multi-line dump of the sub-tree."""
            return self._repr(self, 1)

        def _repr(self, node, count, parenthesis=0):
            """Render ``node`` (a Node or a leaf value) for ``__repr__``.

            ``count`` is the nesting depth used for indentation.  When
            ``parenthesis`` is true the indent of this level ends in '( '
            to mark that the value is the content of a '(' node.
            """
            text = ''
            indent = ' '*count
            if parenthesis:
                indent = '%s( ' % (' '*(count-1))
            # isinstance() instead of comparing type() objects: the type()
            # comparison only separates nodes from strings reliably for
            # one particular class flavour.
            if isinstance(node, self.__class__):
                if node.type == '(':
                    text += self._repr(node.left, count+1, 1)
                elif node.type == '=':
                    text += self._repr(node.left, count+1)
                    text += '%s=\n' % (indent)
                    text += self._repr(node.right, count+1)
                elif node.type in ('&', '|', '^'):
                    # Tuple membership so that an unset type (None) is
                    # skipped instead of raising TypeError.
                    text += self._repr(node.left, count+1)
                    text += '%s%s\n' % (indent, node.type)
                    text += self._repr(node.right, count+1)
            else:
                text += '%s%s\n' % (indent, node)
            return text

    def __init__(self, text, init='bi'):
        """Normalise ``text`` and prepare it for parsing.

        text -- the raw query string; it is passed through
                ``convert.utf82ascii`` and ``convert.tex2ascii`` first
        init -- field code used for terms without an explicit ``name:``
                prefix and for unknown field names
        """
        self.debug = 0
        text = convert.utf82ascii(text)
        text = convert.tex2ascii(text)
        text = re.sub(';', ' ', text)
        # Drop blanks around every character that is not a word character,
        # '*' or a parenthesis (this also collapses runs of blanks).
        text = re.sub(r'\s*([^\w\*\(\)])\s*', '\\1', text)
        text = re.sub(r'\s+', ' ', text.strip())
        self.text = text
        self.init = init
        self.max = len(text)

    def analyse(self):
        """Parse the query and return the raw tree (None if empty)."""
        query, pos = self._query(0)
        return query

    def parse(self):
        """Parse the query and return the tree after ``edbm`` clean-up."""
        query, pos = self._query(0)
        query = self.edbm(query)
        return query

    def edbm(self, node, id=None):
        """Clean up the leaves of a (sub-)tree according to their field.

        node -- a Node, a leaf string, or an empty value
        id   -- field code in effect for ``node``; set from the enclosing
                '=' node while recursing

        String leaves are stripped of characters not allowed for their
        field.  Branches whose content becomes empty are pruned: an '='
        node with an empty right side becomes '', and a binary node with
        one empty side is replaced by its other side.  Nodes are modified
        in place.  Returns the cleaned node or leaf.
        """
        if not node:
            return node
        kind = self._type(node)
        if kind == '-':
            # Leaf: a plain string.
            if id == 'py':
                # Year (range): 'a-b' becomes 'a:b'; keep digits and ':'.
                node = node.replace('-', ':')
                node = re.sub(r'[^\d:]', '', node)
                node = node.strip()
            else:
                if id == 'an':
                    # Strip a 'pre' prefix, or a 'DE' prefix together
                    # with the last character.
                    if node.startswith('pre'):
                        node = node[3:]
                    elif node.startswith('DE'):
                        node = node[2:-1]
                    node = re.sub(r'[^\w\*\&\|\^\.]', ' ', node)
                elif id == 'au' or id == 'ar':
                    node = re.sub(r'[^\w\*\&\|\^\,]', ' ', node)
                    node = re.sub(r'\,', ', ', node)
                elif id == 'ai':
                    node = re.sub(r'[^\w\*\&\|\^\.\-]', ' ', node)
                elif id == 'ci':
                    node = re.sub(r'[^\w\*\&\|\^\.]', ' ', node)
                elif id == 'cc':
                    node = re.sub(r'[^\w\*\&\|\^\-]', ' ', node)
                elif id == 'la':
                    node = language.codes(node)
                else:
                    node = re.sub(r'[^\w\*\&\|\^]', ' ', node)
                # Shared clean-up: reduce runs of operators/blanks to the
                # first operator, drop leading and trailing operators,
                # squeeze blanks and turn '_' into a blank.
                node = re.sub(r'\s*([\&\|\^])[\s\&\|\^]+', ' \\1 ', node)
                node = re.sub(r'^[\s\&\|\^]+', '', node)
                node = re.sub(r'[\s\&\|\^]+$', '', node)
                node = re.sub(r'\s+', ' ', node)
                node = re.sub(r'_', ' ', node)
                node = node.strip()
        elif kind == '=':
            id = node.left
            node.right = self.edbm(node.right, id)
            if not node.right:
                node = ''
        elif kind == '(':
            node.left = self.edbm(node.left, id)
        else:
            node.left = self.edbm(node.left, id)
            node.right = self.edbm(node.right, id)
            if not node.left and not node.right:
                node = ''
            elif not node.left:
                node = node.right
            elif not node.right:
                node = node.left
        return node

    def _trace(self, label, pos):
        """Write one debug line: label, position and the text consumed."""
        sys.stdout.write('%s %s %s\n' % (label, pos, self.text[:pos]))

    def _query(self, pos):
        """Parse a list of expressions and groups starting at ``pos``.

        Returns ``(tree, new_pos)``; ``tree`` is None when only blanks
        (or nothing) remain.  A ')' ends the list and is consumed, which
        is how a recursive call for a '(' group returns to its caller.
        """
        if self.debug:
            self._trace('QUERY ', pos)
        pos = self._space(pos)
        if pos >= self.max:
            return None, pos
        expr, pos = self._expr(pos)
        while pos < self.max:
            node = expr
            operator, pos = self._operator(pos)
            if pos >= self.max:
                break
            elif self.text[pos] == '(':
                query, pos = self._query(pos+1)
                if query:
                    expr = self.Node('(', query)
            elif self.text[pos] == ')':
                return expr, pos+1
            else:
                expr, pos = self._expr(pos)
            # Join what was parsed so far with the new right-hand side.
            if expr and node:
                expr = self.Node(operator, node, expr)
            elif node:
                expr = node
        return expr, pos

    def _expr(self, pos):
        """Parse one field expression: optional ``name:`` plus its terms.

        Returns ``(tree, new_pos)``.  On the normal exit the terms are
        wrapped in an '=' node whose left side is the field code (or
        ``self.init`` if none was given).  Parsing stops, rewinding to
        before the pending operator, as soon as another ``name:`` prefix
        follows, so the caller can start a new expression there.
        """
        if self.debug:
            self._trace('EXPR ', pos)
        pos = self._space(pos)
        if pos >= self.max:
            return None, pos
        id, pos = self._id(pos)
        term, pos = self._term(pos)
        while pos < self.max:
            tmp = pos
            operator, pos = self._operator(pos)
            if self._id(pos)[0]:
                # Next comes a new 'name:' prefix -- not ours.
                pos = tmp
                break
            node = term
            if pos >= self.max:
                break
            elif self.text[pos] == '(':
                if self._id(pos+1)[0]:
                    # The group starts with its own field prefix; leave
                    # it to the caller.
                    pos = tmp
                    break
                expr, pos = self._expr(pos+1)
                if expr:
                    term = self.Node('(', expr)
            elif self.text[pos] == ')':
                # End of a group; the default field is not applied here.
                if id:
                    term = self.Node('=', id, term)
                return term, pos+1
            else:
                term, pos = self._term(pos)
                if pos == tmp:
                    # Nothing was consumed.  If we are stuck on ':' the
                    # previous term is taken as the field name.
                    if node and pos < self.max and self.text[pos] == ':':
                        pos += 1
                        id = self.ids.get(node, '')
                        continue
                    break
            if term and node:
                term = self.Node(operator, node, term)
            elif node:
                term = node
        if not id:
            id = self.init
        term = self.Node('=', id, term)
        return term, pos

    def _term(self, pos):
        """Read one term starting at ``pos``; return ``(term, new_pos)``.

        A term is either a double-quoted phrase (quotes removed; a missing
        closing quote extends the phrase to the end of the text) or a run
        of characters up to the next one of ``:&|^()"`` or blank.  Returns
        ``(None, pos)`` at the end of the text.
        """
        if self.debug:
            self._trace('TERM ', pos)
        pos = self._space(pos)
        if pos >= self.max:
            return None, pos
        tmp = pos
        if self.text[pos] == '"':
            pos += 1
            tmp = pos
            while pos < self.max and self.text[pos] != '"':
                pos += 1
            term = self.text[tmp:pos]
            if pos < self.max and self.text[pos] == '"':
                pos += 1
        else:
            while pos < self.max and self.text[pos] not in ':&|^()" ':
                pos += 1
            term = self.text[tmp:pos]
        return term, pos

    def _operator(self, pos):
        """Read an operator at ``pos``; return ``(operator, new_pos)``.

        Leading blanks are skipped.  If the next character is one of
        ``&|^`` it is consumed and returned; otherwise the implicit
        operator '&' is returned without consuming anything.  At the end
        of the text the operator is ''.
        """
        if self.debug:
            self._trace('OPERATOR', pos)
        pos = self._space(pos)
        if pos >= self.max:
            return '', pos
        operator = '&'
        # No separate check for a blank: _space() above has already
        # skipped all of them.
        if self.text[pos] in '&|^':
            operator = self.text[pos]
            pos += 1
        return operator, pos

    def _id(self, pos):
        """Read a ``name:`` field prefix at ``pos``.

        Returns ``(code, new_pos)`` with the position after the ':' when a
        run of ASCII letters (possibly empty) followed by ':' is found;
        names missing from ``ids`` yield ``self.init``.  Otherwise returns
        ``('', start)`` where ``start`` is ``pos`` after skipping blanks.
        """
        if self.debug:
            self._trace('ID ', pos)
        pos = self._space(pos)
        if pos >= self.max:
            return '', pos
        tmp = pos
        # ascii_letters rather than the locale-dependent string.letters.
        while pos < self.max and self.text[pos] in string.ascii_letters:
            pos += 1
        # Take the name before skipping blanks so that the lookup key
        # never carries trailing spaces.
        name = self.text[tmp:pos]
        pos = self._space(pos)
        if pos < self.max and self.text[pos] == ':':
            id = self.ids.get(name, self.init)
            pos += 1
            return id, pos
        return '', tmp

    def _space(self, pos):
        """Return the first position at or after ``pos`` that is no blank."""
        while pos < self.max and self.text[pos] == ' ':
            pos += 1
        return pos

    def _type(self, node):
        """Return ``node.type`` for a Node and '-' for any leaf value."""
        # Explicit isinstance() check against Node: comparing
        # type(node) with type(self) compares a Node with a Query and
        # only matches for old-style class instances.
        if isinstance(node, self.Node):
            return node.type
        return '-'