"""
rewriter: query rewriters that expand a search term into a list of
semantically related terms.
"""

from lxml import etree
import abc
import re
import requests

import gensim.models as models
import gensim.models.phrases as phrases
import gensim.models.word2vec as word2vec


class Rewriter(object):
    """
    Abstract class around a query rewriter, which takes a given term and
    rewrites it to a set of semantically related terms. This, hopefully,
    helps search engines return more, and more useful, results.
    """

    # Make it an abstract class.
    # NOTE: the ``__metaclass__`` attribute is only honoured by Python 2;
    # Python 3 ignores it. On either version the contract is enforced at
    # call time by ``rewrite`` raising ``NotImplementedError``.
    __metaclass__ = abc.ABCMeta

    def rewrite(self, term):
        """
        Rewrites a term to a list of new terms to search with.
        Abstract base!

        :param str term: a string to rewrite
        :return: a list of semantically related strings, including ``term``
        :rtype: list(str)
        :raises NotImplementedError: always; subclasses must override this.
        """
        raise NotImplementedError("Subclasses must implement!")
class ControlRewriter(Rewriter):
    """
    A rewriter that's basically a no-op. Just returns the term you give it.
    This is mostly useful for testing purposes.
    """

    def rewrite(self, term):
        """
        Rewrites a term to a list containing just itself.

        This is the degenerate case of query rewriting - the original term
        isn't actually rewritten at all. ``rewrite(x) == [x]`` for all x.

        :param str term: a string to rewrite
        :return: a list containing just ``term``
        :rtype: list(str)
        """
        return [term]
class WikipediaRewriter(Rewriter):
    """
    A class to rewrite queries using Wikipedia's Category API.
    """

    # Query URL prefix; the (URL-encoded) page title is appended to it.
    WIKI_BASE = ('https://en.wikipedia.org/w/api.php'
                 '?format=xml&action=query&prop=categories&titles=')

    # Seconds to wait for Wikipedia before giving up. Without a timeout
    # ``requests`` may block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 10

    # Words that identify a need to drop the category. Matched as
    # substrings of the lower-cased category name.
    _DROPWORDS = (
        'articles', 'wikipedia', 'accuracy', 'statements', 'magic',
        'pages', 'authors', 'editors', 'appearances', 'redirects', 'cs1',
    )

    def clean_category(self, x):
        """
        Removes every ``Category:`` marker from a category title.

        :param str x: a category title as returned by the API
        :return: the title without ``Category:``
        :rtype: str
        """
        return x.replace('Category:', '')

    def rewrite(self, term):
        """
        Given a base term, returns a list of related terms based on the
        Wikipedia category API. For example, visit your favorite Wikipedia
        page and look for the list of Categories at the very bottom of the
        page.

        Categories containing one of the drop words, or the term itself
        (case-insensitively), are discarded. If anything goes wrong while
        fetching or parsing the response, this degrades to a no-op and
        returns ``[term]``.

        :param str term: a string to rewrite
        :return: a list of semantically related strings, including ``term``
        :rtype: list(str)
        """
        # Function-scope import so the block works on Python 2 and 3
        # without touching the file-level imports.
        try:
            from urllib.parse import quote
        except ImportError:  # Python 2
            from urllib import quote

        # Percent-encode the title so characters such as '&', '#' or '+'
        # cannot corrupt the query string. Encoding to UTF-8 bytes first
        # keeps non-ASCII titles working on both Python versions.
        title_bytes = term if isinstance(term, bytes) else term.encode('utf-8')
        api_url = self.WIKI_BASE + quote(title_bytes)

        try:
            response = requests.get(api_url, timeout=self.REQUEST_TIMEOUT)
            # Parse the raw bytes: lxml rejects unicode strings that carry
            # an XML encoding declaration.
            tree = etree.fromstring(response.content)

            dropwords = list(self._DROPWORDS)
            if term:
                # Compare like with like: categories are lower-cased below.
                # An empty term is skipped because '' is a substring of
                # everything and would drop every category.
                dropwords.append(term.lower())

            related = []
            for node in tree.findall('.//cl'):
                category = self.clean_category(node.get('title')).lower()
                if not any(word in category for word in dropwords):
                    related.append(category)

            # append the original term just for completeness
            return related + [term]
        except Exception:
            # TODO more fine grained exception handling
            # this'll probably happen if the user is offline
            # and we can't connect to wikipedia
            # in this case, just make this rewriter a no-op
            return [term]
class Word2VecRewriter(Rewriter):
    """
    A class to rewrite queries using Word2Vec, an NLP package that finds
    semantically related words and phrases to inputted words and phrases.
    Word2Vec must be trained on a user-provided dataset before it is used.
    """

    # TODO use kwargs or something to make creating this less insane
    # http://stackoverflow.com/questions/1098549/proper-way-to-use-kwargs-in-python#1098556

    # Matches anything that isn't alphanumeric, underscore or whitespace.
    # Compiled once here instead of on every ``encode_term`` call.
    _NON_ALPHANUM = re.compile(r'[^\w\s]')

    def __init__(self, model_path, create=False, corpus=None, bigrams=True):
        """
        Initializes the rewriter, given a particular Word2Vec corpus.
        A good example corpus is the Wikipedia Text8Corpus.
        You only need the corpus if you are recreating the model from
        scratch.

        If ``create == True``, this generates a new Word2Vec model
        (which takes a really long time to build) and saves it to
        ``model_path``. If ``False``, this loads an existing model we
        already saved.

        :param str model_path: where to store the model files. This file
            needn't exist, but its parent folder should.
        :param bool create: True to create a new Word2Vec model, False to
            use the one stored at ``model_path``.
        :param Iterable corpus: only needed if ``create=True``. Defines a
            corpus for Word2Vec to learn from.
        :param bool bigrams: only needed if ``create=True``. If True, takes
            some more time to build a model that supports bigrams (e.g.
            `new_york`). Otherwise, it'll only support one-word searches.
            ``bigrams=True`` makes this slower but more complete.
        """
        self.model_path = model_path

        # TODO: add logic around defaulting to creating or not
        if not create:
            self.model = word2vec.Word2Vec.load(self.model_path)
            return

        # generate a new Word2Vec model... takes a while!
        # TODO optimize parameters
        if bigrams:
            # TODO save the phraser somewhere... but that requires
            # even more arguments.
            # the Phrases class lets you generate bigrams, but the
            # Phraser class is a more compact version of the same
            # TODO making the phrases takes forever, making the phraser
            # takes forever, turning it into a list takes forever... this
            # is really annoying. is there any way to speed it up?
            bigram_generator = phrases.Phraser(phrases.Phrases(corpus))
            # weird bug where the bigram generator won't work unless
            # it's turned into a list first. if you try to do it straight,
            # it'll give you total gibberish. FIXME
            transformed_corpus = list(bigram_generator[corpus])
        else:
            # no bigrams, same old corpus
            transformed_corpus = corpus

        self.model = word2vec.Word2Vec(transformed_corpus, workers=8)
        self.model.save(self.model_path)

    def rewrite(self, term):
        """
        Rewrites a term to a list of new terms to search with.
        These words are the `k` most similar words and phrases to the
        inputted term, as judged by Word2Vec. Here, ``k==10``.

        If the term isn't in the model's vocabulary, only the term itself
        is returned.

        :param str term: a string to rewrite; UTF-8 encoded bytes are
            decoded to text first
        :return: a list of semantically related strings, including ``term``
        :rtype: list(str)
        """
        # Decode only real byte strings. Calling ``.decode`` unconditionally
        # fails for every ``str`` on Python 3 and for non-ASCII ``unicode``
        # on Python 2.
        if isinstance(term, bytes):
            term = term.decode("utf8")

        # Newer gensim releases expose similarity queries on ``model.wv``
        # only; older ones offer them on the model itself.
        vectors = getattr(self.model, 'wv', self.model)

        try:
            # preprocess the term so it's more palatable to word2vec.
            # similar_by_word returns a sequence of tuples, each
            # representing a term/phrase that is close to the original
            # TODO consider choosing fewer results! or have a higher bar on
            # how related they need to be
            raw_results = vectors.similar_by_word(
                self.encode_term(term), topn=10)
        except KeyError:
            # the word wasn't found in the model... must be too niche.
            # no results then
            results = []
        else:
            # extract just the name, which is index 0,
            # and decode all the results we get from word2vec
            results = [self.decode_term(match[0]) for match in raw_results]

        # finally, tack on the original term for completeness
        return results + [term]

    def encode_term(self, term):
        """
        Converts a search term like `Hadrian's Wall` to `hadrians_wall`,
        which plays better with Word2Vec. Primarily for internal use.

        Punctuation is removed, surrounding whitespace is dropped, each run
        of whitespace becomes a single underscore, and the result is
        lowercased. Collapsing runs matters because removing punctuation
        can leave adjacent spaces behind (`rock & roll` -> `rock_roll`).

        :param str term: a search term you'd normally feed into
            `Word2VecRewriter.rewrite`.
        :return: a cleaned up version of the term, which works better in
            `rewrite`.
        :rtype: str
        """
        # remove anything that isn't alphanumeric or space
        cleaned = self._NON_ALPHANUM.sub('', term)
        # sub out whitespace for underscores, then lowercase it all
        # TODO replace numbers with words (mp3 => mp_three)
        return '_'.join(cleaned.split()).lower()

    def decode_term(self, encoded):
        """
        Converts an encoded search term into something more human readable,
        like `hadrians_wall` to `hadrians wall`.

        :param str encoded: a cleaned term from
            `Word2VecRewriter.encode_term`.
        :return: a more human-readable version of the inputted term.
        :rtype: str
        """
        # not much we can do besides replace underscores with spaces
        return encoded.replace('_', ' ')