from lxml import etree
import abc
import re
import requests
import gensim.models as models
import gensim.models.phrases as phrases
import gensim.models.word2vec as word2vec
class Rewriter(object):
    """
    Abstract class around a query rewriter, which takes a given term and
    rewrites it to a set of semantically related terms. This, hopefully,
    helps search engines return more, and more useful, results.
    """
    # make it an abstract class (Python 2 metaclass syntax, matching the
    # rest of this codebase)
    __metaclass__ = abc.ABCMeta

    # bug fix: without @abc.abstractmethod the ABCMeta metaclass enforces
    # nothing — a subclass that forgets to implement rewrite() would still
    # instantiate and only fail when rewrite() is first called.
    @abc.abstractmethod
    def rewrite(self, term):
        """
        Rewrites a term to a list of new terms to search with. Abstract base!

        :param str term: a string to rewrite
        :return: a list of semantically related strings, including ``term``
        :rtype: list(str)
        """
        raise NotImplementedError("Subclasses must implement!")
class ControlRewriter(Rewriter):
    """
    A rewriter that's basically a no-op. Just returns the term you give it.
    This is mostly useful for testing purposes.
    """

    def rewrite(self, term):
        """
        Rewrites a term to a list containing just itself. This is the
        degenerate case of query rewriting — the original term isn't
        actually rewritten at all: ``rewrite(x) == [x]`` for all x.

        :param str term: a string to rewrite
        :return: a list containing just ``term``
        :rtype: list(str)
        """
        # identity rewrite: wrap the untouched term in a singleton list
        unchanged = [term]
        return unchanged
class WikipediaRewriter(Rewriter):
    """
    A class to rewrite queries using Wikipedia's Category API.
    """
    WIKI_BASE = 'https://en.wikipedia.org/w/api.php?format=xml&action=query&prop=categories&titles='

    # Words that identify housekeeping categories we should drop.
    # Built once at class level instead of on every rewrite() call
    # (the original also listed 'articles' twice).
    DROPWORDS = ['articles', 'wikipedia', 'accuracy', 'statements',
                 'magic', 'pages', 'authors', 'editors', 'appearances',
                 'redirects', 'cs1']

    def clean_category(self, x):
        """Strips the ``Category:`` prefix from a Wikipedia category title."""
        return x.replace('Category:', '')

    def rewrite(self, term):
        """
        Given a base term, returns a list of related terms based on the
        Wikipedia category API.

        For example, visit your favorite Wikipedia page and look for the list
        of Categories at the very bottom of the page.

        :param str term: a string to rewrite
        :return: a list of semantically related strings, including ``term``
        :rtype: list(str)
        """
        api_url = self.WIKI_BASE + term
        try:
            raw_result = requests.get(api_url)
            # bug fix: parse the raw bytes (.content), not the decoded
            # .text — lxml raises ValueError for unicode strings that
            # carry an XML encoding declaration, which this API emits
            tree = etree.fromstring(raw_result.content)
            categories = [self.clean_category(x.get('title'))
                          for x in tree.findall('.//cl')]
            # drop housekeeping categories, plus the original term itself
            dropwords = self.DROPWORDS + [term]
            wikipedia_results = [c.lower() for c in categories
                                 if not any(d in c.lower() for d in dropwords)]
            # append the original term just for completeness
            return wikipedia_results + [term]
        except Exception:
            # TODO more fine grained exception handling
            # this'll probably happen if the user is offline and we can't
            # connect to wikipedia — in that case, just make this rewriter
            # a no-op
            return [term]
class Word2VecRewriter(Rewriter):
    """
    A class to rewrite queries using Word2Vec, an NLP package that finds
    semantically related words and phrases to inputted words and phrases.
    Word2Vec must be trained on a user-provided dataset before it is used.
    """
    # TODO use kwargs or something to make creating this less insane
    # http://stackoverflow.com/questions/1098549/proper-way-to-use-kwargs-in-python#1098556

    def __init__(self, model_path, create=False, corpus=None, bigrams=True):
        """
        Initializes the rewriter, given a particular Word2Vec corpus.
        A good example corpus is the Wikipedia Text8Corpus.
        You only need the corpus if you are recreating the model from scratch.

        If ``create == True``, this generates a new Word2Vec model (which
        takes a really long time to build.) If ``False``, this loads an
        existing model we already saved.

        :param str model_path: where to store the model files. This file
            needn't exist, but its parent folder should.
        :param bool create: True to create a new Word2Vec model, False to
            use the one stored at ``model_path``.
        :param Iterable corpus: only needed if ``create=True``. Defines a
            corpus for Word2Vec to learn from.
        :param bool bigrams: only needed if ``create=True``. If True, takes
            some more time to build a model that supports bigrams
            (e.g. `new_york`). Otherwise, it'll only support one-word
            searches. ``bigrams=True`` makes this slower but more complete.
        """
        self.model_path = model_path
        # TODO: add logic around defaulting to creating or not
        if create:
            # generate a new Word2Vec model... takes a while!
            # TODO optimize parameters
            if bigrams:
                # TODO save the phraser somewhere... but that requires
                # even more arguments.
                # the Phrases class lets you generate bigrams, but the
                # Phraser class is a more compact version of the same.
                # TODO making the phrases takes forever, making the phraser
                # takes forever, turning it into a list takes forever...
                # is there any way to speed it up?
                bigram_generator = phrases.Phraser(phrases.Phrases(corpus))
                # weird bug where the bigram generator won't work unless
                # it's turned into a list first. if you try to do it
                # straight, it'll give you total gibberish. FIXME
                transformed_corpus = list(bigram_generator[corpus])
            else:
                # no bigrams, same old corpus
                transformed_corpus = corpus
            self.model = word2vec.Word2Vec(transformed_corpus, workers=8)
            self.model.save(self.model_path)
        else:
            self.model = word2vec.Word2Vec.load(self.model_path)

    def rewrite(self, term):
        """
        Rewrites a term to a list of new terms to search with. These words
        are the `k` most similar words and phrases to the inputted term, as
        judged by Word2Vec. Here, ``k==10``.

        :param str term: a string to rewrite
        :return: a list of semantically related strings, including ``term``
        :rtype: list(str)
        """
        results = []
        try:
            # preprocess the term so it's more palatable to word2vec
            encoded_term = self.encode_term(term)
            # similar_by_word returns (term, similarity) tuples for terms
            # close to the original
            # TODO consider choosing fewer results, or a higher similarity bar
            raw_results = self.model.similar_by_word(encoded_term, topn=10)
            # keep just the term (index 0), decoded back to human-readable
            results = [self.decode_term(r[0]) for r in raw_results]
        except KeyError:
            # the word wasn't found in the model... must be too niche.
            # no results then
            results = []
        # bug fix: the original called term.decode("utf8") unconditionally,
        # which crashes on Python 3 str (no .decode) and on Python 2 when
        # term is already unicode. Only decode genuine byte strings.
        if isinstance(term, bytes):
            term = term.decode("utf8")
        # finally, tack on the original term to the results for completeness
        return results + [term]

    def encode_term(self, term):
        """
        Converts a search term like `Hadrian's Wall` to `hadrians_wall`,
        which plays better with Word2Vec. Primarily for internal use.

        :param str term: a search term you'd normally feed into
            `Word2VecRewriter.rewrite`.
        :return: a cleaned up version of the term, which works better in
            `rewrite`.
        :rtype: str
        """
        # remove anything that isn't alphanumeric or space
        alphanum_pattern = re.compile(r'[^\w\d\s]')
        cleaned = alphanum_pattern.sub('', term)
        # sub out spaces for underscores
        cleaned = cleaned.replace(' ', '_')
        # finally lowercase it all
        # TODO replace numbers with words (mp3 => mp_three)
        return cleaned.lower()

    def decode_term(self, encoded):
        """
        Converts an encoded search term into something more human readable,
        like `hadrians_wall` to `hadrians wall`.

        :param str encoded: a cleaned term from
            `Word2VecRewriter.encode_term`.
        :return: a more human-readable version of the inputted term.
        :rtype: str
        """
        # not much we can do besides replace underscores with spaces
        return encoded.replace('_', ' ')