Source code for search

from whoosh.fields import Schema, ID, TEXT
from whoosh.index import create_in, open_dir
from whoosh.qparser import MultifieldParser
from whoosh.query import Every
import abc
import copy
import csv
import json
import os.path
import sys

import utils


class GenericSearchEngine(object):
    """
    An abstract class for any search engine, whether that's an external API
    you've already built or a Whoosh-based search engine you can make from
    scratch via searchbetter.

    This class encapsulates some useful functionality, like query rewriting,
    that can benefit any search engine, even one not made using SearchBetter
    tools.

    Extending this class is easy: you just need to provide a search function
    and a few other details, and we'll build in functionality from there.
    """

    # make it an abstract class
    __metaclass__ = abc.ABCMeta

    def __init__(self):
        # no rewriter yet
        # TODO let someone pass this in the constructor
        self.rewriter = None

    def set_rewriter(self, rewriter):
        """
        Sets a new query rewriter (from this_package.rewriter) as the default
        rewriter for this search engine.
        """
        self.rewriter = rewriter

    def search(self, term):
        """
        Runs a plain-English search and returns results.

        :param term {String}: a query like you'd type into Google.
        :return: a list of result objects, each of which encodes a search
            result.
        """
        if self.rewriter is None:
            # if there's no query rewriter in place, just search for the
            # original term
            return self.single_search(term)
        else:
            # there's a rewriter! use it
            rewritten_queries = self.rewriter.rewrite(term)
            results = [self.single_search(q) for q in rewritten_queries]
            # results are multi-level... flatten them
            flattened_results = utils.flatten(results)
            return self.process_raw_results(flattened_results)

    def process_raw_results(self, raw_results):
        """
        After rewriting, we'll pass the full list of results in here for you
        to clean up. This could include sorting, removing duplicates, etc.
        (What you can do, and how you do it, really depends on what kind of
        objects your search engine returns.)
        """
        # default operation is a no-op
        return raw_results

    ### ###
    ### functions you need to specify
    ### ###

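    # NOTE: the listing never shows single_search() being declared, even
    # though search() calls it. Reconstructed from that usage, the abstract
    # hook presumably looks like this (a sketch, not verbatim source):
    @abc.abstractmethod
    def single_search(self, term):
        """
        Runs a single, un-rewritten query and returns a list of results.
        Subclasses must implement this.
        """
        pass
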

class WhooshSearchEngine(GenericSearchEngine):
    """
    An abstract class for custom, Whoosh-based search engines.

    A batteries-included search engine that can operate on any given dataset.
    Uses the Whoosh library to index and run searches on the dataset.
    Has built-in support for query rewriting.
    """

    # make it an abstract class
    __metaclass__ = abc.ABCMeta

    # TODO consider making more hierarchy. This is the WhooshSearchEngine,
    # which has the cool indexing capabilities. But more generally, you
    # could have a search engine that only has to support search() -- though
    # at that point it's mostly just a useless interface. Anyway, such a
    # search engine would let the query-rewriting search engine inherit
    # from a plain search engine too.

    def __init__(self, create, search_fields, index_path):
        """
        Creates a new search engine.

        :param create {bool}: If True, recreates the index from scratch.
            If False, loads the existing index.
        :param search_fields {str[]}: An array of names of fields in the
            index that our search engine will search against.
        :param index_path {str}: A relative path to a folder where the
            Whoosh index should be stored.
        """
        super(WhooshSearchEngine, self).__init__()

        # TODO have an auto-detect feature that will determine if the
        # index exists, and depending on that creates or loads the index
        # TODO have the `create` option become `force_create`; normally
        # it'll intelligently auto-generate, but if you force it it'll
        # do what you say

        self.index_path = index_path

        # both these functions return an index
        if create:
            self.index = self.create_index()
        else:
            self.index = self.load_index()

        # set up searching: first, the query parser
        self.parser = MultifieldParser(search_fields, self.index.schema)

    def load_index(self):
        """
        Used when the index is already created. This just loads it and
        returns it for you.
        """
        index = open_dir(self.index_path)
        return index

    def create_index(self):
        """
        Creates and returns a brand-new index. This will call
        get_empty_index() behind the scenes.

        Subclasses must implement!
        """
        raise NotImplementedError("Subclasses must implement!")

    def get_empty_index(self, path, schema):
        """
        Makes an empty index file, creating the directory where it needs to
        be stored if necessary. Returns the index.

        This is called within create_index().

        TODO this breakdown is still confusing
        """
        if not os.path.exists(path):
            os.mkdir(path)

        index = create_in(path, schema)
        return index

    def get_num_documents(self):
        """
        Returns the number of documents in this search engine's corpus.
        That is, this is the size of the search engine.
        """
        query = Every()
        with self.index.searcher() as searcher:
            results = searcher.search(query)
            return len(results)

    def __len__(self):
        return self.get_num_documents()

    def process_raw_results(self, raw_results):
        # our search engine returns WhooshResult objects, so we can
        # unique/sort them

        # only keep the unique ones. this works now that we use a Result
        # object, which is hashable!
        unique_results = list(set(raw_results))

        # now let's sort all the results by their relevance score
        # (descending, b/c higher is better) so the best stuff bubbles
        # to the top
        unique_results.sort(key=lambda result: result.score, reverse=True)

        return unique_results

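    # NOTE: the listing never shows WhooshSearchEngine's single_search()
    # implementation, which search() relies on. A minimal sketch,
    # reconstructed from the parser and index set up in __init__() and the
    # WhooshResult objects that process_raw_results() expects:
    def single_search(self, term):
        """
        Runs one un-rewritten query against the Whoosh index and returns a
        list of WhooshResult objects.
        """
        with self.index.searcher() as searcher:
            query = self.parser.parse(term)
            hits = searcher.search(query)
            # copy each hit's stored fields out before the searcher closes
            return [WhooshResult(hit.fields(), hit.score) for hit in hits]
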

class UdacitySearchEngine(WhooshSearchEngine):
    """
    A search engine for the Udacity course catalog.
    """

    # DATASET_PATH = secure.DATASET_PATH_BASE+'udacity-api.json'
    # INDEX_PATH = secure.INDEX_PATH_BASE+'udacity'

    # which dataset fields we should search over
    SEARCH_FIELDS = ["title", "subtitle", "expected_learning",
                     "syllabus", "summary", "short_summary"]

    def __init__(self, dataset_path, index_path, create=False):
        """
        Creates a new Udacity search engine.

        :param dataset_path {string}: the path to the Udacity API JSON file.
        :param index_path {string}: the path to a folder where you'd like to
            store the search engine index. The given folder doesn't have to
            exist, but its *parent* folder does.
        :param create {bool}: If True, recreates the index from scratch.
            If False, loads the existing index.
        """
        # set this before calling super(), since super's __init__ may call
        # create_index(), which reads self.dataset_path
        self.dataset_path = dataset_path
        super(UdacitySearchEngine, self).__init__(
            create, self.SEARCH_FIELDS, index_path)

    def create_index(self):
        """
        Creates a new index to search the Udacity dataset. You only need to
        call this once; once the index is created, you can just load it
        again instead of creating it afresh all the time.
        """
        # load data
        udacity_data = None
        with open(self.dataset_path, 'r') as file:
            udacity_data = json.load(file)

        # set up the whoosh schema
        # TODO: use StemmingAnalyzer here so we get the built-in benefits
        # of stemming in our search engine
        # http://whoosh.readthedocs.io/en/latest/stemming.html
        schema = Schema(
            slug=ID(stored=True),
            title=TEXT(stored=True),
            subtitle=TEXT,
            expected_learning=TEXT,
            syllabus=TEXT,
            summary=TEXT,
            short_summary=TEXT
        )

        # make an index to store this stuff in
        index = self.get_empty_index(self.index_path, schema)

        # start adding documents (i.e. the courses) to the index
        writer = index.writer()
        try:
            for course in udacity_data['courses']:
                writer.add_document(
                    slug=course['slug'],
                    title=course['title'],
                    subtitle=course['subtitle'],
                    expected_learning=course['expected_learning'],
                    syllabus=course['syllabus'],
                    summary=course['summary'],
                    short_summary=course['short_summary'])
            writer.commit()
        except Exception as e:
            print(e)
            # roll back the partial index, like the other engines do
            writer.cancel()

        # all done for now
        return index

    def count_words(self):
        """
        Returns the number of unique words in the underlying Udacity dataset.
        """
        # will be useful for extracting textual content from a course later
        def extract_text_from_course(c):
            return [c[field] for field in self.SEARCH_FIELDS]

        # load data
        with open(self.dataset_path, 'r') as file:
            udacity_data = json.load(file)

        # extract just the text fields, no other markup or fields
        courses = udacity_data['courses']
        paragraphs = [extract_text_from_course(c) for c in courses]

        # these are nested... flatten into one huge string array
        raw_lines = utils.flatten(paragraphs)

        # then flatten into one huge string
        mega_string = " ".join(raw_lines)

        return utils.unique_words_in_string(mega_string)

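# Example usage for UdacitySearchEngine (a sketch; the paths below are
# hypothetical and depend on where you keep your datasets and indexes):
#
#   engine = UdacitySearchEngine(
#       'datasets/udacity-api.json', 'indexes/udacity', create=True)
#   results = engine.search('machine learning')
#   # later runs can reuse the stored index by passing create=False
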

class HarvardXSearchEngine(WhooshSearchEngine):
    """
    A search engine for the HarvardX/DART corpus of courses and course
    materials.
    """

    # INDEX_PATH = secure.INDEX_PATH_BASE+'harvardx'

    # which dataset fields we should search over
    SEARCH_FIELDS = ["display_name", "contents"]

    def __init__(self, dataset_path, index_path, create=False):
        """
        Creates a new HarvardX search engine. Searches over the HarvardX/DART
        database of all courses and course materials used in HarvardX. This
        includes videos, quizzes, etc.

        TODO: consider renaming to DART, probz

        :param dataset_path {string}: the path to the HarvardX course catalog
            CSV file.
        :param index_path {string}: the path to a folder where you'd like to
            store the search engine index. The given folder doesn't have to
            exist, but its *parent* folder does.
        :param create {bool}: If True, recreates the index from scratch.
            If False, loads the existing index.
        """
        # set this before calling super(), since super's __init__ may call
        # create_index(), which reads self.dataset_path
        self.dataset_path = dataset_path
        super(HarvardXSearchEngine, self).__init__(
            create, self.SEARCH_FIELDS, index_path)

    def create_index(self):
        """
        Creates a new index to search the dataset. You only need to call this
        once; once the index is created, you can just load it again instead
        of creating it afresh all the time.

        Returns the index object.
        """
        # load data
        # real data:
        # csvfile_path = secure.DATASET_PATH_BASE+'corpus_HarvardX_LatestCourses_based_on_2016-10-18.csv'
        # test data:
        # csvfile_path = 'datasets/test.csv'

        # only consider resources with these categories (types of content).
        # unsure about courses (b/c they have no content) and html (b/c they
        # often include messy CSS/JS in there)
        # TODO: add "html" support. requires stripping comments
        # http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
        supported_categories = ('problem', 'video', 'course')

        # set up the whoosh schema
        # TODO: use StemmingAnalyzer here so we get the built-in benefits
        # of stemming in our search engine
        # http://whoosh.readthedocs.io/en/latest/stemming.html
        schema = Schema(
            course_id=ID(stored=True),
            display_name=TEXT(stored=True),
            contents=TEXT
        )

        # make an index to store this stuff in
        index = self.get_empty_index(self.index_path, schema)

        # start adding documents (i.e. the courses) to the index.
        # first, some of the fields are HUGE, so we need to let the csv
        # reader handle them
        csv.field_size_limit(sys.maxsize)

        with open(self.dataset_path, 'r') as csvfile:
            reader = csv.DictReader(csvfile)
            writer = index.writer()
            try:
                for row in reader:
                    # ensure the content is actually a valid type;
                    # skip the row otherwise
                    if row['category'] not in supported_categories:
                        continue

                    # write
                    writer.add_document(
                        course_id=row['course_id'].decode('utf8'),
                        display_name=row['display_name'].decode('utf8'),
                        contents=row['contents'].decode('utf8'))
                writer.commit()
            except Exception as e:
                print(e)
                writer.cancel()

        # all done for now
        return index


class EdXSearchEngine(WhooshSearchEngine):
    """
    A search engine for the edX course listings.
    """

    # INDEX_PATH = secure.INDEX_PATH_BASE+'edx'

    # which dataset fields we should search over
    SEARCH_FIELDS = ["name"]

    def __init__(self, dataset_path, index_path, create=False):
        """
        Creates a new search engine that searches over edX courses.

        :param dataset_path {string}: the path to the edX course listings
            file.
        :param index_path {string}: the path to a folder where you'd like to
            store the search engine index. The given folder doesn't have to
            exist, but its *parent* folder does.
        :param create {bool}: If True, recreates the index from scratch.
            If False, loads the existing index.
        """
        # set this before calling super(), since super's __init__ may call
        # create_index(), which reads self.dataset_path
        self.dataset_path = dataset_path
        super(EdXSearchEngine, self).__init__(
            create, self.SEARCH_FIELDS, index_path)

    def create_index(self):
        """
        Creates a new index to search the dataset. You only need to call this
        once; once the index is created, you can just load it again instead
        of creating it afresh all the time.

        Returns the index object.
        """
        # load data
        # csvfile_path = secure.DATASET_PATH_BASE+'Master CourseListings - edX.csv'

        # set up the whoosh schema
        # TODO: use StemmingAnalyzer here so we get the built-in benefits
        # of stemming in our search engine
        # http://whoosh.readthedocs.io/en/latest/stemming.html
        schema = Schema(
            course_id=ID(stored=True),
            name=TEXT(stored=True)
        )

        # make an index to store this stuff in
        index = self.get_empty_index(self.index_path, schema)

        # start adding documents (i.e. the courses) to the index
        with open(self.dataset_path, 'r') as csvfile:
            reader = csv.DictReader(csvfile)
            writer = index.writer()
            try:
                for row in reader:
                    # write
                    writer.add_document(
                        course_id=row['course_id'].decode('utf8'),
                        name=row['name'].decode('utf8'))
                writer.commit()
            except Exception as e:
                print(e)
                writer.cancel()

        # all done for now
        return index

    def count_words(self):
        """
        Returns the number of unique words in the underlying edX dataset.
        """
        with open(self.dataset_path, 'r') as csvfile:
            reader = csv.DictReader(csvfile)
            # the only text field that's useful is the name field
            names = [row['name'].decode('utf8') for row in reader]

        # turn into one huge string, then count the unique words in that
        mega_string = " ".join(names)
        return utils.unique_words_in_string(mega_string)


class PrebuiltSearchEngine(WhooshSearchEngine):
    """
    A search engine for when you're handed a pre-built index and can use it
    directly without having to build anything.
    """

    def __init__(self, search_fields, index_path):
        super(PrebuiltSearchEngine, self).__init__(
            create=False, search_fields=search_fields, index_path=index_path)

    def create_index(self):
        # the index was built ahead of time; there's nothing to create
        raise NotImplementedError(
            "This search engine doesn't need to create an index! "
            "Use create=False.")

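# Example usage for PrebuiltSearchEngine (a sketch; the field name and
# index path below are hypothetical):
#
#   engine = PrebuiltSearchEngine(['title'], 'indexes/prebuilt')
#   results = engine.search('statistics')
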

class WhooshResult(object):
    """
    Encodes a search result from a Whoosh-based search engine. Basically a
    wrapper around a result dict and its relevance score (higher is better).
    """

    def __init__(self, dict_data, score):
        self.dict_data = dict_data
        self.score = score

    def get_dict(self):
        """
        Get the underlying dict data.
        """
        return self.dict_data

    def __repr__(self):
        """
        Stringified version of the result, which encodes the dict and the
        score.
        """
        return str((self.dict_data, self.score))

    # enable lookup as if this were a real dict
    def __getitem__(self, key):
        return self.dict_data[key]

    # enable hashing, so results can be de-duplicated via set()
    def __hash__(self):
        return hash(frozenset(self.dict_data.items()))

    def __eq__(self, other):
        return frozenset(self.dict_data.items()) == \
            frozenset(other.dict_data.items())

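# End-to-end example (a sketch; the paths are hypothetical, and my_rewriter
# stands in for any rewriter object from this package's rewriter module,
# i.e. anything with a rewrite(term) method):
#
#   engine = UdacitySearchEngine(
#       'datasets/udacity-api.json', 'indexes/udacity', create=False)
#   engine.set_rewriter(my_rewriter)
#   for result in engine.search('data science'):
#       # WhooshResult supports dict-style lookup on stored fields
#       print(result['title'], result.score)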