#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Add the words of one HTML document to a JSON word index.

Usage: indexer.py [-h] -d DOCUMENT -i INDEX

The text of the element with id ``content`` in DOCUMENT is split into words,
and the per-word repetition counts are merged into INDEX, which is created
if it does not exist yet.
"""
from __future__ import division
from __future__ import print_function

import errno
import getopt
import json
import os.path
import sys

# import BeautifulSoup relative to where this script exists
include_path = os.path.dirname(os.path.abspath(os.path.dirname(sys.argv[0])))
sys.path.append(include_path + "/beautifulsoup4-4.1.0")
from bs4 import BeautifulSoup


def usage(level):
    """Print the command-line help and exit the process with status `level`."""
    print("""
indexer.py [-h] -d -i

    -h: this help
    -d: the document file (HTML, with content in the '#content' id) to index
    -i: the index file, in JSON format. Will be created if it does not exist.
""")
    sys.exit(level)


class IndexerException(Exception):
    """Raised when a document cannot be added to the index."""

    def __init__(self, value):
        # Pass the value to the base class as well so that `args` is filled in.
        super(IndexerException, self).__init__(value)
        self.value = value

    def __str__(self):
        return repr(self.value)


class Indexer(object):
    """
    An indexing class.

    The index is persisted as a JSON object with three members:

    * "f": list of indexed file names; a file's position in this list is the
      identifier used for it inside "i".
    * "i": mapping of token -> {file position: repetition count}.
    * "t": mapping of file name -> document title.
    """

    # the list of characters stripped from tokens
    strip_chars = " .,:;(){}[]" + u"\u201c" + u"\u201d"

    def __init__(self, path):
        """
        Initialize the indexer. Requires an index filename, which will be read
        if it exists, and will be used for writing the completed index.
        """
        self.path = path
        self.files = []
        self.index = {}
        self.titles = {}
        # Name of the most recently added file; `thread` reads it.
        self.current = ''
        self.read()

    def read(self):
        """
        Reads the index filename. Expects JSON contents.

        A missing or empty file leaves the in-memory index untouched. Any
        other I/O error is re-raised, and malformed JSON or a missing
        "f"/"i"/"t" member propagates as the underlying exception.
        """
        try:
            with open(self.path, "r") as zfile:
                content = zfile.read()
        except IOError as e:
            # Only "no such file" is expected: the index is created on write.
            if e.errno != errno.ENOENT:
                raise
            return

        if not content:
            return

        data = json.loads(content)
        self.files = data["f"]
        self.index = data["i"]
        self.titles = data["t"]

    def write(self):
        """
        Writes the index filename. Generates a JSON file.
        """
        payload = {
            "f": self.files,
            "i": self.index,
            "t": self.titles,
        }
        with open(self.path, "w") as zfile:
            zfile.write(json.dumps(payload))

    def add(self, path):
        """
        Reads the specified filename, which is expected to be an HTML file,
        and pulls the text content of the DOM element with id '#content'.

        The file is registered under its base name, its title is recorded,
        and every token returned by `tokenize` is stored in the index with
        its repetition count. Nothing is registered unless the document
        passes all checks.

        Raises IndexerException if the file name is already indexed, or if
        the document has no '#content' element or that element has no text.
        A document without a <title> still fails on the `soup.title.string`
        lookup, as before.
        """
        filename = os.path.basename(path)
        if filename in self.files:
            raise IndexerException("{} already indexed".format(filename))

        # BeautifulSoup consumes the handle while constructing the tree, so
        # the file can be closed as soon as the constructor returns.
        with open(path) as handle:
            soup = BeautifulSoup(handle)

        title = soup.title.string
        content = soup.find(id="content")
        text = content.get_text() if content is not None else ""
        if len(text) < 1:
            raise IndexerException("{} has no content".format(filename))

        tokens = self.tokenize(text)

        # All checks passed: only now touch the persistent state.
        self.current = filename
        self.titles[filename] = title
        self.files.append(filename)

        # JSON object keys are always strings, so an index loaded by `read`
        # is keyed by "0", "1", ...; use the same form for new entries to
        # keep the in-memory index consistent. The written file is unchanged.
        file_index = str(len(self.files) - 1)

        # NOTE: `thread` implements an alternative, progressive (nested)
        # layout; the flat token -> {file: count} layout is what is stored.
        for key, count in tokens.items():
            self.index.setdefault(key, {})[file_index] = count

    def tokenize(self, text):
        """
        Splits text on whitespace, and produces hash of unique, lower-cased
        words in the text with length 3 or more characters, whose values are
        the repetition counts.

        Leading and trailing `strip_chars` are stripped from each word and
        all single and double quote characters are removed before the length
        check.
        """
        tokens = {}
        for word in text.lower().split():
            word = word.strip(self.strip_chars)
            word = word.replace('"', "").replace("'", "")
            if len(word) < 3:
                continue

            # check for stop words here

            tokens[word] = tokens.get(word, 0) + 1

        return tokens

    def thread(self, key, hash, score):
        """
        Recursive function to unroll a key into a nested hash, one character
        per level. It is intended for a progressive index whose first level
        is keyed by the first 3 characters of a token; the caller is expected
        to pass the remainder of the token as `key` and the first-level hash
        as `hash`.

        When the key is exhausted, a {"d": file position, "s": score} entry
        for the current file is appended to the "_t" list of the reached
        hash.
        """
        # when we run out of key characters, inject the document/score
        if len(key) < 1:
            hash.setdefault("_t", []).append({
                "d": self.files.index(self.current),
                "s": score,
            })
            return

        self.thread(key[1:], hash.setdefault(key[0], {}), score)


# ========================================================================

def main(argv=None):
    """
    Parse the command line, add the document to the index and save it.

    `argv` is the list of arguments without the program name; it defaults to
    `sys.argv[1:]`. Exits through `usage` on -h, on an invalid option, or
    when either -d or -i is missing.
    """
    if argv is None:
        argv = sys.argv[1:]

    document_filename = ''
    index_filename = ''

    try:
        options, _ = getopt.getopt(argv, "d:hi:")
    except getopt.GetoptError:
        usage(2)

    for opt, arg in options:
        if opt == "-h":
            usage(0)
        elif opt == "-d":
            # Echo the base name of the document being indexed.
            print("{}".format(os.path.basename(arg.strip("/\\"))))
            document_filename = arg
        elif opt == "-i":
            index_filename = arg

    if len(document_filename) < 1:
        print("Error: No document filename specified")
        usage(1)

    if len(index_filename) < 1:
        print("Error: No index filename specified")
        usage(1)

    indexer = Indexer(index_filename)
    indexer.add(document_filename)
    indexer.write()


if __name__ == "__main__":
    main()