#!/usr/bin/env python # -*- coding: utf-8 -*- from __future__ import division import sys, getopt, json, os.path from pprint import pprint as pp # import BeautifulSoup relative to where this script exists include_path = os.path.dirname(os.path.abspath(os.path.dirname(sys.argv[0]))) sys.path.append(include_path + "/beautifulsoup4-4.1.0") from bs4 import BeautifulSoup def usage(level): print """ indexer.py [-h] -d -i -h: this help -d: the document file (HTML, with content in the '#content' id) to index -i: the index file, in JSON format. Will be created if it does not exist. """ sys.exit(level) class IndexerException(Exception): def __init__(self, value): self.value = value def __str__(self): return repr(self.value) # ======================================================================== filenames = [] documents = [] index_file = '' index = {} try: options, args = getopt.getopt(sys.argv[1:], "hi:") except getopt.GetoptError: usage(2) for opt, arg in options: if opt == "-h": usage(0) elif opt == "-i": index_file = arg for arg in args: filename = os.path.basename(arg.strip("/\\")) if filename in filenames: raise IndexerException( "{} already specified".format(filename) ) filenames.append(filename) documents.append({ "name": filename, "path": arg }) if len(documents) < 1: print "Error: No document filename specified" usage(1) if len(index_file) < 1: print "Error: No index filename specified" usage(1) strip_chars = " .,:;(){}[]?|" + u"\u201c" + u"\u201d" + u"\xa9" for d, doc in enumerate(documents): print("Indexing {}".format(doc["path"])) soup = BeautifulSoup(open(doc["path"])) doc["title"] = soup.title.string text = soup.find(id="content").get_text() if len(text) < 1: raise IndexerException("{} has no content".format(doc["path"])) for pos, w in enumerate(text.lower().split()): word = w.strip(strip_chars).replace('"', '').replace("'", "") if len(word) < 1: continue if word not in index: index[word] = {} if d not in index[word]: index[word][d] = [] index[word][d].append(pos) hash = { "f": documents, "i": index, } with open(index_file, "w") as zFile: zFile.write(json.dumps(hash)) # print("\nIndex:") #print("final index:") #pp(index) # print("files:") # pp(documents)