indexer.py #1

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import sys, getopt, json, os.path
from pprint import pprint as pp

# import BeautifulSoup relative to where this script exists
include_path = os.path.dirname(os.path.abspath(os.path.dirname(sys.argv[0])))
sys.path.append(include_path + "/beautifulsoup4-4.1.0")
from bs4 import BeautifulSoup

def usage(level):
    print """
indexer.py [-h] -d <document file> -i <index file>
-h: this help
-d: the document file (HTML, with content in the '#content' id) to index
-i: the index file, in JSON format. Will be created if it does not exist.
"""
    sys.exit(level)


class IndexerException(Exception):
    def __init__(self, value):
        self.value = value
    def __str__(self):
        return repr(self.value)

# ========================================================================

filenames  = []
documents  = []
index_file = ''
index      = {}

try:
    options, args = getopt.getopt(sys.argv[1:], "hi:")
except getopt.GetoptError:
    usage(2)

for opt, arg in options:
    if opt == "-h":
        usage(0)
    elif opt == "-i":
        index_file = arg

for arg in args:
    filename = os.path.basename(arg.strip("/\\"))
    if filename in filenames:
        raise IndexerException(
            "{} already specified".format(filename)
        )
    filenames.append(filename)
    documents.append({ "name": filename, "path": arg })

if len(documents) < 1:
    print "Error: No document filename specified"
    usage(1)

if len(index_file) < 1:
    print "Error: No index filename specified"
    usage(1)

strip_chars = " .,:;(){}[]?|" + u"\u201c" + u"\u201d" + u"\xa9"

for d, doc in enumerate(documents):
    print("Indexing {}".format(doc["path"]))

    soup = BeautifulSoup(open(doc["path"]))
    doc["title"] = soup.title.string
    text = soup.find(id="content").get_text()
    if len(text) < 1:
        raise IndexerException("{} has no content".format(doc["path"]))

    for pos, w in enumerate(text.lower().split()):
        word = w.strip(strip_chars).replace('"', '').replace("'", "")
        if len(word) < 1:
            continue

        if word not in index:
            index[word] = {}

        if d not in index[word]:
            index[word][d] = []

        index[word][d].append(pos)

hash = {
    "f": documents,
    "i": index,
}
with open(index_file, "w") as zFile:
    zFile.write(json.dumps(hash))

# print("\nIndex:")
#print("final index:")
#pp(index)
# print("files:")
# pp(documents)
#	Change	User	Description
#1	10825	Paul Allen	Branching using paul_allen.p4convert
//guest/perforce_software/p4convert/docs/_build/perforce/indexer.py
#1	10706	Paul Allen	Add document '_build' files.