package com.nexwave.nquindexer;

import com.nexwave.nsidita.DocFileInfo;
import com.nexwave.stemmer.snowball.SnowballStemmer;
import com.nexwave.stemmer.snowball.ext.EnglishStemmer;
import com.nexwave.stemmer.snowball.ext.FrenchStemmer;
import com.nexwave.stemmer.snowball.ext.GermanStemmer;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

/* loaded from: input_file:com/nexwave/nquindexer/SaxHTMLIndex.class */
/**
 * Parses a document file, extracts its words, scores them according to the
 * element they appeared in, and accumulates the result in a shared
 * dictionary mapping each stem to a comma-separated list of
 * {@code <fileIndex>*<score>} entries.
 *
 * <p>Typical use: construct once, call {@link #init(Map)} with the dictionary
 * to fill, then call {@link #runExtractData(File, String, boolean)} once per
 * file. Each call is assigned the next file index, starting at 0.
 *
 * <p>Instances hold per-run mutable state (file counter, last word list) and
 * make no attempt at synchronization.
 */
public class SaxHTMLIndex extends SaxDocFileParser {
    // Score contributed by one occurrence of a word, by originating element.
    private static final int SCORING_FOR_H1 = 50;
    private static final int SCORING_FOR_H2 = 45;
    private static final int SCORING_FOR_H3 = 40;
    private static final int SCORING_FOR_H4 = 35;
    private static final int SCORING_FOR_H5 = 30;
    private static final int SCORING_FOR_H6 = 25;
    private static final int SCORING_FOR_BOLD = 5;
    private static final int SCORING_FOR_ITALIC = 3;
    private static final int SCORING_FOR_NORMAL_TEXT = 1;
    private static final int SCORING_FOR_KEYWORD = 100;
    private static final int SCORING_FOR_INDEXTERM = 75;

    /** Delimiter surrounding the element annotation attached to a token. */
    private static final String MARKER = "@@@";
    /** Opening part of an element annotation: {@code word@@@elem_<name>@@@}. */
    private static final String ELEMENT_MARKER = "@@@elem_";

    /** Stop-word regex used when no custom clean-up list was supplied. */
    private static final String DEFAULT_STOP_WORDS_REGEX =
            "(?i)\\bthe\\b|\\ba\\b|\\ban\\b|\\bto\\b|\\band\\b|\\bor\\b"
            + "|\\bis\\b|\\bare\\b|\\bin\\b|\\bwith\\b|\\bbe\\b|\\bcan\\b"
            + "|\\beach\\b|\\bhas\\b|\\bhave\\b|\\bof\\b|\\b\\xA9\\b|\\bnot\\b"
            + "|\\bfor\\b|\\bthis\\b|\\bas\\b|\\bit\\b|\\bhe\\b|\\bshe\\b"
            + "|\\byou\\b|\\bby\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b"
            + "|\\b-or-\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b"
            + "|\\bI\\b|\\bme\\b|\\bmy\\b";

    /** Dictionary being filled: stem -> "fileIndex*score[,fileIndex*score...]". */
    private Map<String, String> tempDico;
    /** Index assigned to the next file processed by runExtractData. */
    private int fileIndex = 0;
    /** Custom stop words (inserted verbatim into a regex), or null for the defaults. */
    private final List<String> cleanUpList;
    /** Custom punctuation alternatives (inserted verbatim into a regex), or null. */
    private final List<String> cleanUpPunctuation;
    /** Words and accumulated scores of the most recently processed file. */
    private List<WordAndScoring> wsList = null;

    /**
     * Returns the word list built by the most recent
     * {@link #runExtractData(File, String, boolean)} call.
     *
     * @return the live internal list, or {@code null} if no file has been processed yet
     */
    public List<WordAndScoring> getWsList() {
        return this.wsList;
    }

    /** Creates an indexer that uses the built-in stop-word list and no extra punctuation. */
    public SaxHTMLIndex() {
        this(null, null);
    }

    /**
     * Creates an indexer with a custom stop-word list.
     *
     * @param arrayList stop words to remove; {@code null} or empty selects the built-in list
     */
    public SaxHTMLIndex(ArrayList<String> arrayList) {
        this(arrayList, null);
    }

    /**
     * Creates an indexer with custom stop words and punctuation.
     *
     * @param arrayList  stop words to remove; {@code null} or empty selects the built-in list
     * @param arrayList2 punctuation patterns to remove; {@code null} or empty adds none
     */
    public SaxHTMLIndex(ArrayList<String> arrayList, ArrayList<String> arrayList2) {
        this.cleanUpList = arrayList;
        this.cleanUpPunctuation = arrayList2;
    }

    /**
     * Sets the dictionary that subsequent runs will add their words to.
     * Must be called before {@link #runExtractData(File, String, boolean)}.
     *
     * @param map dictionary to fill (stem -> occurrences)
     * @return always 0
     */
    public int init(Map<String, String> map) {
        this.tempDico = map;
        return 0;
    }

    /**
     * Parses {@code file}, scores its words and merges them into the dictionary
     * given to {@link #init(Map)}.
     *
     * @param file the document to index
     * @param str  language code; "ja", "zh" and "ko" are tokenized with Lucene's
     *             CJK analyzer, "en", "de" and "fr" select a Snowball stemmer,
     *             anything else (including {@code null}) is indexed unstemmed
     * @param z    whether to stem words (ignored for the CJK languages and for
     *             languages without a stemmer)
     * @return the file description created for {@code file}
     */
    public DocFileInfo runExtractData(File file, String str, boolean z) {
        this.fileDesc = new DocFileInfo(file);
        this.strbf = new StringBuffer("");
        parseDocument(file);

        String text = cleanBuffer(this.strbf).replaceAll("\\s+", " ");
        this.wsList = new ArrayList<WordAndScoring>();
        // Stem -> entry already stored in wsList, so merging is a lookup
        // instead of a scan of the whole list for every token.
        Map<String, WordAndScoring> byStem = new HashMap<String, WordAndScoring>();

        if (isCjkLanguage(str)) {
            collectCjkTerms(text, byStem);
        } else {
            collectWords(text, createStemmer(str), z, byStem);
        }

        recordInDictionary();
        this.fileIndex++;
        return this.fileDesc;
    }

    /** Tells whether the language code is one of those handled by the CJK analyzer. */
    private static boolean isCjkLanguage(String lang) {
        return "ja".equalsIgnoreCase(lang) || "zh".equalsIgnoreCase(lang) || "ko".equalsIgnoreCase(lang);
    }

    /** Returns the Snowball stemmer for the language, or null when none is available. */
    private static SnowballStemmer createStemmer(String lang) {
        if ("en".equalsIgnoreCase(lang)) {
            return new EnglishStemmer();
        }
        if ("de".equalsIgnoreCase(lang)) {
            return new GermanStemmer();
        }
        if ("fr".equalsIgnoreCase(lang)) {
            return new FrenchStemmer();
        }
        return null;
    }

    /** Tokenizes the text with the CJK analyzer; each term counts as normal text. */
    private void collectCjkTerms(String text, Map<String, WordAndScoring> byStem) {
        try {
            // Element annotations are stripped: the analyzer would split them into terms.
            TokenStream stream = new CJKAnalyzer(Version.LUCENE_30)
                    .tokenStream("", new StringReader(text.replaceAll("@@@([^\\s]*)@@@", "")));
            try {
                TermAttribute termAttribute = stream.addAttribute(TermAttribute.class);
                stream.addAttribute(OffsetAttribute.class);
                while (stream.incrementToken()) {
                    String term = termAttribute.term();
                    addOrMerge(byStem, new WordAndScoring(term, term, SCORING_FOR_NORMAL_TEXT));
                }
            } finally {
                // Release the stream (and its reader) even if tokenizing fails midway.
                stream.close();
            }
        } catch (IOException e) {
            // Best effort: terms collected before the failure are still indexed.
            System.out.println("Error tokenizing content using CJK Analyzer. IOException");
            e.printStackTrace();
        }
    }

    /** Splits the text on spaces and scores (and optionally stems) every token. */
    private void collectWords(String text, SnowballStemmer stemmer, boolean stem,
                              Map<String, WordAndScoring> byStem) {
        StringTokenizer tokens = new StringTokenizer(text, " ");
        while (tokens.hasMoreTokens()) {
            WordAndScoring scored = getWordAndScoring(tokens.nextToken(), stemmer, stem);
            if (scored != null) {
                addOrMerge(byStem, scored);
            }
        }
    }

    /**
     * Adds the candidate to wsList, or, when its stem was already seen in this
     * file, adds its score to the existing entry.
     */
    private void addOrMerge(Map<String, WordAndScoring> byStem, WordAndScoring candidate) {
        WordAndScoring existing = byStem.get(candidate.getStem());
        if (existing == null) {
            byStem.put(candidate.getStem(), candidate);
            this.wsList.add(candidate);
        } else {
            existing.setScoring(existing.getScoring() + candidate.getScoring());
        }
    }

    /** Appends this file's "fileIndex*score" entry to every stem found in it. */
    private void recordInDictionary() {
        for (WordAndScoring word : this.wsList) {
            String entry = Integer.toString(this.fileIndex) + "*" + Integer.toString(word.getScoring());
            String previous = this.tempDico.get(word.getStem());
            this.tempDico.put(word.getStem(), previous == null ? entry : previous + "," + entry);
        }
    }

    /**
     * Turns one whitespace-delimited token into a scored word.
     *
     * <p>A token carrying a complete annotation ({@code word@@@elem_<name>@@@})
     * is scored by element name; any other token is kept whole and scored as
     * normal text.
     *
     * @return the scored word, or null when an annotated token has no word part
     */
    private WordAndScoring getWordAndScoring(String token, SnowballStemmer stemmer, boolean stem) {
        int firstMarker = token.indexOf(MARKER);
        int lastMarker = token.lastIndexOf(MARKER);
        if (firstMarker == -1 || firstMarker == lastMarker) {
            // No annotation, or only half of one: index the token as it is.
            return new WordAndScoring(token, stemOf(token, stemmer, stem), SCORING_FOR_NORMAL_TEXT);
        }

        String word = token.substring(0, firstMarker);
        if (word.length() == 0) {
            return null;
        }

        int score = SCORING_FOR_NORMAL_TEXT;
        int annotationStart = token.indexOf(ELEMENT_MARKER);
        if (annotationStart >= 0) {
            int nameStart = annotationStart + ELEMENT_MARKER.length();
            // Guard against markers that are not a well-formed element
            // annotation; such tokens are scored as normal text instead of
            // failing with an index error.
            if (nameStart <= lastMarker) {
                score = scoreForElement(token.substring(nameStart, lastMarker));
            }
        }
        return new WordAndScoring(word, stemOf(word, stemmer, stem), score);
    }

    /** Returns the stem of the word, or the word itself when stemming is off or unavailable. */
    private static String stemOf(String word, SnowballStemmer stemmer, boolean stem) {
        if (stemmer != null && stem) {
            return stemmer.doStem(word);
        }
        return word;
    }

    /** Maps an element name (case-insensitive) to its score; unknown names score as normal text. */
    private static int scoreForElement(String element) {
        if ("h1".equalsIgnoreCase(element)) {
            return SCORING_FOR_H1;
        }
        if ("h2".equalsIgnoreCase(element)) {
            return SCORING_FOR_H2;
        }
        if ("h3".equalsIgnoreCase(element)) {
            return SCORING_FOR_H3;
        }
        if ("h4".equalsIgnoreCase(element)) {
            return SCORING_FOR_H4;
        }
        if ("h5".equalsIgnoreCase(element)) {
            return SCORING_FOR_H5;
        }
        if ("h6".equalsIgnoreCase(element)) {
            return SCORING_FOR_H6;
        }
        if ("em".equalsIgnoreCase(element)) {
            return SCORING_FOR_ITALIC;
        }
        if ("strong".equalsIgnoreCase(element)) {
            return SCORING_FOR_BOLD;
        }
        if ("meta_keywords".equalsIgnoreCase(element)) {
            return SCORING_FOR_KEYWORD;
        }
        if ("meta_indexterms".equalsIgnoreCase(element)) {
            return SCORING_FOR_INDEXTERM;
        }
        return SCORING_FOR_NORMAL_TEXT;
    }

    /**
     * Lower-cases the raw buffer, builds the stop-word and punctuation regexes
     * and delegates the actual clean-up to {@code minimalClean}.
     *
     * @param stringBuffer raw text collected while parsing
     * @return the cleaned text as returned by {@code minimalClean}
     */
    private String cleanBuffer(StringBuffer stringBuffer) {
        String text = stringBuffer.toString().toLowerCase();
        StringBuffer stopWords = new StringBuffer("");
        StringBuffer punctuation = new StringBuffer("");

        if (this.cleanUpList == null || this.cleanUpList.isEmpty()) {
            stopWords.append(DEFAULT_STOP_WORDS_REGEX);
            // The text is already lower case, so the notice must be matched in
            // lower case (the former mixed-case pattern could never match).
            // \S* accepts whatever the copyright sign was decoded to.
            text = text.replaceFirst("copyright\\s+\\S*\\s*1998-2007\\s+nexwave\\s+solutions\\.", " ");
        } else {
            stopWords.append("\\ba\\b");
            for (String stopWord : this.cleanUpList) {
                stopWords.append("|\\b").append(stopWord).append("\\b");
            }
        }

        if (this.cleanUpPunctuation != null && !this.cleanUpPunctuation.isEmpty()) {
            // U+3002 (ideographic full stop) is always included with custom punctuation.
            punctuation.append("\\u3002");
            for (String mark : this.cleanUpPunctuation) {
                punctuation.append("|").append(mark);
            }
        }
        return minimalClean(text, stopWords, punctuation);
    }
}
