package com.gemantic.parser.extractor.impl;

import com.gemantic.parser.extractor.ArticleExtractor;
import com.gemantic.parser.htmltree.GRHtmlTree;
import com.gemantic.parser.model.Article;
import com.gemantic.parser.model.Paragraph;
import com.gemantic.parser.util.BaseUtil;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.log4j.Logger;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;

/* loaded from: input_file:com/gemantic/parser/extractor/impl/ArticleSourceExtractor.class */
/**
 * Fills in the "source" (originating publisher) of an {@link Article}.
 *
 * <p>Extraction runs in three steps, see {@link #extract}:
 * <ol>
 *   <li>scan the paragraphs at or after the title for a short line that names a
 *       source contained in the source dictionary;</li>
 *   <li>if the article still has no source, try a host-specific XPath taken from
 *       the dictionary's XPath map;</li>
 *   <li>strip label prefixes and book-title brackets from whatever was found.</li>
 * </ol>
 */
public class ArticleSourceExtractor implements ArticleExtractor {
    private static final Logger logger = Logger.getLogger(ArticleSourceExtractor.class);

    /** Shortest paragraph text (in chars) that is considered as a source line. */
    private static final int MIN_SOURCE_LINE_LENGTH = 2;

    /** Longest paragraph text (in chars) that is considered as a source line. */
    private static final int MAX_SOURCE_LINE_LENGTH = 40;

    /**
     * Label prefixes that introduce a source name inside a paragraph. Order matters:
     * when several markers occur in the same paragraph, the LAST one listed here wins.
     */
    private static final String[] SOURCE_MARKERS = {"来源：", "来源:", "来源 "};

    /**
     * Label fragments removed from the final source string, applied in this order
     * (longer labels first so that their shorter suffixes do not leave residue).
     */
    private static final String[] SOURCE_NOISE = {
        "文章来源:", "文章来源：", "来源:", "来源：", "来源", "《", "》"
    };

    private final SourceDict sourceDict;

    /**
     * Loads the source dictionary from {@code /sourcedict.txt} and its configuration
     * from {@code /source.properties}, both resolved through the class's resource loader.
     */
    public ArticleSourceExtractor() {
        this.sourceDict = new SourceDict(getClass().getResourceAsStream("/sourcedict.txt"));
        logger.info("source dict count =" + this.sourceDict.getSourceCount());
        this.sourceDict.initSourceConfig(getClass().getResourceAsStream("/source.properties"));
        logger.info("source config count =" + this.sourceDict.getConfigCount());
    }

    /**
     * Runs dictionary extraction, XPath repair and formatting on the given article.
     *
     * @param article    article to update in place
     * @param tagNode    root node the host-specific XPath is evaluated against
     * @param gRHtmlTree parsed HTML tree (not consulted by this extractor)
     * @param list       paragraphs of the page; elements are cast to {@link Paragraph}
     * @return the same {@code article} instance
     */
    @Override // com.gemantic.parser.extractor.ArticleExtractor
    public Article extract(Article article, TagNode tagNode, GRHtmlTree gRHtmlTree, List list) {
        extractSource(article, gRHtmlTree, list);
        repairSource(article, tagNode);
        formatSource(article);
        return article;
    }

    /**
     * Looks for the first paragraph at or after the title position whose trimmed text is
     * between {@value #MIN_SOURCE_LINE_LENGTH} and {@value #MAX_SOURCE_LINE_LENGTH} chars
     * and names a dictionary source, either after a source marker or as the whole line.
     *
     * @return {@code true} if a source was set on the article
     */
    private boolean extractSource(Article article, GRHtmlTree gRHtmlTree, List<?> list) {
        for (int i = 0; i < list.size(); i++) {
            Paragraph paragraph = (Paragraph) list.get(i);
            if (paragraph.getNodeID() < article.getTitlePos()) {
                continue;
            }
            String content = paragraph.getContentS();
            if (content == null) {
                continue;
            }
            String text = content.trim();
            if (text.length() < MIN_SOURCE_LINE_LENGTH || text.length() > MAX_SOURCE_LINE_LENGTH) {
                continue;
            }
            String labelled = textAfterSourceMarker(text);
            // An empty remainder (paragraph is just the label) can never be a real source.
            if (labelled.length() > 0 && this.sourceDict.existDict(labelled)) {
                article.setSource(labelled, paragraph.getNodeID());
                return true;
            }
            if (this.sourceDict.existDict(text)) {
                article.setSource(text, paragraph.getNodeID());
                return true;
            }
        }
        return false;
    }

    /**
     * Returns the trimmed text following a source marker, or {@code ""} when the text
     * contains no marker. The amount skipped is always the length of the marker that was
     * actually matched; trimming absorbs any extra whitespace after the marker.
     */
    private static String textAfterSourceMarker(String text) {
        String remainder = "";
        for (int i = 0; i < SOURCE_MARKERS.length; i++) {
            String marker = SOURCE_MARKERS[i];
            int at = text.indexOf(marker);
            if (at >= 0) {
                remainder = text.substring(at + marker.length()).trim();
            }
        }
        return remainder;
    }

    /**
     * Tries to fill a missing source from a host-specific XPath. The first XPath map
     * entry whose key is a substring of the article host is used; an already present
     * source is never overwritten.
     *
     * @return {@code false} if the article has no host, {@code true} otherwise
     *         (regardless of whether a source was actually set)
     */
    public boolean repairSource(Article article, TagNode tagNode) {
        String host = article.getHost();
        if (host == null) {
            return false;
        }
        // Cast to the interface, not HashMap: any Map implementation is acceptable here.
        Map<?, ?> xpathByHost = (Map<?, ?>) this.sourceDict.getSourceXPathMap();
        if (xpathByHost == null) {
            return true;
        }
        for (Map.Entry<?, ?> entry : xpathByHost.entrySet()) {
            String hostKey = (String) entry.getKey();
            if (host.contains(hostKey)) {
                String found = getElementsByXpath(tagNode, (String) entry.getValue());
                if (!BaseUtil.emptyStr(found) && BaseUtil.emptyStr(article.getSource())) {
                    article.setSource(found);
                }
                // Only the first matching host entry is consulted.
                return true;
            }
        }
        return true;
    }

    /**
     * Evaluates {@code str} as an XPath against {@code tagNode} and returns the text of
     * the first usable match.
     *
     * @return the text of the first matching element with non-null text, or the first
     *         plain character-data match; {@code null} if there is no match, an argument
     *         is {@code null}, or the expression cannot be evaluated
     */
    private String getElementsByXpath(TagNode tagNode, String str) {
        if (tagNode == null || str == null) {
            return null;
        }
        try {
            Object[] matches = tagNode.evaluateXPath(str);
            if (matches == null) {
                return null;
            }
            for (Object match : matches) {
                if (match instanceof TagNode) {
                    CharSequence text = ((TagNode) match).getText();
                    if (text != null) {
                        return text.toString();
                    }
                } else if (match instanceof CharSequence) {
                    // Attribute / text() expressions yield character data, not TagNodes.
                    return match.toString();
                }
            }
            return null;
        } catch (XPatherException e) {
            logger.warn("failed to evaluate source xpath: " + str, e);
            return null;
        }
    }

    /**
     * Removes source labels and book-title brackets from the article source and trims it.
     * If nothing would remain after cleaning, the existing source is left untouched.
     */
    private void formatSource(Article article) {
        String source = article.getSource();
        if (source == null) {
            return;
        }
        String cleaned = source;
        for (int i = 0; i < SOURCE_NOISE.length; i++) {
            // Literal replacement; none of these fragments is meant as a regex.
            cleaned = cleaned.replace(SOURCE_NOISE[i], "");
        }
        cleaned = cleaned.trim();
        if (BaseUtil.emptyStr(cleaned)) {
            return;
        }
        article.setSource(cleaned);
    }

    /** Returns the source dictionary loaded by the constructor. */
    public SourceDict getSourceDict() {
        return this.sourceDict;
    }
}
