package com.gemantic.parser.extractor.impl;

import com.gemantic.parser.extractor.ArticleExtractor;
import com.gemantic.parser.htmltree.GRHtmlTree;
import com.gemantic.parser.model.Article;
import com.gemantic.parser.model.Paragraph;
import com.gemantic.parser.util.BaseUtil;
import com.gemantic.parser.util.NodeUtil;
import com.gemantic.parser.util.RegexUtil;
import com.gemantic.parser.util.TagNodeUtil;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.log4j.Logger;
import org.htmlcleaner.TagNode;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;

/* loaded from: input_file:com/gemantic/parser/extractor/impl/ArticleTitleExtractor.class */
/**
 * Fills in the HTML title, the meta keywords and the in-body title of an {@link Article}.
 *
 * <p>The in-body title is chosen among the supplied paragraphs: short paragraphs whose text
 * occurs inside the HTML {@code <title>} text, or which contain the article's anchor text,
 * become candidates; each candidate is scored and the highest-scoring one is stored on the
 * article.
 */
public class ArticleTitleExtractor implements ArticleExtractor {
    private static final Logger logger = Logger.getLogger(ArticleTitleExtractor.class);

    /** Shortest trimmed paragraph text (in chars) that is still considered as a title. */
    private static final int MIN_TITLE_LENGTH = 4;

    /** Longest trimmed paragraph text (in chars) that is still considered as a title. */
    private static final int MAX_TITLE_LENGTH = 50;

    /** Score added for each matching criterion (title match, anchor match, h1/h2 tag). */
    private static final int SCORE_BONUS = 100;

    /**
     * Orders {@code HtmlNodeItem}s by descending score, so the best candidate sorts first.
     *
     * <p>Both arguments of {@link #compare(Object, Object)} must be {@code HtmlNodeItem}
     * instances; anything else fails with a {@link ClassCastException}.
     *
     * <p>NOTE: decompiler annotations in the original source state that the access modifiers
     * of this class and its constructor were changed from package-private.
     */
    public class HtmlNodeItemCompare implements Comparator {
        public HtmlNodeItemCompare() {
        }

        /**
         * @return {@code -1} if the first item has the higher score, {@code 1} if it has the
         *         lower score, {@code 0} if both scores are equal
         */
        @Override // java.util.Comparator
        public int compare(Object obj, Object obj2) {
            int firstScore = ((HtmlNodeItem) obj).getScore();
            int secondScore = ((HtmlNodeItem) obj2).getScore();
            if (firstScore > secondScore) {
                return -1;
            }
            return firstScore < secondScore ? 1 : 0;
        }
    }

    /**
     * Sets the HTML title and keywords on {@code article} and then tries to pick the in-body
     * title from the paragraph list.
     *
     * @param article     article to populate; returned as-is after being updated
     * @param tagNode     root node used to read the HTML title and the keywords meta tag
     * @param gRHtmlTree  passed through to {@link #extractTitle}; not read by this class
     * @param list        list of {@code Paragraph} objects to search for the title
     * @return the same {@code article} instance
     */
    @Override // com.gemantic.parser.extractor.ArticleExtractor
    public Article extract(Article article, TagNode tagNode, GRHtmlTree gRHtmlTree, List list) {
        article.setHtmlTitle(TagNodeUtil.extractHtmlTitle(tagNode));
        article.setKeywords(extractHtmlKeywords(tagNode));
        extractTitle(article, gRHtmlTree, list);
        return article;
    }

    /**
     * Picks the best title candidate among {@code list} and stores it on {@code article}.
     *
     * <p>Scoring: +100 when the paragraph text occurs in the HTML title, +100 when it contains
     * the anchor text (with any "..." removed), plus whatever
     * {@link #calHtmlNodeItemScore(HtmlNodeItem)} adds. The highest score wins; candidates with
     * equal scores keep their paragraph order because {@link Collections#sort} is stable.
     *
     * @param article     article providing the HTML title and anchor; receives the title
     * @param gRHtmlTree  unused by this implementation
     * @param list        list of {@code Paragraph} objects; {@code null} is treated as empty
     * @return {@code true} if a title was set on the article, {@code false} otherwise
     */
    protected boolean extractTitle(Article article, GRHtmlTree gRHtmlTree, List list) {
        if (article.getHtmlTitle() == null || list == null) {
            return false;
        }
        String htmlTitle = BaseUtil.unescapeHtml(article.getHtmlTitle());
        if (htmlTitle == null) {
            // Nothing to match against; the original code would have failed on indexOf below.
            return false;
        }
        String anchor = BaseUtil.unescapeHtml(article.getAnchor());
        if (anchor == null) {
            anchor = "";
        }
        // Drop every literal "..." from the anchor (presumably truncation ellipses) before matching.
        anchor = anchor.replace("...", "");

        List<HtmlNodeItem> candidates = collectTitleCandidates(htmlTitle, anchor, list);
        for (HtmlNodeItem candidate : candidates) {
            calHtmlNodeItemScore(candidate);
        }
        Collections.sort(candidates, new HtmlNodeItemCompare());
        if (logger.isDebugEnabled()) {
            // Guarded so the list is only rendered to a string when debug output is wanted.
            logger.debug("==title node txt==\n" + candidates.toString());
        }
        if (candidates.isEmpty()) {
            return false;
        }
        HtmlNodeItem best = candidates.get(0);
        article.setTitle(best.getTagValue(), best.getPos());
        return true;
    }

    /**
     * Builds the list of title candidates with their initial match scores.
     *
     * @param htmlTitle  unescaped HTML title text, never {@code null}
     * @param anchor     unescaped anchor text without "...", never {@code null}, may be empty
     * @param paragraphs list of {@code Paragraph} objects, never {@code null}
     */
    private List<HtmlNodeItem> collectTitleCandidates(String htmlTitle, String anchor, List paragraphs) {
        List<HtmlNodeItem> candidates = new ArrayList<HtmlNodeItem>();
        for (int i = 0; i < paragraphs.size(); i++) {
            Paragraph paragraph = (Paragraph) paragraphs.get(i);
            if (paragraph == null || paragraph.getContentS() == null) {
                continue;
            }
            String text = paragraph.getContentS().trim();
            if (text.length() < MIN_TITLE_LENGTH || text.length() > MAX_TITLE_LENGTH) {
                continue;
            }
            // Paragraphs carrying links are only accepted when they are an <h1>.
            if (paragraph.getLinkNum() >= 1 && !"h1".equals(paragraph.getTagName())) {
                continue;
            }
            // NOTE(review): replaceDateTime presumably strips date/time fragments - confirm.
            String normalized = RegexUtil.replaceDateTime(BaseUtil.unescapeHtml(text));
            if (normalized == null || normalized.trim().length() == 0) {
                // String.indexOf("") is always 0, so an empty text would "match" any title.
                continue;
            }
            boolean inHtmlTitle = htmlTitle.indexOf(normalized) >= 0;
            boolean containsAnchor = anchor.length() > 0 && normalized.indexOf(anchor) >= 0;
            if (!inHtmlTitle && !containsAnchor) {
                continue;
            }
            HtmlNodeItem item = new HtmlNodeItem(text, paragraph.getTagName(), paragraph.getNodeID());
            if (inHtmlTitle) {
                item.setScore(item.getScore() + SCORE_BONUS);
            }
            if (containsAnchor) {
                item.setScore(item.getScore() + SCORE_BONUS);
            }
            candidates.add(item);
        }
        return candidates;
    }

    /**
     * Adds the tag-based and length-based score components to {@code htmlNodeItem}: +100 for an
     * {@code h1} or {@code h2} tag, plus the character length of the tag value when it is not
     * {@code null}. A {@code null} item is ignored.
     *
     * <p>NOTE: a decompiler annotation in the original source states that the access modifier
     * of this method was changed from protected.
     *
     * @param htmlNodeItem candidate whose score is updated in place; may be {@code null}
     */
    public void calHtmlNodeItemScore(HtmlNodeItem htmlNodeItem) {
        if (htmlNodeItem == null) {
            return;
        }
        String tagName = htmlNodeItem.getTagName();
        if ("h1".equals(tagName) || "h2".equals(tagName)) {
            htmlNodeItem.setScore(htmlNodeItem.getScore() + SCORE_BONUS);
        }
        String tagValue = htmlNodeItem.getTagValue();
        if (tagValue != null) {
            // Longer matching text scores higher.
            htmlNodeItem.setScore(htmlNodeItem.getScore() + tagValue.length());
        }
    }

    /**
     * Reads the text of the first {@code <title>} element of a DOM document.
     *
     * @param document DOM document to inspect; {@code null} yields an empty string
     * @return the trimmed title value passed through {@code BaseUtil.filterTag}, or {@code ""}
     *         when there is no title element or its value is {@code null}
     */
    protected String extractHtmlTitle(Document document) {
        if (document == null) {
            return "";
        }
        NodeList titleNodes = document.getElementsByTagName("title");
        if (titleNodes.getLength() <= 0) {
            return "";
        }
        String titleValue = NodeUtil.getNodeValue(titleNodes.item(0));
        if (titleValue == null) {
            return "";
        }
        return BaseUtil.filterTag(titleValue.trim());
    }

    /**
     * Returns the trimmed {@code content} attribute of the first element found with
     * {@code name="keywords"} that has such an attribute.
     *
     * @param tagNode root node to search; {@code null} yields an empty string
     * @return the keywords text, or {@code ""} when no matching element carries a
     *         {@code content} attribute
     */
    protected String extractHtmlKeywords(TagNode tagNode) {
        if (tagNode == null) {
            return "";
        }
        for (TagNode metaNode : tagNode.getElementsByAttValue("name", "keywords", true, false)) {
            String content = metaNode.getAttributeByName("content");
            if (content != null) {
                return content.trim();
            }
        }
        return "";
    }
}
