package com.gemantic.parser.impl;

import com.gemantic.parser.htmltree.GRHtmlTree;
import com.gemantic.parser.model.Article;
import com.gemantic.parser.model.Paragraph;
import com.gemantic.parser.util.ArticleUtil;
import com.gemantic.parser.util.BaseUtil;
import com.gemantic.parser.util.TagNodeUtil;
import org.apache.log4j.Logger;
import org.htmlcleaner.TagNode;
import org.w3c.dom.Document;

/* loaded from: input_file:com/gemantic/parser/impl/PolicyParser.class */
/**
 * {@link NewsParser} subclass that overrides
 * {@link #parse(String, String, long, boolean, String)} to build an
 * {@link Article} from page markup via a {@link GRHtmlTree}.
 */
public class PolicyParser extends NewsParser {
    private static Logger logger = Logger.getLogger(PolicyParser.class);

    /**
     * Parses page markup into an {@link Article}.
     *
     * <p>The article is always seeded with the URL, host, fetch time and (when
     * given) the trimmed anchor text. If any of the tree-building stages fails,
     * that partially filled article is returned as-is after a warning is logged.
     *
     * @param str  page markup handed to {@code getHtmlTagNode}
     * @param str2 page URL; stored on the article and used to derive the host
     * @param j    fetch time stored on the article
     * @param z    not read by this implementation
     * @param str3 anchor text; may be {@code null}, otherwise stored trimmed
     * @return {@code null} when {@code str} or {@code str2} is {@code null};
     *         otherwise the (possibly only partially populated) article
     */
    @Override // com.gemantic.parser.impl.NewsParser
    public Article parse(String str, String str2, long j, boolean z, String str3) {
        if (str == null || str2 == null) {
            return null;
        }

        // Seed the result with everything known before any parsing happens.
        Article result = new Article();
        result.setUrl(str2);
        result.setHost(BaseUtil.getUrlHost(str2));
        result.setFetchTime(j);
        if (str3 != null) {
            result.setAnchor(str3.trim());
        }

        final long startedAt = System.currentTimeMillis();

        // Stage 1: markup -> HtmlCleaner node.
        final TagNode rootNode = getHtmlTagNode(str);
        if (rootNode == null) {
            logger.warn("cannot convert to tagnode:" + str2);
            return result;
        }

        // Stage 2: HtmlCleaner node -> W3C DOM.
        final Document dom = TagNode2Document(rootNode);
        if (dom == null) {
            logger.warn("cannot createDOM:" + str2);
            return result;
        }

        // Stage 3: DOM -> analysis tree.
        final GRHtmlTree tree = new GRHtmlTree();
        if (this.debug) {
            tree.setDebug(this.debug);
        }
        if (!tree.constructTree(dom)) {
            logger.warn("cannot constructTree:" + str2);
            return result;
        }
        tree.calMaxTxtBlock();
        logger.debug("my dom time:" + (System.currentTimeMillis() - startedAt));

        // Stage 4: pick the text node; without one only the HTML title is recorded.
        final Paragraph mainTextNode = chooseTxtNode(result, 0, tree);
        if (mainTextNode == null) {
            result.setHtmlTitle(TagNodeUtil.extractHtmlTitle(rootNode));
            logger.debug("cannot choose txt node:" + str2);
            return result;
        }

        // Stage 5: full extraction, only for page type 1
        // (NOTE(review): presumably the "content page" type - confirm against Article).
        if (result.getPageType() == 1) {
            result.setContentPos(mainTextNode.getNodeID());
            extractNodeTxt(result, tree);

            Article extracted = extrac(result, dom, rootNode, tree);
            if (this.debug) {
                logger.debug(" ======== article1 ========");
                ArticleUtil.printArticleInfo(extracted);
            }

            // Fallbacks, tried in order, while no title position has been found.
            if (extracted.getTitlePos() <= 0) {
                extracted = reExtractArticleUse2ndNode(extracted, dom, rootNode, tree);
            }
            if (extracted.getTitlePos() <= 0) {
                extracted = reExtractArticleTCSameLevel(extracted, dom, rootNode, tree);
            }
            if (extracted.getTitlePos() <= 0
                    && extracted.getContentPos() > 0
                    && extracted.getSiteCatPos() > 0) {
                getTitleReExtactor().extract(
                        extracted, rootNode, tree, htmlListNodeFromContent2Body(extracted, tree));
            }

            result = check(extracted);
        }

        // Stage 6: a type-1 page whose summary paragraph has fewer than 100 words
        // is switched to page type 2 when the tree reports a possible link node.
        if (result.getPageType() == 1) {
            final Paragraph summary = result.getParagraphSum();
            if (summary != null && summary.getWordNum() < 100) {
                tree.calMaxLinkBlock();
                if (tree.getPossibleLinkNode(0) != null) {
                    result.setPageType(2);
                }
            }
        }

        tree.clear();
        logger.debug("extract to end,total time=" + (System.currentTimeMillis() - startedAt));
        return result;
    }
}
