package com.gemantic.parser.impl;

import com.gemantic.parser.Parser;
import com.gemantic.parser.check.ArticleChecker;
import com.gemantic.parser.extractor.ArticleExtractor;
import com.gemantic.parser.extractor.impl.ArticleTitleReExtractor;
import com.gemantic.parser.htmltree.GRHtmlTree;
import com.gemantic.parser.model.Article;
import com.gemantic.parser.model.HtmlContent;
import com.gemantic.parser.model.LinkBlockItem;
import com.gemantic.parser.model.Paragraph;
import com.gemantic.parser.rule.TagRule;
import com.gemantic.parser.util.ArticleUtil;
import com.gemantic.parser.util.BaseUtil;
import com.gemantic.parser.util.NodeUtil;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.log4j.Logger;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.Utils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.traversal.DocumentTraversal;
import org.w3c.dom.traversal.NodeFilter;
import org.w3c.dom.traversal.NodeIterator;

/* loaded from: input_file:com/gemantic/parser/impl/NewsParser.class */
public class NewsParser implements Parser {
    /** HTML cleaner used by {@link #getHtmlTagNode(String)}; must be supplied via {@link #setCleaner(HtmlCleaner)}. */
    private HtmlCleaner cleaner;
    /** Properties obtained from {@link #cleaner} in {@link #init()}; also handed to the DomSerializer in {@link #TagNode2Document(TagNode)}. */
    private CleanerProperties props;
    /** Fallback title extractor invoked by the main parse method when no title position was found by the other passes. */
    private ArticleTitleReExtractor titleReExtactor;
    /** Value of the {@code file.encoding} system property; used as the charset by {@link #parse(URL)}. */
    public static final String DEFAULT_CHARSET = System.getProperty("file.encoding");
    /** log4j logger for this class. */
    private static Logger logger = Logger.getLogger(NewsParser.class);
    /** Checkers applied in list order by {@link #check(Article)}. */
    private List<ArticleChecker> checkers = new ArrayList();
    /** Extractors applied in list order by the {@code extrac} method. */
    private List<ArticleExtractor> extractors = new ArrayList();
    /** When true, extra debug output is logged and the flag is forwarded to the GRHtmlTree built during parsing. */
    public boolean debug = false;

    /**
     * Fetches the properties object of the injected cleaner, stores it in
     * {@code props} and switches on/off the serialisation options this parser
     * relies on.
     *
     * <p>The cleaner must have been set beforehand; a {@code null} cleaner
     * results in a {@link NullPointerException}.
     */
    public void init() {
        CleanerProperties cleanerProps = this.cleaner.getProperties();
        this.props = cleanerProps;

        // Things to drop from the cleaned output.
        cleanerProps.setOmitComments(true);
        cleanerProps.setOmitXmlDeclaration(true);

        // Features that are explicitly disabled.
        cleanerProps.setTranslateSpecialEntities(false);
        cleanerProps.setRecognizeUnicodeChars(false);
        cleanerProps.setUseCdataForScriptAndStyle(false);
        cleanerProps.setUseEmptyElementTags(false);
    }

    /**
     * Parses a fetched page described by an {@code HtmlContent} object.
     *
     * <p>Forwards the content, URL, fetch time and anchor of {@code htmlContent}
     * to the five-argument {@code parse} overload, with the boolean flag fixed
     * to {@code false}.
     *
     * @param htmlContent the fetched page; must not be {@code null}
     * @return whatever the delegated {@code parse} call returns
     */
    public Article parseContent(HtmlContent htmlContent) {
        return parse(htmlContent.getContent(), htmlContent.getUrl(), htmlContent.getFetchTime(), false, htmlContent.getAnchor());
    }

    /**
     * Downloads the page behind {@code url} and parses it as a navigation page.
     *
     * @param url page location; also used (as a string) as the article URL
     * @param str second argument passed to {@code Utils.readUrl} (the charset name, judging by {@link #parse(URL)})
     * @return the result of {@link #navPageParse(String, String)} on the downloaded text
     * @throws IOException declared for the {@code Utils.readUrl} call
     */
    public Article navPageParse(URL url, String str) throws IOException {
        String html = Utils.readUrl(url, str).toString();
        String pageUrl = url.toString();
        return navPageParse(html, pageUrl);
    }

    /**
     * Parses HTML text as a navigation (link list) page.
     *
     * <p>The returned article always carries the URL, host and page type 2.
     * When the HTML can be cleaned, converted to a DOM and turned into a
     * {@code GRHtmlTree}, the best link block is additionally extracted into
     * the article; otherwise a warning is logged and the bare article is
     * returned.
     *
     * @param str  HTML text of the page
     * @param str2 URL of the page
     * @return the article, or {@code null} when either argument is {@code null}
     */
    public Article navPageParse(String str, String str2) {
        if (str == null || str2 == null) {
            return null;
        }

        Article navArticle = new Article();
        navArticle.setUrl(str2);
        navArticle.setHost(BaseUtil.getUrlHost(str2));
        navArticle.setPageType(2);

        // Step 1: raw HTML -> cleaned tag tree.
        TagNode rootTag = getHtmlTagNode(str);
        if (rootTag == null) {
            logger.warn("cannot convert to tagnode:" + str2);
            return navArticle;
        }

        // Step 2: tag tree -> W3C DOM.
        Document dom = TagNode2Document(rootTag);
        if (dom == null) {
            logger.warn("cannot createDOM:" + str2);
            return navArticle;
        }

        // Step 3: DOM -> analysis tree.
        GRHtmlTree tree = new GRHtmlTree();
        tree.setUrl(str2);
        boolean treeBuilt = tree.constructTree(dom);
        if (!treeBuilt) {
            logger.warn("cannot constructTree:" + str2);
            return navArticle;
        }

        // Step 4: pick the dominant link block and copy it into the article.
        tree.calMaxLinkBlock();
        extractNodeLinkBlock(navArticle, tree);
        return navArticle;
    }

    /**
     * Downloads the page behind {@code url} and parses it, using the current
     * time as the fetch time.
     *
     * @param url page location
     * @param str second argument passed to {@code Utils.readUrl} (the charset name, judging by {@link #parse(URL)})
     * @return the parsed article
     * @throws IOException declared for the {@code Utils.readUrl} call
     */
    public Article parse(URL url, String str) throws IOException {
        String html = Utils.readUrl(url, str).toString();
        String pageUrl = url.toString();
        long fetchedAt = new Date().getTime();
        return parse(html, pageUrl, fetchedAt);
    }

    /**
     * Downloads the page behind {@code url} and parses it with an explicit
     * fetch time.
     *
     * @param url page location
     * @param str second argument passed to {@code Utils.readUrl} (the charset name, judging by {@link #parse(URL)})
     * @param l   fetch time; must not be {@code null} (it is unboxed)
     * @param z   forwarded unchanged to the string-based overload
     * @return the parsed article
     * @throws IOException declared for the {@code Utils.readUrl} call
     */
    public Article parse(URL url, String str, Long l, boolean z) throws IOException {
        String html = Utils.readUrl(url, str).toString();
        String pageUrl = url.toString();
        long fetchedAt = l.longValue();
        return parse(html, pageUrl, fetchedAt, z);
    }

    /**
     * Downloads the page behind {@code url} and parses it with an explicit
     * fetch time and anchor text.
     *
     * @param url  page location
     * @param str  second argument passed to {@code Utils.readUrl} (the charset name, judging by {@link #parse(URL)})
     * @param l    fetch time; must not be {@code null} (it is unboxed)
     * @param z    forwarded unchanged to the string-based overload
     * @param str2 anchor text, forwarded unchanged
     * @return the parsed article
     * @throws IOException declared for the {@code Utils.readUrl} call
     */
    public Article parse(URL url, String str, Long l, boolean z, String str2) throws IOException {
        String html = Utils.readUrl(url, str).toString();
        String pageUrl = url.toString();
        long fetchedAt = l.longValue();
        return parse(html, pageUrl, fetchedAt, z, str2);
    }

    /**
     * Downloads and parses the page behind {@code url}, passing
     * {@link #DEFAULT_CHARSET} as the second argument of
     * {@link #parse(URL, String)}.
     *
     * @param url page location
     * @return the parsed article
     * @throws IOException propagated from {@link #parse(URL, String)}
     */
    public Article parse(URL url) throws IOException {
        return parse(url, DEFAULT_CHARSET);
    }

    /**
     * Parses HTML text that has no known URL; the empty string is passed as
     * the URL to {@link #parse(String, String)}.
     *
     * @param str HTML text of the page
     * @return the parsed article, or {@code null} when {@code str} is {@code null}
     */
    public Article parse(String str) {
        return parse(str, "");
    }

    /**
     * Parses HTML text for the given URL, stamping the article with the
     * current time as its fetch time.
     *
     * @param str  HTML text of the page
     * @param str2 URL of the page
     * @return the result of {@link #parse(String, String, long)}
     */
    public Article parse(String str, String str2) {
        long fetchedAt = new Date().getTime();
        return parse(str, str2, fetchedAt);
    }

    /**
     * Parses HTML text with an explicit fetch time; the boolean flag of the
     * four-argument overload is fixed to {@code false}.
     *
     * @param str  HTML text of the page
     * @param str2 URL of the page
     * @param j    fetch time stored on the article
     * @return the result of {@link #parse(String, String, long, boolean)}
     */
    public Article parse(String str, String str2, long j) {
        return parse(str, str2, j, false);
    }

    /**
     * Parses HTML text with an explicit fetch time and anchor text; the
     * boolean flag of the five-argument overload is fixed to {@code false}.
     *
     * @param str  HTML text of the page
     * @param str2 URL of the page
     * @param j    fetch time stored on the article
     * @param str3 anchor text, forwarded unchanged
     * @return the result of {@link #parse(String, String, long, boolean, String)}
     */
    public Article parse(String str, String str2, long j, String str3) {
        return parse(str, str2, j, false, str3);
    }

    /**
     * Parses HTML text with an explicit fetch time and link-block flag; the
     * empty string is passed as the anchor text.
     *
     * @param str  HTML text of the page
     * @param str2 URL of the page
     * @param j    fetch time stored on the article
     * @param z    forwarded unchanged to the five-argument overload
     * @return the result of {@link #parse(String, String, long, boolean, String)}
     */
    public Article parse(String str, String str2, long j, boolean z) {
        return parse(str, str2, j, z, "");
    }

    /**
     * Main parsing entry point: builds an {@code Article} from HTML text.
     *
     * <p>Processing steps, as implemented below:
     * <ol>
     *   <li>Create the article and fill in URL, host, fetch time and (trimmed) anchor.</li>
     *   <li>Clean the HTML, convert it to a DOM and build a {@code GRHtmlTree}.
     *       If any of these fail, a warning is logged and the partially filled
     *       article is returned.</li>
     *   <li>Choose the main text node. When none is chosen, the article is
     *       returned as is (after link-block extraction when {@code z} is set).</li>
     *   <li>For page type 1, extract the text and run the configured extractors.
     *       While no title position is found, retry with the second-best text
     *       node, then with the same-level strategy, then with the optional
     *       title re-extractor. Finally run the configured checkers.</li>
     *   <li>A page of type 1 whose paragraph summary has fewer than 100 words
     *       and for which a link block exists is re-typed to 2.</li>
     *   <li>For page type 2 and {@code z == true}, extract the link block.</li>
     * </ol>
     *
     * @param str  HTML text of the page
     * @param str2 URL of the page
     * @param j    fetch time stored on the article
     * @param z    when {@code true}, link blocks are extracted for pages of type 2
     * @param str3 anchor text; may be {@code null}, otherwise stored trimmed
     * @return the article, or {@code null} when {@code str} or {@code str2} is {@code null}
     */
    public Article parse(String str, String str2, long j, boolean z, String str3) {
        if (str == null || str2 == null) {
            return null;
        }
        Article article = new Article();
        article.setUrl(str2);
        article.setHost(BaseUtil.getUrlHost(str2));
        article.setFetchTime(j);
        if (str3 != null) {
            article.setAnchor(str3.trim());
        }

        long startMillis = System.currentTimeMillis();
        TagNode rootTag = getHtmlTagNode(str);
        if (rootTag == null) {
            logger.warn("cannot convert to tagnode:" + str2);
            return article;
        }
        Document dom = TagNode2Document(rootTag);
        if (dom == null) {
            logger.warn("cannot createDOM:" + str2);
            return article;
        }
        GRHtmlTree tree = new GRHtmlTree();
        if (this.debug) {
            tree.setDebug(this.debug);
        }
        if (!tree.constructTree(dom)) {
            logger.warn("cannot constructTree:" + str2);
            return article;
        }
        tree.calMaxTxtBlock();
        logger.debug("my dom time:" + (System.currentTimeMillis() - startMillis));

        Paragraph mainTextNode = chooseTxtNode(article, 0, tree);
        if (mainTextNode == null) {
            logger.debug("cannot choose txt node:" + str2);
            if (z) {
                tree.calMaxLinkBlock();
                extractNodeLinkBlock(article, tree);
            }
            return article;
        }

        if (article.getPageType() == 1) {
            article.setContentPos(mainTextNode.getNodeID());
            extractNodeTxt(article, tree);
            Article extracted = extrac(article, dom, rootTag, tree);
            if (this.debug) {
                logger.debug(" ======== article1 ========");
                ArticleUtil.printArticleInfo(extracted);
            }
            // Fallback 1: retry with the second-best text node.
            if (extracted.getTitlePos() <= 0) {
                extracted = reExtractArticleUse2ndNode(extracted, dom, rootTag, tree);
            }
            // Fallback 2: retry with the same-level text extraction strategy.
            if (extracted.getTitlePos() <= 0) {
                extracted = reExtractArticleTCSameLevel(extracted, dom, rootTag, tree);
            }
            // Fallback 3: dedicated title re-extractor. It is setter-injected and
            // therefore optional; skip it when it has not been configured instead
            // of failing with a NullPointerException.
            if (extracted.getTitlePos() <= 0 && extracted.getContentPos() > 0 && extracted.getSiteCatPos() > 0) {
                if (this.titleReExtactor != null) {
                    // NOTE(review): the return value is ignored here (unlike in extrac());
                    // presumably the extractor updates the article in place — confirm.
                    this.titleReExtactor.extract(extracted, rootTag, tree, htmlListNodeFromContent2Body(extracted, tree));
                } else {
                    logger.debug("no title re-extractor configured, skip title re-extraction:" + str2);
                }
            }
            article = check(extracted);
        }

        // A "content" page with very little text but an available link block is
        // re-typed as page type 2.
        if (article.getPageType() == 1) {
            Paragraph summary = article.getParagraphSum();
            if (summary != null && summary.getWordNum() < 100) {
                tree.calMaxLinkBlock();
                if (tree.getPossibleLinkNode(0) != null) {
                    article.setPageType(2);
                }
            }
        }
        if (article.getPageType() == 2 && z) {
            tree.calMaxLinkBlock();
            extractNodeLinkBlock(article, tree);
        }
        tree.clear();
        logger.debug("extract to end,total time=" + (System.currentTimeMillis() - startMillis));
        return article;
    }

    /**
     * Copies the best link block of the tree into the article.
     *
     * <p>The block's text becomes the article content. When the block exposes
     * a link map, every entry is registered on the article via
     * {@code add2LinkPair} and rendered into a small {@code <ul>} snippet that
     * is stored as the article's content tag. Nothing happens when the tree
     * has no link block at index 0.
     *
     * @param article    article to fill
     * @param gRHtmlTree tree on which the link blocks have already been calculated
     */
    public void extractNodeLinkBlock(Article article, GRHtmlTree gRHtmlTree) {
        LinkBlockItem linkBlock = gRHtmlTree.getPossibleLinkNode(0);
        if (linkBlock == null) {
            return;
        }
        article.setContent(linkBlock.getContentS());

        Map<?, ?> links = linkBlock.getLinkPair();
        if (links == null) {
            return;
        }

        StringBuilder html = new StringBuilder();
        html.append("<ul>\n");
        for (Map.Entry<?, ?> link : links.entrySet()) {
            html.append("<li><a href=\"")
                    .append(link.getKey())
                    .append("\">")
                    .append(link.getValue())
                    .append("</a><br>\n");
            article.add2LinkPair((String) link.getKey(), (String) link.getValue());
        }
        html.append("</ul>\n");
        article.setContentTag(html.toString());
    }

    /**
     * Collects statistics for the document's {@code body} element into a fresh
     * paragraph and stores that paragraph as the article's body paragraph.
     *
     * <p>The paragraph is stored even when no {@code body} element exists; in
     * that case it carries no statistics.
     *
     * @param document DOM of the page
     * @param article  article that receives the body paragraph
     * @return {@code true} when a {@code body} element was found and analysed
     */
    protected boolean statBodyInfo(Document document, Article article) {
        Paragraph bodyStats = new Paragraph();
        bodyStats.setReservedEnter(true);
        bodyStats.setReservedTag(true);

        Element body = getElementsByTagName(document, "body");
        boolean bodyFound = body != null;
        if (bodyFound) {
            NodeUtil.ElementStatic(body, bodyStats, article.getUrl());
        }
        article.setParaBody(bodyStats);
        return bodyFound;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Retries extraction on a deep copy of the article using the text node at
     * index 1 of the tree (the second candidate).
     *
     * <p>The retried result replaces the original only when it has a title
     * position, a time position and a site-category position; otherwise the
     * original article is returned untouched.
     *
     * @param article    article produced by the first extraction pass
     * @param document   DOM of the page
     * @param tagNode    cleaned tag tree of the page
     * @param gRHtmlTree analysis tree of the page
     * @return the re-extracted article when it qualifies, else {@code article}
     */
    public Article reExtractArticleUse2ndNode(Article article, Document document, TagNode tagNode, GRHtmlTree gRHtmlTree) {
        Article candidate = article.deepCopy();
        Paragraph secondTextNode = chooseTxtNode(candidate, 1, gRHtmlTree);
        if (secondTextNode == null) {
            return article;
        }

        candidate.setContentPos(secondTextNode.getNodeID());
        extractNodeTxt(candidate, gRHtmlTree);
        Article reExtracted = extrac(candidate, document, tagNode, gRHtmlTree);
        if (this.debug) {
            logger.debug(" ======== article2 2nd node ========");
            ArticleUtil.printArticleInfo(reExtracted);
        }

        boolean complete = reExtracted.getTitlePos() > 0
                && reExtracted.getTimePos() > 0
                && reExtracted.getSiteCatPos() > 0;
        if (!complete) {
            return article;
        }
        if (this.debug) {
            logger.debug(" ===== use article2 as the aritcle========");
        }
        return reExtracted;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Retries extraction on a deep copy of the article with the same-level
     * variant of text extraction ({@code extractNodeTxt} with its flag set).
     *
     * <p>The retried result replaces the original only when it has a title
     * position and at least one of time position or site-category position.
     *
     * @param article    article produced by the earlier extraction passes
     * @param document   DOM of the page
     * @param tagNode    cleaned tag tree of the page
     * @param gRHtmlTree analysis tree of the page
     * @return the re-extracted article when it qualifies, else {@code article}
     */
    public Article reExtractArticleTCSameLevel(Article article, Document document, TagNode tagNode, GRHtmlTree gRHtmlTree) {
        Article candidate = article.deepCopy();
        extractNodeTxt(candidate, true, gRHtmlTree);
        Article reExtracted = extrac(candidate, document, tagNode, gRHtmlTree);
        if (this.debug) {
            logger.debug(" ======== article3 TC same level========");
            ArticleUtil.printArticleInfo(reExtracted);
        }

        if (reExtracted.getTitlePos() <= 0) {
            return article;
        }
        boolean hasTimeOrCategory = reExtracted.getTimePos() > 0 || reExtracted.getSiteCatPos() > 0;
        if (!hasTimeOrCategory) {
            return article;
        }
        if (this.debug) {
            logger.debug(" ===== use article3 as the aritcle========");
        }
        return reExtracted;
    }

    /**
     * Returns the first element of the document with the given tag name.
     *
     * @param document DOM to search
     * @param str      tag name to look for
     * @return the first matching element in document order, or {@code null} when there is none
     */
    protected Element getElementsByTagName(Document document, String str) {
        NodeList matches = document.getElementsByTagName(str);
        if (matches.getLength() > 0) {
            Node first = matches.item(0);
            if (first != null) {
                return (Element) first;
            }
        }
        return null;
    }

    /**
     * Cleans raw HTML into a {@code TagNode} tree using the configured cleaner.
     *
     * <p>This is a best-effort operation: any exception raised while cleaning
     * (including a {@link NullPointerException} from a missing cleaner) is
     * logged through the class logger and reported to the caller as
     * {@code null}.
     *
     * @param str raw HTML text
     * @return the root tag node, or {@code null} when cleaning failed
     */
    public TagNode getHtmlTagNode(String str) {
        try {
            return this.cleaner.clean(str);
        } catch (Exception e) {
            // Report through the logger (with stack trace) rather than stdout so the
            // failure ends up in the same place as the parser's other diagnostics.
            logger.error("Error in NewsParser.getHtmlTagNode()", e);
            return null;
        }
    }

    /**
     * Converts a cleaned tag tree into a W3C DOM document via
     * {@code DomSerializer}, using the properties captured in {@code init()}.
     *
     * <p>This is a best-effort operation: any {@link Throwable} raised during
     * conversion is logged through the class logger and reported to the caller
     * as {@code null}.
     *
     * @param tagNode root of the cleaned tag tree
     * @return the DOM document, or {@code null} when conversion failed
     */
    public Document TagNode2Document(TagNode tagNode) {
        try {
            return new DomSerializer(this.props, false).createDOM(tagNode);
        } catch (Throwable th) {
            // Deliberately broad, as in the original: callers treat null as
            // "could not build a DOM" and carry on with a partial article.
            logger.error("Convert error in NewsParser.TagNode2Document:", th);
            return null;
        }
    }

    /**
     * Flattens the element nodes of a document into a list, in the order the
     * DOM node iterator delivers them (starting at the document element).
     *
     * <p>Only element nodes are included ({@code NodeFilter.SHOW_ELEMENT});
     * text, comment and other node types are skipped. The document
     * implementation must support {@link DocumentTraversal}.
     *
     * @param document DOM to flatten
     * @return a new list of the element nodes
     */
    public List DocElement2List(Document document) {
        DocumentTraversal traversal = (DocumentTraversal) document;
        NodeIterator elements = traversal.createNodeIterator(
                document.getDocumentElement(), NodeFilter.SHOW_ELEMENT, null, true);

        List<Node> collected = new ArrayList<Node>();
        for (Node current = elements.nextNode(); current != null; current = elements.nextNode()) {
            collected.add(current);
        }
        return collected;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Picks the {@code i}-th text-node candidate of the tree and sets the
     * article's page type accordingly.
     *
     * <p>The page type becomes 2 and {@code null} is returned when the tree has
     * no such candidate, or when the candidate's node id lies beyond 95% of
     * the tree's node count. Otherwise the page type becomes 1 and the
     * candidate is returned.
     *
     * @param article    article whose page type is updated
     * @param i          index of the candidate to try (0 = first)
     * @param gRHtmlTree analysis tree of the page
     * @return the accepted candidate, or {@code null}
     */
    public Paragraph chooseTxtNode(Article article, int i, GRHtmlTree gRHtmlTree) {
        Paragraph candidate = gRHtmlTree.getPossibleNode(i);
        if (candidate == null) {
            article.setPageType(2);
            return null;
        }

        // Guard against a zero/negative node count so the ratio below is well defined.
        int totalNodes = Math.max(gRHtmlTree.getCount(), 1);
        double relativePos = (candidate.getNodeID() * 1.0d) / totalNodes;
        logger.info("choose txt:" + article.getUrl() + "; total node =" + totalNodes + ";" + i + "txt pos=" + candidate.getNodeID() + ";pos% =" + relativePos);

        boolean nearEndOfPage = relativePos > 0.95d;
        article.setPageType(nearEndOfPage ? 2 : 1);
        return nearEndOfPage ? null : candidate;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Extracts the paragraphs of the article's content node with the
     * same-level flag of the three-argument overload switched off.
     *
     * @param article    article whose content position identifies the node to read
     * @param gRHtmlTree analysis tree of the page
     */
    public void extractNodeTxt(Article article, GRHtmlTree gRHtmlTree) {
        extractNodeTxt(article, false, gRHtmlTree);
    }

    /**
     * Rebuilds the article's paragraph list from the direct children of the
     * tree node stored at the article's content position.
     *
     * <p>Nothing happens when that node is missing or has no children.
     * Otherwise the existing paragraphs are cleared, each child is examined
     * (element and non-blank text children may become paragraphs, subject to
     * the link-density and position rules commented below), the paragraphs are
     * summed via {@code ArticleUtil.sumArticleParagraph} and the page type is
     * re-evaluated via {@link #htmlPageType(Article)}.
     *
     * @param article    article to fill; its content position selects the node
     * @param z          when {@code true}, element children are skipped until the
     *                   first {@code br}/{@code p} element or non-blank text node
     *                   is seen, and the content position is moved to the first
     *                   such {@code br}/{@code p} element if the tree knows it
     * @param gRHtmlTree analysis tree of the page
     */
    protected void extractNodeTxt(Article article, boolean z, GRHtmlTree gRHtmlTree) {
        NodeList childNodes;
        int length;
        Element element = (Element) gRHtmlTree.getItemDataByIndex(article.getContentPos());
        if (element != null && (length = (childNodes = element.getChildNodes()).getLength()) >= 1) {
            article.clearParagraph();
            // Index of the most recently accepted text node (-1 = none yet).
            int i = -1;
            // In same-level mode (z): set once the first br/p element or text node was seen.
            boolean z2 = false;
            // In same-level mode (z): tree index of the first br/p element (-1 = not looked up yet).
            int i2 = -1;
            for (int i3 = 0; i3 < length; i3++) {
                Node item = childNodes.item(i3);
                switch (item.getNodeType()) {
                    // Node type 1 is an element node (Node.ELEMENT_NODE).
                    // NOTE(review): the GRHtmlTree constant used as the label looks like a
                    // decompiler substitution for the literal 1 — confirm.
                    case GRHtmlTree.POSSIBILTY_SECOND /* 1 */:
                        if (z) {
                            if ("br".equals(item.getNodeName()) || "p".equals(item.getNodeName())) {
                                z2 = true;
                                if (i2 < 0) {
                                    i2 = gRHtmlTree.findItemData(item);
                                    if (i2 > 0) {
                                        article.setContentPos(i2);
                                    }
                                }
                            }
                            // Still before the first br/p/text child: ignore this element.
                            if (!z2) {
                                break;
                            }
                        }
                        Paragraph paragraph = new Paragraph();
                        paragraph.setTagName(item.getNodeName());
                        NodeUtil.ElementStatic((Element) childNodes.item(i3), paragraph, article.getUrl());
                        // Relative position of this child among its siblings (0.0 .. <1.0).
                        double d = (i3 * 1.0d) / length;
                        // Link-text length relative to getWordCNum(); the divisor falls back to 1
                        // when getWordCNum() is 2 or less.
                        double linkWordLen = paragraph.getLinkWordLen() / (paragraph.getWordCNum() > 2 ? paragraph.getWordCNum() : 1.0d);
                        if (!"br".equals(item.getNodeName()) && !"p".equals(item.getNodeName())) {
                            if ("a".equals(item.getNodeName())) {
                                // A bare link is kept only when it directly follows an accepted text node.
                                if (i3 == i + 1) {
                                    article.addParagraph(paragraph);
                                    break;
                                } else {
                                    break;
                                }
                            } else if (d < 0.8d) {
                                // First 80% of the children: drop link-heavy elements unless they are
                                // strong/b/center.
                                if (linkWordLen >= 0.5d && !"strong".equals(item.getNodeName()) && !"b".equals(item.getNodeName()) && !"center".equals(item.getNodeName())) {
                                    break;
                                } else {
                                    article.addParagraph(paragraph);
                                    break;
                                }
                            } else if (linkWordLen < 0.5d && ("strong".equals(item.getNodeName()) || "b".equals(item.getNodeName()))) {
                                // Last 20% of the children: only strong/b elements that are not link-heavy are kept.
                                article.addParagraph(paragraph);
                                break;
                            }
                        } else if (article.getParagraphs().size() <= 3) {
                            // br/p while at most 3 paragraphs were collected: require link ratio below 0.8.
                            if (linkWordLen < 0.8d) {
                                article.addParagraph(paragraph);
                                break;
                            } else {
                                break;
                            }
                        } else {
                            // br/p once more than 3 paragraphs were collected: always kept.
                            article.addParagraph(paragraph);
                            break;
                        }
                        break;
                    // Node type 3 is a text node (Node.TEXT_NODE).
                    case 3:
                        // Only text nodes with non-blank content become paragraphs.
                        if (item.getNodeValue() != null && !"".equals(item.getNodeValue().trim())) {
                            if (z) {
                                z2 = true;
                            }
                            Paragraph paragraph2 = new Paragraph();
                            paragraph2.setTagName(item.getNodeName());
                            NodeUtil.TextNodeStatic(item, paragraph2);
                            article.addParagraph(paragraph2);
                            // Remember the position so a directly following <a> can be accepted.
                            i = i3;
                            break;
                        }
                        break;
                }
            }
            ArticleUtil.sumArticleParagraph(article);
            htmlPageType(article);
        }
    }

    /**
     * Re-types the article as page type 2 when its paragraph summary does not
     * look like running text.
     *
     * <p>Page type 2 is set when any of the following holds for the summary:
     * fewer than 25 words; link-text length above half of
     * {@code getWordCNum()}; {@code getWordCNum()} below 300 together with
     * {@code getDotCNum()} below 2; or more than 10 anchors whose average
     * link-text length exceeds 8. Articles without a paragraph summary are
     * left unchanged.
     *
     * @param article article to inspect and possibly re-type
     */
    public static void htmlPageType(Article article) {
        Paragraph summary = article.getParagraphSum();
        if (summary == null) {
            return;
        }

        boolean notRunningText =
                summary.getWordNum() < 25
                || (summary.getLinkWordLen() * 1.0d) / summary.getWordCNum() > 0.5d
                || (summary.getWordCNum() < 300 && summary.getDotCNum() < 2);
        if (notRunningText) {
            article.setPageType(2);
        }

        // Treat "no anchors" as one anchor so the average below is well defined.
        int anchorCount = Math.max(summary.getANum(), 1);
        if (anchorCount > 10 && summary.getLinkWordLen() / anchorCount > 8) {
            article.setPageType(2);
        }
    }

    /**
     * Returns the text content of an HTML fragment with all markup removed.
     *
     * <p>The fragment is cleaned with {@link #getHtmlTagNode(String)} and the
     * text of the resulting root node is returned. A {@code null} input, a
     * cleaning failure or a missing text buffer all yield the empty string.
     *
     * @param str HTML fragment; may be {@code null}
     * @return the inner text, never {@code null}
     */
    public String getInnerTxt(String str) {
        if (str == null) {
            return "";
        }
        TagNode htmlTagNode = getHtmlTagNode(str);
        if (htmlTagNode == null) {
            // Use the class logger instead of stdout, like the rest of the parser.
            logger.warn("in NewsParser.getInnerTxt error.");
            return "";
        }
        StringBuffer text = htmlTagNode.getText();
        return text != null ? text.toString() : "";
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Runs the article through every configured checker, in list order, each
     * checker receiving the result of the previous one.
     *
     * @param article article to check; may be {@code null}
     * @return the result of the last checker, or {@code article} itself when it
     *         is {@code null} or no checkers are configured
     */
    public Article check(Article article) {
        boolean nothingToDo = article == null || this.checkers == null || this.checkers.isEmpty();
        if (nothingToDo) {
            return article;
        }
        Article checked = article;
        for (ArticleChecker checker : this.checkers) {
            checked = checker.check(checked);
        }
        return checked;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Runs the article through every configured extractor, in list order, each
     * extractor receiving the result of the previous one.
     *
     * <p>The candidate node list between the content node and {@code body} is
     * computed once, from the incoming article, and shared by all extractors.
     *
     * @param article    article to enrich; may be {@code null}
     * @param document   DOM of the page (not used by this method itself)
     * @param tagNode    cleaned tag tree of the page
     * @param gRHtmlTree analysis tree of the page
     * @return the result of the last extractor, or {@code article} itself when
     *         it is {@code null} or no extractors are configured
     */
    public Article extrac(Article article, Document document, TagNode tagNode, GRHtmlTree gRHtmlTree) {
        boolean nothingToDo = article == null || this.extractors == null || this.extractors.isEmpty();
        if (nothingToDo) {
            return article;
        }
        List<Paragraph> candidateNodes = htmlListNodeFromContent2Body(article, gRHtmlTree);
        Article extracted = article;
        for (ArticleExtractor extractor : this.extractors) {
            extracted = extractor.extract(extracted, tagNode, gRHtmlTree, candidateNodes);
        }
        return extracted;
    }

    /**
     * Collects short text blocks that precede the article's content node.
     *
     * <p>Walks the tree backwards from the node just before the content
     * position down to index 1, stopping at the {@code body} element. A node
     * is considered when {@code TagRule} accepts its tag name as a title child
     * node or when it has no child element. Its statistics are gathered into a
     * paragraph, and the paragraph is kept when its whitespace-normalised text
     * is between 2 and 199 characters long.
     *
     * <p>Tree slots that are empty or do not hold an element are skipped.
     *
     * @param article    article whose content position is the starting point
     * @param gRHtmlTree analysis tree of the page
     * @return the collected paragraphs, nearest to the content node first; never {@code null}
     */
    public List<Paragraph> htmlListNodeFromContent2Body(Article article, GRHtmlTree gRHtmlTree) {
        List<Paragraph> candidates = new ArrayList<Paragraph>();
        for (int nodeId = article.getContentPos() - 1; nodeId > 0; nodeId--) {
            Node node = gRHtmlTree.getItemDataByIndex(nodeId);
            // Only elements can be analysed below; skip empty slots and other node
            // kinds instead of failing with NullPointerException/ClassCastException.
            if (!(node instanceof Element)) {
                continue;
            }
            String nodeName = node.getNodeName();
            if ("body".equals(nodeName)) {
                break;
            }
            Element firstChildElement = NodeUtil.getFirstChildElement(node);
            if (!TagRule.getInstance().TitleChildNode(nodeName) && firstChildElement != null) {
                continue;
            }

            Paragraph paragraph = new Paragraph();
            paragraph.setReservedEnter(true);
            paragraph.setReservedTag(false);
            NodeUtil.ElementStatic((Element) node, paragraph, article.getUrl());

            // Collapse runs of spaces/tabs before measuring the block. The tab
            // replacement must be a real tab character: in a regex replacement
            // string "\\t" means "escaped t", i.e. the letter 't'.
            String blockText = paragraph.getContentS()
                    .replaceAll(" {2,}", " ")
                    .replaceAll("\\t{2,}", "\t")
                    .trim();
            int blockLength = blockText.length();
            if (blockLength < 2 || blockLength >= 200) {
                continue;
            }
            if (this.debug) {
                logger.debug("nArrayId=" + nodeId + ";node=" + nodeName + "; len=" + blockLength + "; BlockContent=" + blockText);
            }
            paragraph.setNodeID(nodeId);
            paragraph.setTagName(nodeName);
            candidates.add(paragraph);
        }
        return candidates;
    }

    /**
     * Switches the extra debug logging of this parser on or off.
     *
     * @param z new value of the {@code debug} flag
     */
    public void setDebug(boolean z) {
        this.debug = z;
    }

    /**
     * @return the HTML cleaner currently configured, or {@code null} when none has been set
     */
    public HtmlCleaner getCleaner() {
        return this.cleaner;
    }

    /**
     * Sets the HTML cleaner. This only stores the reference; the cleaner's
     * properties are read and adjusted when {@link #init()} is called.
     *
     * @param htmlCleaner cleaner to use for all subsequent parsing
     */
    public void setCleaner(HtmlCleaner htmlCleaner) {
        this.cleaner = htmlCleaner;
    }

    /**
     * @return the live list of checkers held by this parser (not a copy)
     */
    public List<ArticleChecker> getCheckers() {
        return this.checkers;
    }

    /**
     * Replaces the list of checkers. The list is stored as given (no copy);
     * {@code null} or an empty list makes {@link #check(Article)} a no-op.
     *
     * @param list checkers to apply, in order
     */
    public void setCheckers(List<ArticleChecker> list) {
        this.checkers = list;
    }

    /**
     * @return the live list of extractors held by this parser (not a copy)
     */
    public List<ArticleExtractor> getExtractors() {
        return this.extractors;
    }

    /**
     * Replaces the list of extractors. The list is stored as given (no copy);
     * {@code null} or an empty list makes the {@code extrac} method return its
     * input unchanged.
     *
     * @param list extractors to apply, in order
     */
    public void setExtractors(List<ArticleExtractor> list) {
        this.extractors = list;
    }

    /**
     * @return the fallback title extractor, or {@code null} when none has been set
     */
    public ArticleTitleReExtractor getTitleReExtactor() {
        return this.titleReExtactor;
    }

    /**
     * Sets the fallback title extractor that the main parse method invokes
     * when the other extraction passes found no title position.
     *
     * @param articleTitleReExtractor extractor to use
     */
    public void setTitleReExtactor(ArticleTitleReExtractor articleTitleReExtractor) {
        this.titleReExtactor = articleTitleReExtractor;
    }
}
