package com.gemantic.parser.extractor.impl;

import com.gemantic.parser.extractor.ArticleExtractor;
import com.gemantic.parser.htmltree.GRHtmlTree;
import com.gemantic.parser.model.Article;
import com.gemantic.parser.model.Paragraph;
import com.gemantic.parser.util.BaseUtil;
import com.gemantic.parser.util.RegexUtil;
import java.util.ArrayList;
import java.util.List;
import org.apache.log4j.Logger;
import org.htmlcleaner.TagNode;

/* loaded from: input_file:com/gemantic/parser/extractor/impl/ArticleCatExtractor.class */
/**
 * {@link ArticleExtractor} that fills in an article's site category ("siteCat").
 *
 * <p>Short paragraphs are scanned in order; the first one from which
 * {@link RegexUtil#parseCatSentence(String)} yields a non-empty result that is still
 * non-empty after {@link #formatMapCat(String)} becomes the site category. When no
 * paragraph qualifies, {@link #repairCat(Article, List)} retries on the collected short
 * paragraphs that contain a {@code >} or {@code &gt;} separator.
 *
 * <p>NOTE(review): the parsing rules live in {@code RegexUtil} and the emptiness test in
 * {@code BaseUtil}; neither is visible here, so their exact semantics are assumed.
 */
public class ArticleCatExtractor implements ArticleExtractor {
    private static final Logger logger = Logger.getLogger(ArticleCatExtractor.class);

    /** Minimum length (inclusive) of normalized paragraph text considered as a candidate. */
    private static final int MIN_CANDIDATE_LENGTH = 2;

    /** Maximum length (exclusive) of normalized paragraph text considered as a candidate. */
    private static final int MAX_CANDIDATE_LENGTH = 200;

    /**
     * Attempts to set the site category on {@code article} from the given paragraphs.
     *
     * @param article    article to update; also the return value
     * @param tagNode    unused by this extractor
     * @param gRHtmlTree unused by this extractor
     * @param list       paragraphs to scan; elements are cast to {@link Paragraph}.
     *                   Kept as a raw {@code List} to match the overridden signature.
     * @return the same {@code article} instance
     */
    @Override
    public Article extract(Article article, TagNode tagNode, GRHtmlTree gRHtmlTree, List list) {
        extractCat(article, list);
        return article;
    }

    /**
     * Scans the paragraphs for a category sentence and, if none is found directly,
     * falls back to {@link #repairCat(Article, List)}.
     *
     * @return {@code true} when the primary scan (not the fallback) set the category
     */
    private boolean extractCat(Article article, List list) {
        if (list == null) {
            // Nothing to scan; previously this dereferenced null.
            return false;
        }
        List<HtmlNodeItem> candidates = new ArrayList<HtmlNodeItem>();
        boolean found = false;
        for (Object element : list) {
            Paragraph paragraph = (Paragraph) element;
            if (paragraph == null) {
                continue;
            }
            String content = paragraph.getContentS();
            if (content == null) {
                continue;
            }
            String text = normalizeParagraphText(content);
            if (text.length() < MIN_CANDIDATE_LENGTH || text.length() >= MAX_CANDIDATE_LENGTH) {
                continue;
            }
            // Every short paragraph is remembered for the fallback pass, even after a hit.
            candidates.add(new HtmlNodeItem(text, paragraph.getNodeID()));
            if (found) {
                continue;
            }
            String sentence = RegexUtil.parseCatSentence(text);
            if (BaseUtil.emptyStr(sentence)) {
                continue;
            }
            String category = formatMapCat(sentence);
            if (!BaseUtil.emptyStr(category)) {
                found = true;
                article.setSiteCat(category, paragraph.getNodeID());
            }
        }
        repairCat(article, candidates);
        return found;
    }

    /**
     * Trims the text and collapses runs of two or more spaces to one space and runs of
     * two or more tabs to one tab.
     */
    private static String normalizeParagraphText(String content) {
        // The replacement must be a real tab character: in a replaceAll replacement
        // string a backslash escapes the next character, so the two-character sequence
        // backslash + 't' would insert the letter 't' instead of a tab.
        return content.trim().replaceAll(" {2,}", " ").replaceAll("\\t{2,}", "\t");
    }

    /**
     * Normalizes the separators of a category sentence to a single {@code >} and strips
     * spaces and ideographic spaces (U+3000).
     *
     * <p>The substitutions are order-sensitive (longer entity forms are handled before
     * their shorter prefixes) and are applied in the sequence written below.
     *
     * @param str raw category sentence, may be {@code null}
     * @return the normalized text, or {@code null} when {@code str} is {@code null}
     */
    private String formatMapCat(String str) {
        if (str == null) {
            return null;
        }
        String result = str
                .replace("&raquo;", ">")
                .replace("-&gt;", ">")
                .replace("->", ">")
                .replace("&gt;&gt;", ">")
                .replace(">>", ">")
                .replace("&gt; &gt;", ">")
                .replace("&gt;", ">")
                .replace("&gt", ">")
                .replace("→", ">")
                .replace(" - ", ">")
                .replace("|", ">")
                .replace("&nbsp;", " ")
                .trim();
        return result
                .replace(" ", "")
                .replace("\u3000", "")
                .replaceAll(">+", ">");
    }

    /**
     * Fallback used when the article still has no site category: tries the regular
     * parser over all candidates first, then the "special" parser.
     *
     * @param article article to update
     * @param list    short paragraphs collected by the primary scan
     */
    private void repairCat(Article article, List<HtmlNodeItem> list) {
        if (!BaseUtil.emptyStr(article.getSiteCat())) {
            return;
        }
        if (!repairCatWith(article, list, false)) {
            repairCatWith(article, list, true);
        }
    }

    /**
     * Runs one fallback pass over the candidates that contain a {@code >} or
     * {@code &gt;} separator past their first character.
     *
     * @param special {@code true} to use {@link RegexUtil#parseCatSentenceSpecial(String)},
     *                {@code false} to use {@link RegexUtil#parseCatSentence(String)}
     * @return {@code true} as soon as the article reports a non-empty site category
     */
    private boolean repairCatWith(Article article, List<HtmlNodeItem> candidates, boolean special) {
        for (HtmlNodeItem candidate : candidates) {
            String text = candidate.getTagValue();
            if (text == null) {
                continue;
            }
            // "> 0" (not ">= 0"): a separator at the very start does not qualify.
            boolean hasSeparator = text.indexOf("&gt;") > 0 || text.indexOf(">") > 0;
            if (!hasSeparator) {
                continue;
            }
            if (special && logger.isDebugEnabled()) {
                logger.debug("txt222=" + text);
            }
            String singleLine = text.replaceAll("\n", "").replaceAll("\r", "").replaceAll(" {2,}", " ");
            String sentence = special
                    ? RegexUtil.parseCatSentenceSpecial(singleLine)
                    : RegexUtil.parseCatSentence(singleLine);
            if (!BaseUtil.emptyStr(sentence)) {
                article.setSiteCat(formatMapCat(sentence), candidate.getPos());
            }
            if (!BaseUtil.emptyStr(article.getSiteCat())) {
                return true;
            }
        }
        return false;
    }
}
