package com.gemantic.parser.mytest;

import com.gemantic.parser.impl.NewsParser;
import com.gemantic.parser.model.Article;
import com.gemantic.parser.util.BaseUtil;
import com.gemantic.parser.util.FileUtil;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.springframework.context.support.ClassPathXmlApplicationContext;

/* loaded from: input_file:com/gemantic/parser/mytest/TestListPage.class */
/**
 * Manual test driver that runs the {@link NewsParser} over a directory of saved
 * pages and reports every file that the parser does not classify as page type
 * {@value #LIST_PAGE_TYPE} (logged by this driver as a "List" page).
 *
 * <p>The parser bean is looked up from the Spring context
 * {@code classpath:META-INF/applicationContext_parser.xml} when the driver is
 * constructed.
 */
public class TestListPage {
    /** Number of leading bytes inspected when sniffing the charset from a meta tag. */
    private static final int CHUNK_SIZE = 2000;

    /** Page-type code that this driver treats as the expected ("List") result. */
    private static final int LIST_PAGE_TYPE = 2;

    /** A running summary is logged each time this many files have been parsed. */
    private static final int SUMMARY_INTERVAL = 1000;

    /** How many times the whole directory is processed. */
    private static final int RUN_COUNT = 1;

    /** Charset assumed when the page does not declare one in a meta tag. */
    private static final String DEFAULT_CHARSET = "GBK";

    /** Directory holding the saved pages to classify. */
    private static final String INPUT_DIR = "D:/work/cat/fetch_url/listpage2/";

    /** Spring context that defines the parser bean. */
    private static final String CONTEXT_LOCATION = "classpath:META-INF/applicationContext_parser.xml";

    /** Charset used to decode the head of the page for sniffing. */
    private static final Charset ASCII = Charset.forName("US-ASCII");

    private static final Logger logger = Logger.getLogger(TestListPage.class);

    /** Matches a {@code <meta ... http-equiv=content-type ...>} tag and captures its attributes. */
    private static final Pattern metaPattern =
            Pattern.compile("<meta\\s+([^>]*http-equiv=\"?content-type\"?[^>]*)>", Pattern.CASE_INSENSITIVE);

    /** Captures the charset name out of the attributes matched by {@link #metaPattern}. */
    private static final Pattern charsetPattern =
            Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);

    private NewsParser parser;

    public TestListPage() {
        init();
    }

    /**
     * Loads the {@code NewsParser} bean from the Spring context. A failure is
     * logged together with its stack trace and leaves {@code parser} unset.
     */
    private void init() {
        try {
            this.parser = (NewsParser) new ClassPathXmlApplicationContext(CONTEXT_LOCATION).getBean("NewsParser");
        } catch (Throwable th) {
            // Pass the throwable to the logger so the stack trace reaches the log
            // appenders instead of only stderr.
            logger.error("cant get spring classpath:META-INF/applicationContext_parser.xml", th);
        }
    }

    /**
     * Looks for a charset declaration in a content-type meta tag within the
     * first {@link #CHUNK_SIZE} bytes of a page.
     *
     * @param bArr raw page bytes; may be {@code null}
     * @return the declared charset name exactly as written in the page, or
     *         {@code null} when the input is {@code null} or no declaration is found
     */
    private static String sniffCharacterEncoding(byte[] bArr) {
        if (bArr == null) {
            return null;
        }
        int length = Math.min(bArr.length, CHUNK_SIZE);
        // Decoding as ASCII is enough to find the tag; the Charset overload cannot
        // throw UnsupportedEncodingException.
        String head = new String(bArr, 0, length, ASCII);
        Matcher metaMatcher = metaPattern.matcher(head);
        if (!metaMatcher.find()) {
            return null;
        }
        Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
        return charsetMatcher.find() ? charsetMatcher.group(1) : null;
    }

    /**
     * Logs the number of parsed files and the files that were not classified as
     * list pages.
     */
    private static void logSummary(String banner, int parsedCount, int mismatchCount, CharSequence mismatchFiles) {
        logger.warn(banner);
        logger.warn("totalcount=" + parsedCount);
        logger.warn("total error =" + mismatchCount + "\nerror file=" + mismatchFiles.toString());
    }

    /**
     * Parses every file listed under {@link #INPUT_DIR} and logs which ones were
     * not classified as list pages.
     *
     * @param strArr ignored
     * @throws Exception whatever the file utilities or the parser throw
     */
    public static void main(String[] strArr) throws Exception {
        TestListPage testListPage = new TestListPage();
        for (int run = 0; run < RUN_COUNT; run++) {
            logger.info("process times #####" + run + "#####");
            ArrayList<String> files = new ArrayList<String>();
            FileUtil.readfile2List(INPUT_DIR, files);
            logger.info("total count:" + files.size());

            int mismatchCount = 0;
            int parsedCount = 0;
            StringBuilder mismatchFiles = new StringBuilder();

            for (int index = 0; index < files.size(); index++) {
                String file = files.get(index);
                String charset = sniffCharacterEncoding(FileUtil.getBytesFromFile(file));
                if (charset == null) {
                    charset = DEFAULT_CHARSET;
                }
                logger.info("process item =" + index + ";file=" + file + " ;charset =" + charset);

                // The trailing arguments are passed through unchanged; their meaning
                // is defined by NewsParser.parse, which is not visible here.
                Article article = testListPage.parser.parse(BaseUtil.readFile(file, charset), "", 0L, "中国人民解放");
                if (article == null) {
                    logger.warn("parse or extarct error, item =" + index + ";src = " + file);
                    continue;
                }

                if (article.getPageType() != LIST_PAGE_TYPE) {
                    // Every input is expected to be a list page, so anything else is
                    // recorded as an error.
                    logger.info("item =" + index + ";page type(Detail) =" + article.getPageTypeName());
                    mismatchCount++;
                    mismatchFiles.append(file);
                    mismatchFiles.append("\n");
                } else {
                    logger.info("item =" + index + ";page type(List) =" + article.getPageTypeName());
                }
                logger.info("process finised");

                parsedCount++;
                if (parsedCount % SUMMARY_INTERVAL == 0) {
                    logSummary("===========sum================", parsedCount, mismatchCount, mismatchFiles);
                }
            }
            logSummary("===========total ================", parsedCount, mismatchCount, mismatchFiles);
        }
    }
}
