package cn.gtmap.crawler.news.core;

import cn.gtmap.crawler.news.ArticleSave;
import cn.gtmap.crawler.news.CrawlerUtils;
import cn.gtmap.crawler.news.model.Article;
import com.gtis.common.util.UUIDGenerator;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.solr.response.RawResponseWriter;
import org.springframework.beans.factory.BeanFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

/* loaded from: input_file:WEB-INF/classes/cn/gtmap/crawler/news/core/ArticleCrawlerProcessor.class */
public class ArticleCrawlerProcessor extends BaseLogger implements PageProcessor {
    private Site site = Site.me().setRetryTimes(3).setSleepTime(3000);

    @Override // us.codecraft.webmagic.processor.PageProcessor
    public void process(Page page) {
        Selectable xpath = page.getHtml().xpath("//div[@class='newslist']");
        String str = page.getUrl().get();
        if (!isNull(xpath.get())) {
            List<String> all = xpath.links().all();
            String str2 = xpath.xpath("//span[@class='date'][1]/text()").get();
            if (Article.dao.findFirst("select * from Article t where t.title=? and t.date=?", xpath.xpath("//a[1]/text()").get(), str2) != null) {
                page.setSkip(true);
            }
            if (str.indexOf("&Page=") < 0) {
                List<String> all2 = page.getHtml().xpath("//div[@class='page']").links().all();
                if (isNotNull(all2) && all2.size() > 0) {
                    List<String> expandCrawlerPageLinks = CrawlerUtils.expandCrawlerPageLinks(all2.get(all2.size() - 1));
                    removeDuplicate(expandCrawlerPageLinks);
                    all.addAll(expandCrawlerPageLinks);
                }
            }
            page.addTargetRequests(all);
            return;
        }
        String str3 = page.getHtml().xpath("//div[@class='titleinfo']/h1/text()").get();
        String replaceAll = page.getHtml().xpath("//article[@class='content_news']").toString().replaceAll("src=\"/", "src=\"http://www.bbsgtj.gov.cn/");
        String parseDate = parseDate(page.getHtml().xpath("//div[@class='titleinfo']/tidyText()").get());
        try {
            Article article = new Article();
            article.set("id", UUIDGenerator.generate());
            article.set("channel", getChannelId(str));
            if (isNotNull(parseDate)) {
                article.set("date", parseDate);
            }
            article.set("title", str3);
            article.set(RawResponseWriter.CONTENT, replaceAll);
            ArticleSave.getArticleSave().saveArticle(article);
        } catch (Exception e) {
            this.logger.error("数据入库失败 {}", e.getLocalizedMessage());
        }
    }

    private String parseDate(String str) {
        if (!isNotNull(str)) {
            return null;
        }
        String str2 = str.split("：")[1];
        StringBuffer stringBuffer = new StringBuffer(20);
        Matcher matcher = Pattern.compile("\\d+").matcher(str2);
        while (matcher.find()) {
            if (stringBuffer.length() > 0) {
                stringBuffer.append("-");
            }
            stringBuffer.append(matcher.group().trim());
        }
        return stringBuffer.toString();
    }

    private String getChannelId(String str) {
        return str.split("\\?chn=")[1].split(BeanFactory.FACTORY_BEAN_PREFIX)[0];
    }

    private void removeDuplicate(List<String> list) {
        HashSet hashSet = new HashSet(list);
        list.clear();
        list.addAll(hashSet);
    }

    @Override // us.codecraft.webmagic.processor.PageProcessor
    public Site getSite() {
        return this.site;
    }
}
