/*
 * Decompiled with CFR 0.152.
 */
package cn.gtmap.crawler.news.core;

import cn.gtmap.crawler.news.ArticleSave;
import cn.gtmap.crawler.news.CrawlerUtils;
import cn.gtmap.crawler.news.core.BaseLogger;
import cn.gtmap.crawler.news.model.Article;
import com.gtis.common.util.UUIDGenerator;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

public class ArticleCrawlerProcessor
extends BaseLogger
implements PageProcessor {
    private Site site = Site.me().setRetryTimes(3).setSleepTime(3000);

    public void process(Page page) {
        Selectable rootPath = page.getHtml().xpath("//div[@class='newslist']");
        String pageUrl = page.getUrl().get();
        if (this.isNull((Object)rootPath.get())) {
            String title = page.getHtml().xpath("//div[@class='titleinfo']/h1/text()").get();
            String content = page.getHtml().xpath("//article[@class='content_news']").toString();
            content = content.replaceAll("src=\"/", "src=\"http://www.bbsgtj.gov.cn/");
            String date = this.parseDate(page.getHtml().xpath("//div[@class='titleinfo']/tidyText()").get());
            try {
                Article article = new Article();
                article.set("id", (Object)UUIDGenerator.generate());
                article.set("channel", (Object)this.getChannelId(pageUrl));
                if (this.isNotNull((Object)date)) {
                    article.set("date", (Object)date);
                }
                article.set("title", (Object)title);
                article.set("content", (Object)content);
                ArticleSave.getArticleSave().saveArticle(article);
            }
            catch (Exception e) {
                this.logger.error("\u6570\u636e\u5165\u5e93\u5931\u8d25 {}", (Object)e.getLocalizedMessage());
            }
        } else {
            List pageUrls;
            List urls = rootPath.links().all();
            String firstDate = rootPath.xpath("//span[@class='date'][1]/text()").get();
            String firstTitle = rootPath.xpath("//a[1]/text()").get();
            if (Article.dao.findFirst("select * from Article t where t.title=? and t.date=?", new Object[]{firstTitle, firstDate}) != null) {
                page.setSkip(true);
            }
            if (pageUrl.indexOf("&Page=") < 0 && this.isNotNull((Object)(pageUrls = page.getHtml().xpath("//div[@class='page']").links().all())) && pageUrls.size() > 0) {
                pageUrls = CrawlerUtils.expandCrawlerPageLinks((String)((String)pageUrls.get(pageUrls.size() - 1)));
                this.removeDuplicate(pageUrls);
                urls.addAll(pageUrls);
            }
            page.addTargetRequests(urls);
        }
    }

    private String parseDate(String dateStr) {
        if (this.isNotNull((Object)dateStr)) {
            String[] arr = dateStr.split("\uff1a");
            dateStr = arr[1];
            StringBuffer stringBuffer = new StringBuffer(20);
            String pattern = "\\d+";
            Matcher matcher = Pattern.compile(pattern).matcher(dateStr);
            while (matcher.find()) {
                if (stringBuffer.length() > 0) {
                    stringBuffer.append("-");
                }
                stringBuffer.append(matcher.group().trim());
            }
            return stringBuffer.toString();
        }
        return null;
    }

    private String getChannelId(String url) {
        String[] arr = url.split("\\?chn=");
        String part = arr[1];
        return part.split("&")[0];
    }

    private void removeDuplicate(List<String> list) {
        HashSet<String> set = new HashSet<String>(list);
        list.clear();
        list.addAll(set);
    }

    public Site getSite() {
        return this.site;
    }
}

