/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.parse;

import com.google.common.util.concurrent.ThreadFactoryBuilder;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.util.Locale;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.crawl.CrawlStatus;
import org.apache.nutch.crawl.Signature;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.fetcher.FetcherJob;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseCallable;
import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.ParseStatusUtils;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.ParserFactory;
import org.apache.nutch.parse.ParserNotFound;
import org.apache.nutch.storage.Mark;
import org.apache.nutch.storage.ParseStatus;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.TableUtil;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Runs the configured parsers over a fetched {@link WebPage} and writes the
 * outcome (parse status, text, title, signature, outlinks, redirect
 * information and the parse mark) back into the page.
 *
 * <p>Configuration keys read by {@link #setConf(Configuration)}:
 * <ul>
 *   <li>{@code parser.timeout} - seconds a single parser may run; {@code -1}
 *       makes {@link #parse(String, WebPage)} call the parser directly on the
 *       calling thread instead of going through the executor</li>
 *   <li>{@code db.max.outlinks.per.page} - maximum outlinks stored per page;
 *       a negative value means unlimited</li>
 *   <li>{@code db.ignore.external.links} - when {@code true}, only outlinks
 *       whose host equals the page's host are stored</li>
 * </ul>
 */
public class ParseUtil extends Configured {
    public static final Logger LOG = LoggerFactory.getLogger(ParseUtil.class);

    /** Default for {@code parser.timeout}, in seconds. */
    private static final int DEFAULT_MAX_PARSE_TIME = 30;

    /** Page status required for parsing; presumably mirrors CrawlStatus.STATUS_FETCHED - TODO confirm. */
    private static final byte STATUS_FETCHED = 2;

    /** Parse-status minor code handled as a redirect; presumably ParseStatusCodes.SUCCESS_REDIRECT - TODO confirm. */
    private static final int SUCCESS_REDIRECT = 100;

    /** Refresh times below this value set the flag passed to chooseRepr; presumably FetcherJob.PERM_REFRESH_TIME - TODO confirm. */
    private static final int PERM_REFRESH_TIME = 5;

    private Configuration conf;
    private Signature sig;
    private URLFilters filters;
    private URLNormalizers normalizers;
    private int maxOutlinks;
    private boolean ignoreExternalLinks;
    private ParserFactory parserFactory;
    private int maxParseTime;
    private ExecutorService executorService;

    /**
     * Creates a parse utility configured from {@code conf}.
     *
     * @param conf configuration to read parser, filter and outlink settings from
     */
    public ParseUtil(Configuration conf) {
        super(conf);
        // The superclass constructor may already have invoked setConf(); calling it
        // again is safe because setConf() reuses an existing executor.
        this.setConf(conf);
    }

    /**
     * Returns the configuration most recently passed to {@link #setConf(Configuration)}.
     */
    public Configuration getConf() {
        return this.conf;
    }

    /**
     * (Re)initialises every configuration-derived collaborator from {@code conf}.
     *
     * <p>The parser thread pool does not depend on the configuration, so it is
     * created on the first call only; repeated calls no longer leave an
     * abandoned executor behind.
     *
     * @param conf configuration to read settings from
     */
    public void setConf(Configuration conf) {
        this.conf = conf;
        this.parserFactory = new ParserFactory(conf);
        this.maxParseTime = conf.getInt("parser.timeout", DEFAULT_MAX_PARSE_TIME);
        this.sig = SignatureFactory.getSignature(conf);
        this.filters = new URLFilters(conf);
        this.normalizers = new URLNormalizers(conf, "outlink");
        int maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
        this.maxOutlinks = maxOutlinksPerPage < 0 ? Integer.MAX_VALUE : maxOutlinksPerPage;
        this.ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
        if (this.executorService == null) {
            this.executorService = Executors.newCachedThreadPool(
                    new ThreadFactoryBuilder().setNameFormat("parse-%d").setDaemon(true).build());
        }
    }

    /**
     * Tries each parser registered for the page's content type, in order, and
     * returns the first successful parse.
     *
     * @param url  URL of the page, used for parser lookup and logging
     * @param page page whose content is parsed
     * @return the first successful parse, or an empty parse carrying a
     *         {@link ParseException} when no parser succeeded
     * @throws ParserNotFound if the parser factory has no parser for the content type
     * @throws ParseException if a parser invoked directly (timeout disabled) fails
     */
    public Parse parse(String url, WebPage page) throws ParserNotFound, ParseException {
        String contentType = TableUtil.toString(page.getContentType());
        Parser[] parsers = this.parserFactory.getParsers(contentType, url);
        for (Parser parser : parsers) {
            LOG.debug("Parsing [{}] with [{}]", url, parser);
            // A timeout of -1 disables the executor and runs the parser inline.
            Parse parse = this.maxParseTime != -1
                    ? this.runParser(parser, url, page)
                    : parser.getParse(url, page);
            if (parse != null && ParseStatusUtils.isSuccess(parse.getParseStatus())) {
                return parse;
            }
        }
        LOG.warn("Unable to successfully parse content {} of type {}", url, contentType);
        return ParseStatusUtils.getEmptyParse(new ParseException("Unable to successfully parse content"), null);
    }

    /**
     * Runs one parser on the executor, waiting at most {@code maxParseTime} seconds.
     *
     * @return the parser's result, or {@code null} if the task failed, timed
     *         out or the waiting thread was interrupted
     */
    private Parse runParser(Parser p, String url, WebPage page) {
        Future<Parse> task = this.executorService.submit(new ParseCallable(p, page, url));
        try {
            return task.get(this.maxParseTime, TimeUnit.SECONDS);
        } catch (InterruptedException e) {
            LOG.warn("Error parsing " + url, e);
            task.cancel(true);
            // Preserve the interrupt so callers further up can observe it.
            Thread.currentThread().interrupt();
        } catch (Exception e) {
            LOG.warn("Error parsing " + url, e);
            task.cancel(true);
        }
        return null;
    }

    /**
     * Parses {@code page} and stores the result in it. Pages whose status is
     * not the fetched status are skipped; parser failures are logged and leave
     * the page untouched.
     *
     * @param key  reversed-URL row key of the page
     * @param page page to parse and update in place
     * @throws NumberFormatException if a redirect parse status carries a
     *         non-numeric refresh time argument
     */
    public void process(String key, WebPage page) {
        String url = TableUtil.unreverseUrl(key);
        byte status = page.getStatus().byteValue();
        if (status != STATUS_FETCHED) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Skipping {} as status is: {}", url, CrawlStatus.getName(status));
            }
            return;
        }

        Parse parse;
        try {
            parse = this.parse(url, page);
        } catch (ParserNotFound e) {
            // No stack trace: an unmapped content type is an expected condition.
            LOG.warn("No suitable parser found: {}", e.getMessage());
            return;
        } catch (Exception e) {
            LOG.warn("Error parsing: {}: {}", url, StringUtils.stringifyException(e));
            return;
        }
        if (parse == null) {
            return;
        }

        ParseStatus pstatus = parse.getParseStatus();
        page.setParseStatus(pstatus);
        if (!ParseStatusUtils.isSuccess(pstatus)) {
            return;
        }
        if (pstatus.getMinorCode() == SUCCESS_REDIRECT) {
            this.processRedirect(url, page, pstatus);
        } else {
            this.processContent(url, page, parse);
        }
    }

    /** Records a redirect reported by the parser as an outlink plus redirect metadata. */
    private void processRedirect(String url, WebPage page, ParseStatus pstatus) {
        String newUrl = ParseStatusUtils.getMessage(pstatus);
        int refreshTime = Integer.parseInt(ParseStatusUtils.getArg(pstatus, 1));
        try {
            newUrl = this.normalizers.normalize(newUrl, "fetcher");
            if (newUrl == null) {
                LOG.warn("redirect normalized to null {}", url);
                return;
            }
            newUrl = this.filters.filter(newUrl);
            if (newUrl == null) {
                LOG.warn("redirect filtered to null {}", url);
                return;
            }
        } catch (MalformedURLException e) {
            LOG.warn("malformed url exception parsing redirect {}", url);
            return;
        } catch (URLFilterException e) {
            // Previously swallowed silently; surface it so dropped redirects are traceable.
            LOG.warn("url filter exception parsing redirect {}: {}", url, e.getMessage());
            return;
        }

        page.getOutlinks().put(new Utf8(newUrl), new Utf8());
        page.getMetadata().put(FetcherJob.REDIRECT_DISCOVERED, TableUtil.YES_VAL);

        // newUrl cannot be null here (both null cases returned above).
        // NOTE(review): a representative URL is only chosen when the redirect
        // target equals the page URL, which looks inverted - confirm the
        // intended condition before changing it.
        if (newUrl.equals(url)) {
            String reprUrl = URLUtil.chooseRepr(url, newUrl, refreshTime < PERM_REFRESH_TIME);
            if (reprUrl == null) {
                LOG.warn("reprUrl==null for {}", url);
                return;
            }
            page.setReprUrl(new Utf8(reprUrl));
        }
    }

    /** Stores text, title, signature and filtered outlinks of a regular parse, then sets the parse mark. */
    private void processContent(String url, WebPage page, Parse parse) {
        page.setText(new Utf8(parse.getText()));
        page.setTitle(new Utf8(parse.getTitle()));

        // Keep the old signature so later stages can compare old and new content.
        ByteBuffer prevSig = page.getSignature();
        if (prevSig != null) {
            page.setPrevSignature(prevSig);
        }
        page.setSignature(ByteBuffer.wrap(this.sig.calculate(page)));

        if (page.getOutlinks() != null) {
            page.getOutlinks().clear();
        }
        Outlink[] outlinks = parse.getOutlinks();
        int outlinksToStore = Math.min(this.maxOutlinks, outlinks.length);
        String fromHost = this.ignoreExternalLinks ? hostOf(url) : null;

        int validCount = 0;
        for (Outlink outlink : outlinks) {
            if (validCount >= outlinksToStore) {
                break;
            }
            String toUrl;
            try {
                toUrl = this.normalizers.normalize(outlink.getToUrl(), "outlink");
                if (toUrl != null) {
                    toUrl = this.filters.filter(toUrl);
                }
            } catch (MalformedURLException ignored) {
                // An unparsable outlink is simply not stored.
                continue;
            } catch (URLFilterException ignored) {
                // A filter failure drops only this outlink.
                continue;
            }
            if (toUrl == null) {
                continue;
            }
            Utf8 utf8ToUrl = new Utf8(toUrl);
            if (page.getOutlinks().get(utf8ToUrl) != null) {
                // Already stored: duplicates do not count towards the limit.
                continue;
            }
            if (this.ignoreExternalLinks) {
                String toHost = hostOf(toUrl);
                if (toHost == null || !toHost.equals(fromHost)) {
                    continue;
                }
            }
            ++validCount;
            page.getOutlinks().put(utf8ToUrl, new Utf8(outlink.getAnchor()));
        }

        Utf8 fetchMark = Mark.FETCH_MARK.checkMark(page);
        if (fetchMark != null) {
            Mark.PARSE_MARK.putMark(page, fetchMark);
        }
    }

    /**
     * Returns the lower-cased host of {@code url}, or {@code null} if the URL
     * is malformed. Lower-casing uses {@link Locale#ROOT} so the comparison
     * does not depend on the JVM's default locale.
     */
    private static String hostOf(String url) {
        try {
            return new URL(url).getHost().toLowerCase(Locale.ROOT);
        } catch (MalformedURLException e) {
            return null;
        }
    }
}

