/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.crawl;

import java.io.IOException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Pattern;
import org.apache.gora.mapreduce.GoraMapper;
import org.apache.gora.query.Query;
import org.apache.gora.query.Result;
import org.apache.gora.store.DataStore;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlStatus;
import org.apache.nutch.parse.ParseStatusUtils;
import org.apache.nutch.protocol.ProtocolStatusUtils;
import org.apache.nutch.storage.StorageUtils;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.Bytes;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.TableUtil;
import org.apache.nutch.util.ToolUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class WebTableReader
extends NutchTool
implements Tool {
    public static final Logger LOG = LoggerFactory.getLogger(WebTableReader.class);

    /**
     * Runs the statistics job and, when INFO logging is enabled, logs every
     * entry of the result map as {@code key:<TAB>value}.
     *
     * @param sort passed to {@link #run(Map)} under the {@code "sort"} argument key
     * @throws Exception anything thrown by {@link #run(Map)}
     */
    public void processStatJob(boolean sort) throws Exception {
        if (LOG.isInfoEnabled()) {
            LOG.info("WebTable statistics start");
        }
        this.run(ToolUtil.toArgMap("sort", sort));
        if (!LOG.isInfoEnabled()) {
            return;
        }
        LOG.info("Statistics for WebTable: ");
        for (Map.Entry<?, ?> stat : this.results.entrySet()) {
            LOG.info(stat.getKey() + ":\t" + stat.getValue());
        }
        LOG.info("WebTable statistics: done");
    }

    /**
     * Looks up a single URL in the web store and prints its textual
     * representation to {@code System.out}; prints {@code "<key> not found"}
     * when no row was printed.
     *
     * <p>The query result and the data store are closed in {@code finally}
     * blocks so they are released even when the lookup fails part-way.
     *
     * @param key         the (un-reversed) URL to look up
     * @param dumpContent include the raw content in the output
     * @param dumpHeaders include the protocol headers in the output
     * @param dumpLinks   include in- and outlinks in the output
     * @param dumpText    include the extracted text in the output
     * @throws Exception if the store cannot be created, queried or closed
     */
    private void read(String key, boolean dumpContent, boolean dumpHeaders, boolean dumpLinks, boolean dumpText) throws ClassNotFoundException, IOException, Exception {
        DataStore<String, WebPage> datastore = StorageUtils.createWebStore(this.getConf(), String.class, WebPage.class);
        try {
            Query<String, WebPage> query = datastore.newQuery();
            // Rows are keyed by the reversed form of the URL.
            query.setKey(TableUtil.reverseUrl(key));
            Result<String, WebPage> result = datastore.execute(query);
            try {
                boolean found = false;
                while (result.next()) {
                    try {
                        WebPage page = result.get();
                        String skey = result.getKey();
                        if (page == null || skey == null) {
                            break;
                        }
                        found = true;
                        String url = TableUtil.unreverseUrl(skey);
                        System.out.println(WebTableReader.getPageRepresentation(url, page, dumpContent, dumpHeaders, dumpLinks, dumpText));
                    }
                    catch (Exception e) {
                        // A row that cannot be rendered must not abort the lookup.
                        LOG.error("WebTableReader: failed to print row for " + key, e);
                    }
                }
                if (!found) {
                    System.out.println(key + " not found");
                }
            }
            finally {
                result.close();
            }
        }
        finally {
            datastore.close();
        }
    }

    /**
     * Runs a map-only job that writes the textual representation of every
     * page whose URL matches {@code regex} as text files under {@code output}.
     *
     * <p>The job's success flag is now inspected: a failed job is reported at
     * ERROR level instead of being logged as "done".
     *
     * @param output  directory the dump is written to
     * @param config  not used; the job is built from {@code getConf()}
     * @param regex   regular expression a page URL must match to be dumped
     * @param content include the raw content in the dump
     * @param headers include the protocol headers in the dump
     * @param links   include in- and outlinks in the dump
     * @param text    include the extracted text in the dump
     */
    public void processDumpJob(String output, Configuration config, String regex, boolean content, boolean headers, boolean links, boolean text) throws IOException, ClassNotFoundException, InterruptedException {
        if (LOG.isInfoEnabled()) {
            LOG.info("WebTable dump: starting");
        }
        Path outFolder = new Path(output);
        NutchJob job = new NutchJob(this.getConf(), "db_dump");
        Configuration cfg = job.getConfiguration();
        // Same keys that WebTableRegexMapper.setup() reads back.
        cfg.set(WebTableRegexMapper.regexParamName, regex);
        cfg.setBoolean(WebTableRegexMapper.contentParamName, content);
        cfg.setBoolean(WebTableRegexMapper.headersParamName, headers);
        cfg.setBoolean(WebTableRegexMapper.linksParamName, links);
        cfg.setBoolean(WebTableRegexMapper.textParamName, text);
        DataStore<String, WebPage> store = StorageUtils.createWebStore(cfg, String.class, WebPage.class);
        Query<String, WebPage> query = store.newQuery();
        // Index 0 of _ALL_FIELDS is skipped — presumably an internal bookkeeping
        // field that is not stored; TODO confirm against WebPage.
        String[] fields = Arrays.copyOfRange(WebPage._ALL_FIELDS, 1, WebPage._ALL_FIELDS.length);
        query.setFields(fields);
        GoraMapper.initMapperJob(job, query, store, Text.class, Text.class, WebTableRegexMapper.class, null, true);
        FileOutputFormat.setOutputPath(job, outFolder);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        boolean success = job.waitForCompletion(true);
        if (!success) {
            LOG.error("WebTable dump: job failed, output in " + outFolder + " may be incomplete");
        } else if (LOG.isInfoEnabled()) {
            LOG.info("WebTable dump: done");
        }
    }

    /**
     * Renders a page as a multi-line, tab-separated text block.
     *
     * <p>Always emitted: key, baseUrl, status, fetch/modified times, fetch
     * interval, retries, protocol and parse status, title, score, reprUrl;
     * signatures, markers, batchId and metadata are emitted when non-null.
     * Links, headers, content and text are emitted only when the matching
     * flag is set and the data is non-null.
     *
     * @param key         value printed on the {@code key:} line
     * @param page        the page to render
     * @param dumpContent append content type and raw content
     * @param dumpHeaders append protocol headers
     * @param dumpLinks   append outlinks followed by inlinks
     * @param dumpText    append the extracted text
     * @return the rendered representation
     */
    private static String getPageRepresentation(String key, WebPage page, boolean dumpContent, boolean dumpHeaders, boolean dumpLinks, boolean dumpText) {
        // The buffer never leaves this method, so the unsynchronized builder suffices.
        StringBuilder sb = new StringBuilder();
        WebTableReader.appendField(sb, "key", key);
        WebTableReader.appendField(sb, "baseUrl", page.getBaseUrl());
        sb.append("status:\t").append(page.getStatus()).append(" (").append(CrawlStatus.getName(page.getStatus().byteValue())).append(")\n");
        WebTableReader.appendField(sb, "fetchTime", page.getFetchTime());
        WebTableReader.appendField(sb, "prevFetchTime", page.getPrevFetchTime());
        WebTableReader.appendField(sb, "fetchInterval", page.getFetchInterval());
        WebTableReader.appendField(sb, "retriesSinceFetch", page.getRetriesSinceFetch());
        WebTableReader.appendField(sb, "modifiedTime", page.getModifiedTime());
        WebTableReader.appendField(sb, "prevModifiedTime", page.getPrevModifiedTime());
        WebTableReader.appendField(sb, "protocolStatus", ProtocolStatusUtils.toString(page.getProtocolStatus()));
        ByteBuffer prevSig = page.getPrevSignature();
        if (prevSig != null) {
            WebTableReader.appendField(sb, "prevSignature", StringUtil.toHexString(prevSig));
        }
        ByteBuffer sig = page.getSignature();
        if (sig != null) {
            WebTableReader.appendField(sb, "signature", StringUtil.toHexString(sig));
        }
        WebTableReader.appendField(sb, "parseStatus", ParseStatusUtils.toString(page.getParseStatus()));
        WebTableReader.appendField(sb, "title", page.getTitle());
        WebTableReader.appendField(sb, "score", page.getScore());
        Map<CharSequence, CharSequence> markers = page.getMarkers();
        if (markers != null) {
            for (Map.Entry<CharSequence, CharSequence> marker : markers.entrySet()) {
                sb.append("marker " + marker.getKey().toString()).append(" : \t").append(marker.getValue()).append("\n");
            }
        }
        WebTableReader.appendField(sb, "reprUrl", page.getReprUrl());
        CharSequence batchId = page.getBatchId();
        if (batchId != null) {
            WebTableReader.appendField(sb, "batchId", batchId);
        }
        Map<CharSequence, ByteBuffer> metadata = page.getMetadata();
        if (metadata != null) {
            for (Map.Entry<CharSequence, ByteBuffer> meta : metadata.entrySet()) {
                sb.append("metadata " + meta.getKey().toString()).append(" : \t").append(Bytes.toString(meta.getValue())).append("\n");
            }
        }
        if (dumpLinks) {
            Map<CharSequence, CharSequence> inlinks = page.getInlinks();
            Map<CharSequence, CharSequence> outlinks = page.getOutlinks();
            // Outlinks are listed before inlinks.
            WebTableReader.appendPairs(sb, "outlink", outlinks);
            WebTableReader.appendPairs(sb, "inlink", inlinks);
        }
        if (dumpHeaders) {
            WebTableReader.appendPairs(sb, "header", page.getHeaders());
        }
        ByteBuffer content = page.getContent();
        if (content != null && dumpContent) {
            WebTableReader.appendField(sb, "contentType", page.getContentType());
            sb.append("content:start:\n");
            sb.append(Bytes.toString(content));
            sb.append("\ncontent:end:\n");
        }
        CharSequence text = page.getText();
        if (text != null && dumpText) {
            sb.append("text:start:\n");
            sb.append(text.toString());
            sb.append("\ntext:end:\n");
        }
        return sb.toString();
    }

    /** Appends one {@code label:<TAB>value} line; a null value is rendered as "null". */
    private static void appendField(StringBuilder sb, String label, Object value) {
        sb.append(label).append(":\t").append(String.valueOf(value)).append("\n");
    }

    /** Appends one {@code label:<TAB>key<TAB>value} line per map entry; does nothing for a null map. */
    private static void appendPairs(StringBuilder sb, String label, Map<CharSequence, CharSequence> pairs) {
        if (pairs == null) {
            return;
        }
        for (Map.Entry<CharSequence, CharSequence> pair : pairs.entrySet()) {
            sb.append(label).append(":\t").append(String.valueOf(pair.getKey())).append("\t").append(String.valueOf(pair.getValue())).append("\n");
        }
    }

    /**
     * Command-line entry point: runs this tool through {@link ToolRunner}
     * with a fresh Nutch configuration and exits with the tool's return code.
     *
     * @param args command-line arguments, forwarded unchanged
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        Tool tool = new WebTableReader();
        int exitCode = ToolRunner.run(conf, tool, args);
        System.exit(exitCode);
    }

    public int run(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Usage: WebTableReader (-stats | -url [url] | -dump <out_dir> [-regex regex]) \n \t \t      [-crawlId <id>] [-content] [-headers] [-links] [-text]");
            System.err.println("    -crawlId <id>  - the id to prefix the schemas to operate on, \n \t \t     (default: storage.crawl.id)");
            System.err.println("    -stats [-sort] - print overall statistics to System.out");
            System.err.println("    [-sort]        - list status sorted by host");
            System.err.println("    -url <url>     - print information on <url> to System.out");
            System.err.println("    -dump <out_dir> [-regex regex] - dump the webtable to a text file in \n \t \t     <out_dir>");
            System.err.println("    -content       - dump also raw content");
            System.err.println("    -headers       - dump protocol headers");
            System.err.println("    -links         - dump links");
            System.err.println("    -text          - dump extracted text");
            System.err.println("    [-regex]       - filter on the URL of the webtable entry");
            return -1;
        }
        String param = null;
        boolean content = false;
        boolean links = false;
        boolean text = false;
        boolean headers = false;
        boolean toSort = false;
        String regex = ".+";
        Enum op = null;
        try {
            for (int i = 0; i < args.length; ++i) {
                if (args[i].equals("-url")) {
                    param = args[++i];
                    op = Op.READ;
                    continue;
                }
                if (args[i].equals("-stats")) {
                    op = Op.STAT;
                    continue;
                }
                if (args[i].equals("-sort")) {
                    toSort = true;
                    continue;
                }
                if (args[i].equals("-dump")) {
                    op = Op.DUMP;
                    param = args[++i];
                    continue;
                }
                if (args[i].equals("-content")) {
                    content = true;
                    continue;
                }
                if (args[i].equals("-headers")) {
                    headers = true;
                    continue;
                }
                if (args[i].equals("-links")) {
                    links = true;
                    continue;
                }
                if (args[i].equals("-text")) {
                    text = true;
                    continue;
                }
                if (args[i].equals("-regex")) {
                    regex = args[++i];
                    continue;
                }
                if (!args[i].equals("-crawlId")) continue;
                this.getConf().set("storage.crawl.id", args[++i]);
            }
            if (op == null) {
                throw new Exception("Select one of -url | -stats | -dump");
            }
            switch (1.$SwitchMap$org$apache$nutch$crawl$WebTableReader$Op[op.ordinal()]) {
                case 1: {
                    this.read(param, content, headers, links, text);
                    break;
                }
                case 2: {
                    this.processStatJob(toSort);
                    break;
                }
                case 3: {
                    this.processDumpJob(param, this.getConf(), regex, content, headers, links, text);
                }
            }
            return 0;
        }
        catch (Exception e) {
            LOG.error("WebTableReader: " + StringUtils.stringifyException((Throwable)e));
            return -1;
        }
    }

    /**
     * Runs the {@code db_stats} MapReduce job and folds its output into the
     * tool's result map.
     *
     * <p>The job writes sequence files to a temporary folder below
     * {@code mapred.temp.dir} (default {@code "."}). The partial records in
     * those files are merged per key (maximum for {@code "scx"}, minimum for
     * {@code "scn"}, sum otherwise) and published as {@code "TOTAL urls"},
     * {@code "min score"}, {@code "max score"}, {@code "avg score"}, one entry
     * per status, and any remaining key verbatim. The temporary folder is
     * deleted in every case.
     *
     * <p>An exception thrown while waiting for the job now propagates to the
     * caller; it is no longer discarded by a {@code return} inside
     * {@code finally}.
     *
     * @param args argument map; an optional {@code Boolean} under {@code "sort"}
     *             turns on per-host status counts (defaults to false)
     * @return the result map; when the job is unsuccessful it holds only what
     *         {@code ToolUtil.recordJobStatus} recorded
     * @throws Exception if the job or the reading of its output fails
     */
    @Override
    public Map<String, Object> run(Map<String, Object> args) throws Exception {
        // Parent/child constructor: plain concatenation glued the folder name
        // onto the last path segment of mapred.temp.dir.
        Path tmpFolder = new Path(this.getConf().get("mapred.temp.dir", "."), "stat_tmp" + System.currentTimeMillis());
        this.numJobs = 1;
        this.currentJob = new NutchJob(this.getConf(), "db_stats");
        Configuration jobConf = this.currentJob.getConfiguration();
        jobConf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
        Boolean sort = (Boolean)args.get("sort");
        jobConf.setBoolean("db.reader.stats.sort", sort != null && sort.booleanValue());
        DataStore<String, WebPage> store = StorageUtils.createWebStore(jobConf, String.class, WebPage.class);
        Query<String, WebPage> query = store.newQuery();
        // Index 0 of _ALL_FIELDS is skipped — presumably an internal bookkeeping
        // field that is not stored; TODO confirm against WebPage.
        String[] fields = Arrays.copyOfRange(WebPage._ALL_FIELDS, 1, WebPage._ALL_FIELDS.length);
        query.setFields(fields);
        GoraMapper.initMapperJob(this.currentJob, query, store, Text.class, LongWritable.class, WebTableStatMapper.class, null, true);
        this.currentJob.setCombinerClass(WebTableStatCombiner.class);
        this.currentJob.setReducerClass(WebTableStatReducer.class);
        FileOutputFormat.setOutputPath(this.currentJob, tmpFolder);
        this.currentJob.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.class);
        this.currentJob.setOutputKeyClass(Text.class);
        this.currentJob.setOutputValueClass(LongWritable.class);
        FileSystem fileSystem = FileSystem.get(this.getConf());
        try {
            try {
                this.currentJob.waitForCompletion(true);
            }
            finally {
                ToolUtil.recordJobStatus(null, this.currentJob, this.results);
            }
            if (!this.currentJob.isSuccessful()) {
                return this.results;
            }
            this.publishStats(this.readStatFiles(tmpFolder));
            return this.results;
        }
        finally {
            // Clean up on success, on job failure and on any exception.
            fileSystem.delete(tmpFolder, true);
        }
    }

    /** Reads every sequence file in {@code folder} and merges the records per key (max for "scx", min for "scn", sum otherwise). */
    private Map<String, LongWritable> readStatFiles(Path folder) throws IOException {
        Map<String, LongWritable> stats = new TreeMap<String, LongWritable>();
        SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(this.getConf(), folder);
        try {
            Text key = new Text();
            LongWritable value = new LongWritable();
            for (SequenceFile.Reader reader : readers) {
                while (reader.next(key, value)) {
                    String k = key.toString();
                    long v = value.get();
                    LongWritable acc = stats.get(k);
                    if (acc == null) {
                        // The first value is the max, the min and the sum at once.
                        stats.put(k, new LongWritable(v));
                    } else if (k.equals("scx")) {
                        acc.set(Math.max(acc.get(), v));
                    } else if (k.equals("scn")) {
                        acc.set(Math.min(acc.get(), v));
                    } else {
                        acc.set(acc.get() + v);
                    }
                }
            }
        }
        finally {
            for (SequenceFile.Reader reader : readers) {
                try {
                    reader.close();
                }
                catch (IOException e) {
                    LOG.warn("WebTableReader: could not close statistics reader", e);
                }
            }
        }
        return stats;
    }

    /** Converts the merged counters into the entries of {@code results}; removes the "T" entry from {@code stats}. */
    private void publishStats(Map<String, LongWritable> stats) {
        LongWritable totalCnt = stats.remove("T");
        long total = totalCnt == null ? 0L : totalCnt.get();
        this.results.put("TOTAL urls", total);
        for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
            String k = entry.getKey();
            long v = entry.getValue().get();
            if (k.equals("scn")) {
                // Scores are carried as longs scaled by 1000.
                this.results.put("min score", Float.valueOf((float)v / 1000.0f));
            } else if (k.equals("scx")) {
                this.results.put("max score", Float.valueOf((float)v / 1000.0f));
            } else if (k.equals("sct")) {
                this.results.put("avg score", Float.valueOf((float)((double)v / (double)total / 1000.0)));
            } else if (k.startsWith("status")) {
                String[] st = k.split(" ");
                int code = Integer.parseInt(st[1]);
                if (st.length > 2) {
                    // NOTE(review): keys of the form "status <code> <host>" are
                    // published under the bare host name, so counts for different
                    // status codes of the same host overwrite each other — confirm
                    // this is intended before changing the key format.
                    this.results.put(st[2], v);
                } else {
                    this.results.put(st[0] + " " + code + " (" + CrawlStatus.getName((byte)code) + ")", v);
                }
            } else {
                this.results.put(k, v);
            }
        }
    }

    /*
     * Decompiler artifact: a compiler-generated ("synthetic") lookup table for
     * switch statements over Op. It maps Op ordinals to the case labels
     * READ -> 1, STAT -> 2, DUMP -> 3.
     *
     * NOTE(review): "1" is not a legal Java identifier, so this class cannot be
     * compiled from source as written; in hand-maintained code a plain
     * switch over the enum value makes it unnecessary.
     */
    static class 1 {
        // Indexed by Op.ordinal(); entries left at 0 match no case label.
        static final /* synthetic */ int[] $SwitchMap$org$apache$nutch$crawl$WebTableReader$Op;

        static {
            $SwitchMap$org$apache$nutch$crawl$WebTableReader$Op = new int[Op.values().length];
            // Each constant is registered in its own try block so that a constant
            // missing at run time (NoSuchFieldError) only skips its own entry.
            try {
                1.$SwitchMap$org$apache$nutch$crawl$WebTableReader$Op[Op.READ.ordinal()] = 1;
            }
            catch (NoSuchFieldError ex) {
                // empty catch block
            }
            try {
                1.$SwitchMap$org$apache$nutch$crawl$WebTableReader$Op[Op.STAT.ordinal()] = 2;
            }
            catch (NoSuchFieldError ex) {
                // empty catch block
            }
            try {
                1.$SwitchMap$org$apache$nutch$crawl$WebTableReader$Op[Op.DUMP.ordinal()] = 3;
            }
            catch (NoSuchFieldError noSuchFieldError) {
                // empty catch block
            }
        }
    }

    private static enum Op {
        READ,
        STAT,
        DUMP;

    }

    /**
     * Mapper for the dump job: emits {@code (url, page representation)} for
     * every row whose un-reversed URL fully matches the configured pattern.
     */
    public static class WebTableRegexMapper
    extends GoraMapper<String, WebPage, Text, Text> {
        static final String regexParamName = "webtable.url.regex";
        static final String contentParamName = "webtable.dump.content";
        static final String linksParamName = "webtable.dump.links";
        static final String textParamName = "webtable.dump.text";
        static final String headersParamName = "webtable.dump.headers";
        /** Compiled in {@link #setup}; null until then. */
        private Pattern regex = null;
        private boolean dumpContent;
        private boolean dumpHeaders;
        private boolean dumpLinks;
        private boolean dumpText;

        /**
         * Writes the page when its URL matches. The output key is the
         * un-reversed URL; the representation itself is rendered with the
         * store key as received.
         */
        protected void map(String key, WebPage value, Mapper.Context context) throws IOException, InterruptedException {
            String url = TableUtil.unreverseUrl(key.toString());
            if (!this.regex.matcher(url).matches()) {
                return;
            }
            String rendered = WebTableReader.getPageRepresentation(key, value, this.dumpContent, this.dumpHeaders, this.dumpLinks, this.dumpText);
            context.write(new Text(url), new Text(rendered));
        }

        /** Reads the URL pattern (default {@code ".+"}) and the four dump flags (default false) from the job configuration. */
        protected void setup(Mapper.Context context) throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            this.regex = Pattern.compile(conf.get(regexParamName, ".+"));
            this.dumpContent = conf.getBoolean(contentParamName, false);
            this.dumpHeaders = conf.getBoolean(headersParamName, false);
            this.dumpLinks = conf.getBoolean(linksParamName, false);
            this.dumpText = conf.getBoolean(textParamName, false);
        }
    }

    /**
     * Reducer for the statistics job.
     *
     * <p>Keys and how their values are folded:
     * <ul>
     *   <li>{@code "T"}, {@code "sct"}, {@code "status…"}, {@code "retry…"} — sum</li>
     *   <li>{@code "scx"} — maximum</li>
     *   <li>{@code "scn"} — minimum</li>
     *   <li>{@code "s"} — raw scaled scores, written as {@code "scn"},
     *       {@code "scx"} and {@code "sct"}</li>
     * </ul>
     * Any other key is dropped.
     *
     * <p>The {@code "s"} case covers records that reach the reducer without
     * having passed through the combiner; a combiner is an optional
     * optimisation and previously such scores were silently lost. The same
     * output key may therefore be written more than once, so consumers must
     * merge records per key.
     */
    public static class WebTableStatReducer
    extends Reducer<Text, LongWritable, Text, LongWritable> {
        public void cleanup(Reducer.Context context) {
        }

        protected void reduce(Text key, Iterable<LongWritable> values, Reducer.Context context) throws IOException, InterruptedException {
            String k = key.toString();
            if (k.equals("s")) {
                // Raw scores that were never combined: derive min, max and total here.
                long total = 0L;
                long min = Long.MAX_VALUE;
                long max = Long.MIN_VALUE;
                for (LongWritable score : values) {
                    long v = score.get();
                    min = Math.min(min, v);
                    max = Math.max(max, v);
                    total += v;
                }
                context.write(new Text("scn"), new LongWritable(min));
                context.write(new Text("scx"), new LongWritable(max));
                context.write(new Text("sct"), new LongWritable(total));
            } else if (k.equals("scx")) {
                long max = Long.MIN_VALUE;
                for (LongWritable partial : values) {
                    max = Math.max(max, partial.get());
                }
                context.write(key, new LongWritable(max));
            } else if (k.equals("scn")) {
                long min = Long.MAX_VALUE;
                for (LongWritable partial : values) {
                    min = Math.min(min, partial.get());
                }
                context.write(key, new LongWritable(min));
            } else if (k.equals("T") || k.equals("sct") || k.startsWith("status") || k.startsWith("retry")) {
                long sum = 0L;
                for (LongWritable partial : values) {
                    sum += partial.get();
                }
                context.write(key, new LongWritable(sum));
            }
        }
    }

    /**
     * Combiner for the statistics job.
     *
     * <p>Raw score records (key {@code "s"}) are condensed into a minimum
     * ({@code "scn"}), a maximum ({@code "scx"}) and a total ({@code "sct"}).
     * Because a combiner may be applied to its own output, already condensed
     * {@code "scn"} and {@code "scx"} records are folded with min and max
     * respectively; previously they were summed, which corrupted the score
     * range on a second pass. Every other key is summed.
     */
    public static class WebTableStatCombiner
    extends Reducer<Text, LongWritable, Text, LongWritable> {
        /** Reused output holder for the single-record cases. */
        LongWritable val = new LongWritable();

        public void setup(Reducer.Context context) {
        }

        public void cleanup(Reducer.Context context) {
        }

        public void reduce(Text key, Iterable<LongWritable> values, Reducer.Context context) throws IOException, InterruptedException {
            String k = key.toString();
            if (k.equals("s")) {
                long total = 0L;
                long min = Long.MAX_VALUE;
                long max = Long.MIN_VALUE;
                for (LongWritable score : values) {
                    long v = score.get();
                    min = Math.min(min, v);
                    max = Math.max(max, v);
                    total += v;
                }
                context.write(new Text("scn"), new LongWritable(min));
                context.write(new Text("scx"), new LongWritable(max));
                context.write(new Text("sct"), new LongWritable(total));
            } else if (k.equals("scx")) {
                long max = Long.MIN_VALUE;
                for (LongWritable partial : values) {
                    max = Math.max(max, partial.get());
                }
                this.val.set(max);
                context.write(key, this.val);
            } else if (k.equals("scn")) {
                long min = Long.MAX_VALUE;
                for (LongWritable partial : values) {
                    min = Math.min(min, partial.get());
                }
                this.val.set(min);
                context.write(key, this.val);
            } else {
                long sum = 0L;
                for (LongWritable partial : values) {
                    sum += partial.get();
                }
                this.val.set(sum);
                context.write(key, this.val);
            }
        }
    }

    /**
     * Mapper for the statistics job. For each page it emits a total count
     * ({@code "T"}), a per-status count, a per-retry count and the score
     * scaled by 1000 under {@code "s"}; with sorting enabled it also emits a
     * per-status, per-host count.
     */
    public static class WebTableStatMapper
    extends GoraMapper<String, WebPage, Text, LongWritable> {
        /** Shared "one occurrence" value written for every counter record. */
        LongWritable COUNT_1 = new LongWritable(1L);
        /** Set from {@code db.reader.stats.sort} in {@link #setup}. */
        private boolean sort = false;

        public void setup(Mapper.Context context) {
            Configuration conf = context.getConfiguration();
            this.sort = conf.getBoolean("db.reader.stats.sort", false);
        }

        public void close() {
        }

        protected void map(String key, WebPage value, Mapper.Context context) throws IOException, InterruptedException {
            Text totalKey = new Text("T");
            context.write(totalKey, this.COUNT_1);

            Text statusKey = new Text("status " + value.getStatus());
            context.write(statusKey, this.COUNT_1);

            Text retryKey = new Text("retry " + value.getRetriesSinceFetch());
            context.write(retryKey, this.COUNT_1);

            // The score travels as a long: float score times 1000, truncated.
            long scaledScore = (long)((double)value.getScore().floatValue() * 1000.0);
            context.write(new Text("s"), new LongWritable(scaledScore));

            if (!this.sort) {
                return;
            }
            String host = new URL(TableUtil.unreverseUrl(key.toString())).getHost();
            context.write(new Text("status " + value.getStatus() + " " + host), this.COUNT_1);
        }
    }
}

