/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.crawl;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.FetchScheduleFactory;
import org.apache.nutch.crawl.GeneratorMapper;
import org.apache.nutch.crawl.GeneratorReducer;
import org.apache.nutch.crawl.URLPartitioner;
import org.apache.nutch.storage.StorageUtils;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.ToolUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class GeneratorJob
extends NutchTool
implements Tool {
    // NOTE(review): the next two keys are not referenced anywhere in this file;
    // presumably they are read by the mapper/reducer or other tools - verify before changing.
    public static final String GENERATE_UPDATE_CRAWLDB = "generate.update.crawldb";
    public static final String GENERATOR_MIN_SCORE = "generate.min.score";
    /** Configuration key set by {@code run(Map)} from the "filter" argument. */
    public static final String GENERATOR_FILTER = "generate.filter";
    /** Configuration key set by {@code run(Map)} from the "normalize" argument. */
    public static final String GENERATOR_NORMALISE = "generate.normalise";
    // NOTE(review): not referenced in this file; presumably consumed elsewhere - verify.
    public static final String GENERATOR_MAX_COUNT = "generate.max.count";
    /**
     * Configuration key read by {@code run(Map)} (default "host") to choose the
     * value written to "partition.url.mode".
     */
    public static final String GENERATOR_COUNT_MODE = "generate.count.mode";
    /** Count-mode value that makes {@code run(Map)} select "byDomain" partitioning. */
    public static final String GENERATOR_COUNT_VALUE_DOMAIN = "domain";
    /** Count-mode value that makes {@code run(Map)} select "byHost" partitioning; also the fallback. */
    public static final String GENERATOR_COUNT_VALUE_HOST = "host";
    /**
     * Declared count-mode value; {@code run(Map)} in this file has no branch for it,
     * so it is handled like any unknown mode (warning, then fallback to host).
     */
    public static final String GENERATOR_COUNT_VALUE_IP = "ip";
    /** Configuration key set by {@code run(Map)} when a "topN" argument is supplied. */
    public static final String GENERATOR_TOP_N = "generate.topN";
    /** Configuration key always set by {@code run(Map)} to the effective current time in milliseconds. */
    public static final String GENERATOR_CUR_TIME = "generate.curTime";
    // NOTE(review): the next two keys are not referenced in this file; presumably consumed elsewhere - verify.
    public static final String GENERATOR_DELAY = "crawl.gen.delay";
    public static final String GENERATOR_RANDOM_SEED = "generate.partition.seed";
    /** Configuration key holding the batch id; also used as a key in the results map of {@code run(Map)}. */
    public static final String BATCH_ID = "generate.batch.id";
    /** Results-map key under which {@code run(Map)} stores the "Generator"/"GENERATE_MARK" counter value. */
    public static final String GENERATE_COUNT = "generate.count";
    /** Base set of page fields; filled in the static initializer and copied by {@code getFields}. */
    private static final Set<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
    /** Class logger; assigned in the static initializer. */
    public static final Logger LOG;

    /**
     * Creates a generator without a configuration. {@code main} uses this form and
     * hands the instance to {@code ToolRunner} together with a configuration.
     */
    public GeneratorJob() {
    }

    /**
     * Creates a generator bound to the supplied configuration.
     *
     * @param conf configuration passed straight to {@code setConf}
     */
    public GeneratorJob(Configuration conf) {
        setConf(conf);
    }

    /**
     * Collects the page fields the generate job needs to load.
     *
     * <p>The result is a fresh set seeded with this class's fixed field list and
     * extended with whatever the fetch schedule built from the job's configuration
     * reports through {@code getFields()}.
     *
     * @param job job whose configuration selects the fetch schedule
     * @return a new, caller-owned collection of required fields
     */
    public Collection<WebPage.Field> getFields(Job job) {
        Set<WebPage.Field> required = new HashSet<WebPage.Field>(FIELDS);
        Configuration jobConf = job.getConfiguration();
        required.addAll(FetchScheduleFactory.getFetchSchedule(jobConf).getFields());
        return required;
    }

    /**
     * Configures and executes the generate job, then reports its outcome.
     *
     * <p>Recognised entries in {@code args} (all optional):
     * <ul>
     *   <li>{@code "batch"} (String) - stored under {@link #BATCH_ID}</li>
     *   <li>{@code "topN"} (Long) - stored under {@link #GENERATOR_TOP_N}</li>
     *   <li>{@code "curTime"} (Long) - stored under {@link #GENERATOR_CUR_TIME};
     *       defaults to the current system time</li>
     *   <li>{@code "filter"} (Boolean) - stored under {@link #GENERATOR_FILTER}</li>
     *   <li>{@code "normalize"} (Boolean) - stored under {@link #GENERATOR_NORMALISE}</li>
     * </ul>
     *
     * <p>The value of {@link #GENERATOR_COUNT_MODE} (default "host") selects the
     * "partition.url.mode" setting; any value other than "host" or "domain" is
     * logged and replaced by "host".
     *
     * @param args argument map as described above
     * @return the results map, containing at least {@link #BATCH_ID} and
     *         {@link #GENERATE_COUNT}
     * @throws Exception if job setup or execution fails
     */
    @Override
    public Map<String, Object> run(Map<String, Object> args) throws Exception {
        Configuration conf = this.getConf();

        String batchId = (String)args.get("batch");
        if (batchId != null) {
            conf.set(BATCH_ID, batchId);
        }
        Long topN = (Long)args.get("topN");
        Long curTime = (Long)args.get("curTime");
        if (curTime == null) {
            curTime = System.currentTimeMillis();
        }
        Boolean filter = (Boolean)args.get("filter");
        Boolean norm = (Boolean)args.get("normalize");

        conf.setLong(GENERATOR_CUR_TIME, curTime.longValue());
        if (topN != null) {
            conf.setLong(GENERATOR_TOP_N, topN.longValue());
        }
        if (filter != null) {
            conf.setBoolean(GENERATOR_FILTER, filter.booleanValue());
        }
        // Wall-clock time of this invocation; the consumer of "_ngt_" is outside this file.
        conf.setLong("_ngt_", System.currentTimeMillis());
        if (norm != null) {
            conf.setBoolean(GENERATOR_NORMALISE, norm.booleanValue());
        }

        String mode = conf.get(GENERATOR_COUNT_MODE, GENERATOR_COUNT_VALUE_HOST);
        if (GENERATOR_COUNT_VALUE_HOST.equalsIgnoreCase(mode)) {
            conf.set("partition.url.mode", "byHost");
        } else if (GENERATOR_COUNT_VALUE_DOMAIN.equalsIgnoreCase(mode)) {
            conf.set("partition.url.mode", "byDomain");
        } else {
            // Name the property that actually holds the rejected value so the
            // warning points operators at the right setting.
            LOG.warn("Unknown " + GENERATOR_COUNT_MODE + " value '" + mode + "', using mode=" + GENERATOR_COUNT_VALUE_HOST);
            conf.set(GENERATOR_COUNT_MODE, GENERATOR_COUNT_VALUE_HOST);
            conf.set("partition.url.mode", "byHost");
        }

        // All configuration must be in place before the job object is created from it.
        this.numJobs = 1;
        this.currentJobNum = 0;
        this.currentJob = new NutchJob(conf, "generate: " + conf.get(BATCH_ID));
        Collection<WebPage.Field> fields = this.getFields(this.currentJob);
        StorageUtils.initMapperJob(this.currentJob, fields, SelectorEntry.class, WebPage.class, GeneratorMapper.class, URLPartitioner.SelectorEntryPartitioner.class, true);
        StorageUtils.initReducerJob(this.currentJob, GeneratorReducer.class);
        this.currentJob.waitForCompletion(true);

        ToolUtil.recordJobStatus(null, this.currentJob, this.results);
        this.results.put(BATCH_ID, conf.get(BATCH_ID));
        long generateCount = this.currentJob.getCounters().findCounter("Generator", "GENERATE_MARK").getValue();
        this.results.put(GENERATE_COUNT, generateCount);
        return this.results;
    }

    /**
     * Runs one generate pass with logging around it.
     *
     * <p>Packs the parameters into an argument map, delegates to
     * {@code run(Map)}, and logs start/finish times, elapsed time and the number
     * of generated URLs.
     *
     * @param topN    maximum number of URLs to select; only logged when it
     *                differs from {@code Long.MAX_VALUE}
     * @param curTime time, in milliseconds, passed on as "curTime"
     * @param filter  passed on as "filter"
     * @param norm    passed on as "normalize"
     * @return the batch id found in the configuration after the run, or
     *         {@code null} when the reported generate count is zero
     * @throws Exception propagated from {@code run(Map)}
     */
    public String generate(long topN, long curTime, boolean filter, boolean norm) throws Exception {
        final SimpleDateFormat timestamp = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        final long startedAt = System.currentTimeMillis();

        LOG.info("GeneratorJob: starting at " + timestamp.format(startedAt));
        LOG.info("GeneratorJob: Selecting best-scoring urls due for fetch.");
        LOG.info("GeneratorJob: starting");
        LOG.info("GeneratorJob: filtering: " + filter);
        LOG.info("GeneratorJob: normalizing: " + norm);
        final boolean unlimited = topN == Long.MAX_VALUE;
        if (!unlimited) {
            LOG.info("GeneratorJob: topN: " + topN);
        }

        Map<String, Object> outcome = this.run(ToolUtil.toArgMap("topN", topN, "curTime", curTime, "filter", filter, "normalize", norm));

        final String generatedBatch = this.getConf().get(BATCH_ID);
        final long finishedAt = System.currentTimeMillis();
        final long urlCount = (Long)outcome.get(GENERATE_COUNT);
        LOG.info("GeneratorJob: finished at " + timestamp.format(finishedAt) + ", time elapsed: " + TimingUtil.elapsedTime(startedAt, finishedAt));
        LOG.info("GeneratorJob: generated batch id: " + generatedBatch + " containing " + urlCount + " URLs");

        // An empty batch is signalled to the caller with null instead of an id.
        return urlCount == 0L ? null : generatedBatch;
    }

    /**
     * Command-line entry point: parses the options, then calls
     * {@code generate}.
     *
     * <p>A default batch id of the form {@code <epochSeconds>-<random>} is stored
     * in the configuration before parsing; {@code -batchId} overrides it.
     *
     * @param args command-line options (see the usage text printed when empty)
     * @return {@code 0} when a non-empty batch was generated, {@code 1} when the
     *         batch was empty, {@code -1} on a usage error (no arguments, unknown
     *         option, missing or non-numeric option value) or when generation
     *         threw an exception
     * @throws Exception declared for the tool contract; failures inside
     *         {@code generate} are logged and turned into {@code -1}
     */
    public int run(String[] args) throws Exception {
        if (args.length <= 0) {
            System.out.println("Usage: GeneratorJob [-topN N] [-crawlId id] [-noFilter] [-noNorm] [-adddays numDays] [-batchId id]");
            System.out.println("    -topN <N>      - number of top URLs to be selected, default is Long.MAX_VALUE ");
            System.out.println("    -crawlId <id>  - the id to prefix the schemas to operate on, \n \t \t    (default: storage.crawl.id)");
            System.out.println("    -noFilter      - do not activate the filter plugin to filter the url, default is true ");
            System.out.println("    -noNorm        - do not activate the normalizer plugin to normalize the url, default is true ");
            System.out.println("    -adddays       - Adds numDays to the current time to facilitate crawling urls already");
            System.out.println("                     fetched sooner then db.fetch.interval.default. Default value is 0.");
            System.out.println("    -batchId       - the batch id ");
            System.out.println("----------------------");
            System.out.println("Please set the params.");
            return -1;
        }
        long curTime = System.currentTimeMillis();
        long topN = Long.MAX_VALUE;
        boolean filter = true;
        boolean norm = true;
        // nextInt(bound) is always in [0, bound); Math.abs(nextInt()) would stay
        // negative for Integer.MIN_VALUE and leak a "--" into the batch id.
        int randomSeed = new Random().nextInt(Integer.MAX_VALUE);
        String batchId = curTime / 1000L + "-" + randomSeed;
        this.getConf().set(BATCH_ID, batchId);
        for (int i = 0; i < args.length; ++i) {
            String option = args[i];
            if ("-noFilter".equals(option)) {
                filter = false;
                continue;
            }
            if ("-noNorm".equals(option)) {
                norm = false;
                continue;
            }
            boolean takesValue = "-topN".equals(option) || "-crawlId".equals(option)
                    || "-adddays".equals(option) || "-batchId".equals(option);
            if (!takesValue) {
                System.err.println("Unrecognized arg " + option);
                return -1;
            }
            // Every remaining option consumes the following argument; report a
            // usage error instead of running off the end of the array.
            if (i + 1 >= args.length) {
                System.err.println("Missing value for " + option);
                return -1;
            }
            String value = args[++i];
            try {
                if ("-topN".equals(option)) {
                    topN = Long.parseLong(value);
                } else if ("-adddays".equals(option)) {
                    long numDays = Integer.parseInt(value);
                    curTime += numDays * 1000L * 60L * 60L * 24L;
                } else if ("-crawlId".equals(option)) {
                    this.getConf().set("storage.crawl.id", value);
                } else {
                    this.getConf().set(BATCH_ID, value);
                }
            }
            catch (NumberFormatException e) {
                System.err.println("Invalid number '" + value + "' for " + option);
                return -1;
            }
        }
        try {
            return this.generate(topN, curTime, filter, norm) != null ? 0 : 1;
        }
        catch (Exception e) {
            LOG.error("GeneratorJob: " + StringUtils.stringifyException((Throwable)e));
            return -1;
        }
    }

    /**
     * Launches the generator through {@code ToolRunner} with a freshly created
     * Nutch configuration and exits the JVM with the tool's return code.
     *
     * @param args command-line arguments forwarded to the tool
     * @throws Exception propagated from {@code ToolRunner.run}
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        Tool generator = new GeneratorJob();
        int exitCode = ToolRunner.run(conf, generator, args);
        System.exit(exitCode);
    }

    static {
        FIELDS.add(WebPage.Field.FETCH_TIME);
        FIELDS.add(WebPage.Field.SCORE);
        FIELDS.add(WebPage.Field.STATUS);
        FIELDS.add(WebPage.Field.MARKERS);
        LOG = LoggerFactory.getLogger(GeneratorJob.class);
        WritableComparator.define(SelectorEntry.class, (WritableComparator)new SelectorEntryComparator());
    }

    /**
     * Comparator for {@link SelectorEntry} keys. It adds no comparison logic of
     * its own; it only binds the base {@code WritableComparator} to the
     * {@code SelectorEntry} class.
     */
    public static class SelectorEntryComparator
    extends WritableComparator {
        /**
         * Binds the comparator to {@code SelectorEntry}. The second argument is
         * {@code true}; NOTE(review): in Hadoop's {@code WritableComparator} this
         * is the "create instances" flag - confirm against the Hadoop version in use.
         */
        public SelectorEntryComparator() {
            super(SelectorEntry.class, true);
        }
    }

    /**
     * Sort key pairing a URL with its score.
     *
     * <p>Entries order by score descending, with ties broken by URL ascending.
     * Serialized form is the URL string followed by the score as a float.
     */
    public static class SelectorEntry
    implements WritableComparable<SelectorEntry> {
        String url;
        float score;

        /** Creates an empty entry; {@code url} stays {@code null} until populated. */
        public SelectorEntry() {
        }

        /**
         * Creates an entry for the given URL and score.
         *
         * @param url   the URL
         * @param score the score used for ordering
         */
        public SelectorEntry(String url, float score) {
            this.url = url;
            this.score = score;
        }

        /**
         * Reads the URL and then the score, mirroring {@link #write}.
         *
         * @param in source to read from
         * @throws IOException if reading fails
         */
        public void readFields(DataInput in) throws IOException {
            this.url = Text.readString((DataInput)in);
            this.score = in.readFloat();
        }

        /**
         * Writes the URL and then the score.
         *
         * @param out sink to write to
         * @throws IOException if writing fails
         */
        public void write(DataOutput out) throws IOException {
            Text.writeString((DataOutput)out, (String)this.url);
            out.writeFloat(this.score);
        }

        /**
         * Orders by score descending, then by URL ascending.
         *
         * <p>Scores are compared with {@code Float.compare}, which is a total
         * order (NaN and signed zeros included). This keeps the ordering
         * antisymmetric and consistent with {@link #equals}, which compares the
         * score by its bit pattern.
         *
         * @param se entry to compare against
         * @return negative, zero or positive as this entry sorts before, equal
         *         to, or after {@code se}
         */
        public int compareTo(SelectorEntry se) {
            // Arguments are swapped on purpose: higher scores sort first.
            int byScore = Float.compare(se.score, this.score);
            if (byScore != 0) {
                return byScore;
            }
            return this.url.compareTo(se.url);
        }

        /**
         * Hash over URL and score bits; a {@code null} URL contributes 0.
         *
         * @return hash code consistent with {@link #equals}
         */
        @Override
        public int hashCode() {
            int result = 1;
            result = 31 * result + (this.url == null ? 0 : this.url.hashCode());
            result = 31 * result + Float.floatToIntBits(this.score);
            return result;
        }

        /**
         * Two entries are equal when their URLs are equal and their scores have
         * the same bit pattern. Returns {@code false} for {@code null} and for
         * objects of any other type.
         *
         * @param obj object to compare with
         * @return {@code true} if {@code obj} is an equal {@code SelectorEntry}
         */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof SelectorEntry)) {
                return false;
            }
            SelectorEntry other = (SelectorEntry)obj;
            boolean sameUrl = this.url == null ? other.url == null : this.url.equals(other.url);
            if (!sameUrl) {
                return false;
            }
            return Float.floatToIntBits(this.score) == Float.floatToIntBits(other.score);
        }

        /**
         * Replaces both fields, allowing an instance to be reused.
         *
         * @param url   the new URL
         * @param score the new score
         */
        public void set(String url, float score) {
            this.url = url;
            this.score = score;
        }
    }
}

