/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.indexer;

import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
import org.apache.gora.mapreduce.GoraMapper;
import org.apache.gora.mapreduce.StringComparator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.indexer.IndexCleaningFilters;
import org.apache.nutch.indexer.IndexWriters;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.storage.StorageUtils;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.ToolUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tool that deletes documents from the configured index writers.
 *
 * <p>The map phase ({@link CleanMapper}) forwards every {@link WebPage} whose status
 * equals {@code 3} or for which {@code IndexCleaningFilters.remove(key, page)} returns
 * {@code true}. The reduce phase ({@link CleanReducer}) issues one delete per forwarded
 * key and, when the {@link #ARG_COMMIT} flag is set, commits once at the end of the task.
 */
public class CleaningJob
extends NutchTool
implements Tool {
    /** Key used both in the argument map and in the job configuration for the commit flag. */
    public static final String ARG_COMMIT = "commit";
    public static final Logger LOG = LoggerFactory.getLogger(CleaningJob.class);

    private static final String USAGE = "Usage: CleaningJob [-crawlId <id>] [-noCommit]";
    private static final String CRAWL_ID_KEY = "storage.crawl.id";

    /** Fields this job always reads, independent of the configured cleaning filters. */
    private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();

    static {
        FIELDS.add(WebPage.Field.STATUS);
    }

    private Configuration conf;

    /** Outcome of the most recent {@code waitForCompletion} call made by {@link #run(Map)}. */
    private boolean jobSucceeded;

    @Override
    public Configuration getConf() {
        return this.conf;
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    /**
     * Returns the fields the mapper needs: {@code STATUS} plus whatever the
     * {@link IndexCleaningFilters} built from the job's configuration report.
     *
     * @param job job whose configuration is used to build the cleaning filters
     * @return a fresh collection; the shared static set is never handed out
     */
    public Collection<WebPage.Field> getFields(Job job) {
        Collection<WebPage.Field> columns = new HashSet<WebPage.Field>(FIELDS);
        IndexCleaningFilters filters = new IndexCleaningFilters(job.getConfiguration());
        columns.addAll(filters.getFields());
        return columns;
    }

    /**
     * Configures and runs the cleaning job, blocking until it completes.
     *
     * @param args argument map; {@link #ARG_COMMIT} may hold a {@link Boolean} or any
     *             object whose string form {@link Boolean#parseBoolean(String)} understands.
     *             When the key is absent the configuration is left untouched, so the
     *             reducer falls back to its own default of {@code false}.
     * @return the {@code results} map after the job status has been recorded
     * @throws Exception if job setup or execution fails
     */
    @Override
    public Map<String, Object> run(Map<String, Object> args) throws Exception {
        Object commitArg = args == null ? null : args.get(ARG_COMMIT);
        if (commitArg != null) {
            boolean commit = commitArg instanceof Boolean
                    ? ((Boolean) commitArg).booleanValue()
                    : Boolean.parseBoolean(commitArg.toString());
            this.getConf().setBoolean(ARG_COMMIT, commit);
        }

        this.currentJob = new NutchJob(this.getConf(), "CleaningJob");
        this.currentJob.getConfiguration().setClass(
                "mapred.output.key.comparator.class", StringComparator.class, RawComparator.class);

        Collection<WebPage.Field> fields = this.getFields(this.currentJob);
        StorageUtils.initMapperJob(this.currentJob, fields, String.class, WebPage.class, CleanMapper.class);
        this.currentJob.setReducerClass(CleanReducer.class);
        this.currentJob.setOutputFormatClass(NullOutputFormat.class);

        // Remember the outcome so delete() can report failure through its exit code.
        this.jobSucceeded = this.currentJob.waitForCompletion(true);
        ToolUtil.recordJobStatus(null, this.currentJob, this.results);
        return this.results;
    }

    /**
     * Runs the cleaning job with the given commit flag.
     *
     * @param commit whether the reducers should commit after issuing deletes
     * @return {@code 0} when the job completed successfully, {@code 1} otherwise
     * @throws Exception if job setup or execution fails
     */
    public int delete(boolean commit) throws Exception {
        LOG.info("CleaningJob: starting");
        this.run(ToolUtil.toArgMap(ARG_COMMIT, commit));
        if (!this.jobSucceeded) {
            LOG.error("CleaningJob: job failed");
            return 1;
        }
        LOG.info("CleaningJob: done");
        return 0;
    }

    /**
     * Command-line entry point.
     *
     * <p>Recognised options, in any order: {@code -crawlId <id>} stores the id under
     * {@code storage.crawl.id}; {@code -noCommit} disables the commit. Unrecognised
     * tokens are logged and ignored. At least one argument is required; that guard is
     * kept from the previous implementation even though the usage string marks both
     * options as optional.
     *
     * @param args command-line arguments
     * @return {@code 1} on a usage error, otherwise the result of {@link #delete(boolean)}
     * @throws Exception if the job fails to run
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println(USAGE);
            return 1;
        }

        boolean commit = true;
        for (int i = 0; i < args.length; i++) {
            String arg = args[i];
            if ("-noCommit".equals(arg)) {
                commit = false;
            } else if ("-crawlId".equals(arg)) {
                if (i + 1 >= args.length) {
                    System.err.println("Missing value for -crawlId");
                    System.err.println(USAGE);
                    return 1;
                }
                // Consume the option's value so it is not treated as a flag itself.
                this.getConf().set(CRAWL_ID_KEY, args[++i]);
            } else {
                LOG.warn("CleaningJob: ignoring unrecognized argument '{}'", arg);
            }
        }
        return this.delete(commit);
    }

    public static void main(String[] args) throws Exception {
        int result = ToolRunner.run(NutchConfiguration.create(), new CleaningJob(), args);
        System.exit(result);
    }

    /**
     * Reducer that issues one index delete per incoming key and optionally commits
     * when the task finishes.
     */
    public static class CleanReducer
    extends Reducer<String, WebPage, NullWritable, NullWritable> {
        private int numDeletes = 0;
        private boolean commit;
        IndexWriters writers = null;

        /**
         * Opens the index writers and reads the commit flag (default {@code false}).
         *
         * @throws IOException if the writers cannot be opened
         */
        @Override
        public void setup(Context job) throws IOException {
            Configuration conf = job.getConfiguration();
            this.writers = new IndexWriters(conf);
            this.writers.open(conf);
            this.commit = conf.getBoolean(CleaningJob.ARG_COMMIT, false);
        }

        /**
         * Deletes the document identified by {@code key} and bumps the
         * {@code SolrClean/DELETED} counter. The grouped values are not inspected.
         */
        @Override
        public void reduce(String key, Iterable<WebPage> values, Context context) throws IOException {
            this.writers.delete(key);
            ++this.numDeletes;
            context.getCounter("SolrClean", "DELETED").increment(1L);
        }

        /**
         * Closes the writers, then commits if any delete was issued and committing is enabled.
         */
        @Override
        public void cleanup(Context context) throws IOException {
            // NOTE(review): commit() is intentionally left after close(), as in the previous
            // implementation - presumably close() is what flushes buffered deletes. Confirm
            // against IndexWriters before reordering.
            this.writers.close();
            if (this.numDeletes > 0 && this.commit) {
                this.writers.commit();
            }
            LOG.info("CleaningJob: deleted a total of {} documents", this.numDeletes);
        }
    }

    /**
     * Mapper that forwards only the pages that should be removed from the index.
     */
    public static class CleanMapper
    extends GoraMapper<String, WebPage, String, WebPage> {
        // NOTE(review): presumably the "gone" crawl status (CrawlStatus.STATUS_GONE) - confirm.
        private static final int DELETABLE_STATUS = 3;

        private IndexCleaningFilters filters;

        @Override
        protected void setup(Context context) throws IOException {
            this.filters = new IndexCleaningFilters(context.getConfiguration());
        }

        /**
         * Emits {@code (key, page)} when the page's status equals {@code 3} or the cleaning
         * filters ask for its removal. An {@link IndexingException} from the filters is
         * logged and the page is skipped.
         */
        @Override
        public void map(String key, WebPage page, Context context) throws IOException, InterruptedException {
            try {
                if (page.getStatus() == DELETABLE_STATUS || this.filters.remove(key, page)) {
                    context.write(key, page);
                }
            }
            catch (IndexingException e) {
                LOG.warn("Error indexing {}: {}", key, e.toString());
            }
        }
    }
}

