/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.util;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.TableUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class EncodingDetector {
    public static final Utf8 CONTENT_TYPE_UTF8 = new Utf8("Content-Type");
    public static final Logger LOG = LoggerFactory.getLogger(EncodingDetector.class);
    public static final int NO_THRESHOLD = -1;
    public static final String MIN_CONFIDENCE_KEY = "encodingdetector.charset.min.confidence";
    private static final HashMap<String, String> ALIASES = new HashMap();
    private static final HashSet<String> DETECTABLES = new HashSet();
    private static final int MIN_LENGTH = 4;
    private final int minConfidence;
    private final CharsetDetector detector;
    private final List<EncodingClue> clues;

    public EncodingDetector(Configuration conf) {
        this.minConfidence = conf.getInt(MIN_CONFIDENCE_KEY, -1);
        this.detector = new CharsetDetector();
        this.clues = new ArrayList<EncodingClue>();
    }

    public void autoDetectClues(WebPage page, boolean filter) {
        this.autoDetectClues(page.getContent(), page.getContentType(), EncodingDetector.parseCharacterEncoding(page.getHeaders().get(CONTENT_TYPE_UTF8)), filter);
    }

    private void autoDetectClues(ByteBuffer dataBuffer, CharSequence typeUtf8, String encoding, boolean filter) {
        int length = dataBuffer.remaining();
        String type = TableUtil.toString(typeUtf8);
        if (this.minConfidence >= 0 && DETECTABLES.contains(type) && length > 4) {
            CharsetMatch[] matches = null;
            try {
                this.detector.enableInputFilter(filter);
                this.detector.setText((InputStream)new ByteArrayInputStream(dataBuffer.array(), dataBuffer.arrayOffset() + dataBuffer.position(), length));
                matches = this.detector.detectAll();
            }
            catch (Exception e) {
                LOG.debug("Exception from ICU4J (ignoring): ", (Throwable)e);
            }
            if (matches != null) {
                for (void var11_12 : matches) {
                    this.addClue(var11_12.getName(), "detect", var11_12.getConfidence());
                }
            }
        }
        this.addClue(encoding, "header");
    }

    public void addClue(String value, String source, int confidence) {
        if (value == null || "".equals(value)) {
            return;
        }
        if ((value = EncodingDetector.resolveEncodingAlias(value)) != null) {
            this.clues.add(new EncodingClue(value, source, confidence));
        }
    }

    public void addClue(String value, String source) {
        this.addClue(value, source, -1);
    }

    public String guessEncoding(WebPage page, String defaultValue) {
        CharSequence baseUrlUtf8 = page.getBaseUrl();
        String baseUrl = TableUtil.toString(baseUrlUtf8);
        return this.guessEncoding(baseUrl, defaultValue);
    }

    private String guessEncoding(String baseUrl, String defaultValue) {
        EncodingClue defaultClue;
        if (LOG.isTraceEnabled()) {
            this.findDisagreements(baseUrl, this.clues);
        }
        EncodingClue bestClue = defaultClue = new EncodingClue(defaultValue, "default");
        for (EncodingClue clue : this.clues) {
            if (LOG.isTraceEnabled()) {
                LOG.trace(baseUrl + ": charset " + clue);
            }
            String charset = clue.value;
            if (this.minConfidence >= 0 && clue.confidence >= this.minConfidence) {
                if (LOG.isTraceEnabled()) {
                    LOG.trace(baseUrl + ": Choosing encoding: " + charset + " with confidence " + clue.confidence);
                }
                return EncodingDetector.resolveEncodingAlias(charset).toLowerCase();
            }
            if (clue.confidence != -1 || bestClue != defaultClue) continue;
            bestClue = clue;
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace(baseUrl + ": Choosing encoding: " + bestClue);
        }
        return bestClue.value.toLowerCase();
    }

    public void clearClues() {
        this.clues.clear();
    }

    private void findDisagreements(String url, List<EncodingClue> newClues) {
        HashSet<String> valsSeen = new HashSet<String>();
        HashSet<String> sourcesSeen = new HashSet<String>();
        boolean disagreement = false;
        for (int i = 0; i < newClues.size(); ++i) {
            EncodingClue clue = newClues.get(i);
            if (clue.isEmpty() || sourcesSeen.contains(clue.source)) continue;
            if (valsSeen.size() > 0 && !valsSeen.contains(clue.value) && clue.meetsThreshold()) {
                disagreement = true;
            }
            if (clue.meetsThreshold()) {
                valsSeen.add(clue.value);
            }
            sourcesSeen.add(clue.source);
        }
        if (disagreement) {
            StringBuffer sb = new StringBuffer();
            sb.append("Disagreement: " + url + "; ");
            for (int i = 0; i < newClues.size(); ++i) {
                if (i > 0) {
                    sb.append(", ");
                }
                sb.append(newClues.get(i));
            }
            LOG.trace(sb.toString());
        }
    }

    public static String resolveEncodingAlias(String encoding) {
        try {
            if (encoding == null || !Charset.isSupported(encoding)) {
                return null;
            }
            String canonicalName = new String(Charset.forName(encoding).name());
            return ALIASES.containsKey(canonicalName) ? ALIASES.get(canonicalName) : canonicalName;
        }
        catch (Exception e) {
            LOG.warn("Invalid encoding " + encoding + " detected, using default.");
            return null;
        }
    }

    public static String parseCharacterEncoding(CharSequence contentTypeUtf8) {
        if (contentTypeUtf8 == null) {
            return null;
        }
        String contentType = contentTypeUtf8.toString();
        int start = contentType.indexOf("charset=");
        if (start < 0) {
            return null;
        }
        String encoding = contentType.substring(start + 8);
        int end = encoding.indexOf(59);
        if (end >= 0) {
            encoding = encoding.substring(0, end);
        }
        if ((encoding = encoding.trim()).length() > 2 && encoding.startsWith("\"") && encoding.endsWith("\"")) {
            encoding = encoding.substring(1, encoding.length() - 1);
        }
        return encoding.trim();
    }

    static {
        DETECTABLES.add("text/html");
        DETECTABLES.add("text/plain");
        DETECTABLES.add("text/richtext");
        DETECTABLES.add("text/rtf");
        DETECTABLES.add("text/sgml");
        DETECTABLES.add("text/tab-separated-values");
        DETECTABLES.add("text/xml");
        DETECTABLES.add("application/rss+xml");
        DETECTABLES.add("application/xhtml+xml");
        ALIASES.put("ISO-8859-1", "windows-1252");
        ALIASES.put("EUC-KR", "x-windows-949");
        ALIASES.put("x-EUC-CN", "GB18030");
        ALIASES.put("GBK", "GB18030");
    }

    private class EncodingClue {
        private final String value;
        private final String source;
        private final int confidence;

        public EncodingClue(String value, String source) {
            this(value, source, -1);
        }

        public EncodingClue(String value, String source, int confidence) {
            this.value = value.toLowerCase();
            this.source = source;
            this.confidence = confidence;
        }

        public String getSource() {
            return this.source;
        }

        public String getValue() {
            return this.value;
        }

        public String toString() {
            return this.value + " (" + this.source + (this.confidence >= 0 ? ", " + this.confidence + "% confidence" : "") + ")";
        }

        public boolean isEmpty() {
            return this.value == null || "".equals(this.value);
        }

        public boolean meetsThreshold() {
            return this.confidence < 0 || EncodingDetector.this.minConfidence >= 0 && this.confidence >= EncodingDetector.this.minConfidence;
        }
    }
}

