/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.crawl;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import org.apache.hadoop.io.MD5Hash;
import org.apache.nutch.crawl.MD5Signature;
import org.apache.nutch.crawl.Signature;
import org.apache.nutch.storage.WebPage;

/**
 * Signature implementation that hashes a quantized word-frequency profile of
 * the page text instead of the raw content.
 *
 * <p>The algorithm, as implemented in {@link #calculate(WebPage)}:
 * <ol>
 * <li>split the text into runs of letters/digits, lower-casing each character;</li>
 * <li>keep only tokens strictly longer than
 * {@code db.signature.text_profile.min_token_len} (default 2) and count them;</li>
 * <li>round every count down to a multiple of the quantum, where the quantum is
 * {@code round(maxFreq * db.signature.text_profile.quant_rate)} (default rate
 * 0.01), raised to 2 (or to 1 when no token occurs more than once);</li>
 * <li>drop tokens whose rounded count fell below the quantum;</li>
 * <li>sort by decreasing count, ties broken by token text, join as
 * {@code "token count"} lines and return the MD5 digest of that string.</li>
 * </ol>
 *
 * <p>When the page has no text, the {@link MD5Signature} fallback is used.
 */
public class TextProfileSignature
extends Signature {
    private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
    Signature fallback = new MD5Signature();

    static {
        FIELDS.add(WebPage.Field.CONTENT);
    }

    /**
     * Calculates the text-profile signature of a page.
     *
     * @param page page whose text ({@code page.getText()}) is profiled
     * @return the digest bytes of the profile, or the result of the fallback
     *         signature when the page text is {@code null} or empty
     */
    @Override
    public byte[] calculate(WebPage page) {
        int minTokenLen = this.getConf().getInt("db.signature.text_profile.min_token_len", 2);
        float quantRate = this.getConf().getFloat("db.signature.text_profile.quant_rate", 0.01f);
        String text = null;
        if (page.getText() != null) {
            text = page.getText().toString();
        }
        if (text == null || text.length() == 0) {
            return this.fallback.calculate(page);
        }

        HashMap<String, Token> tokens = new HashMap<String, Token>();
        int maxFreq = countTokens(text, minTokenLen, tokens);

        // The quantum is never allowed to drop below 1, so the divisions in
        // buildProfile are safe; it is 1 only when every token is unique.
        int quant = Math.round((float)maxFreq * quantRate);
        if (quant < 2) {
            quant = maxFreq > 1 ? 2 : 1;
        }

        ArrayList<Token> profile = buildProfile(tokens.values(), quant);

        StringBuilder newText = new StringBuilder();
        for (Token t : profile) {
            if (newText.length() > 0) {
                newText.append("\n");
            }
            newText.append(t.toString());
        }
        return MD5Hash.digest((String)newText.toString()).getDigest();
    }

    /**
     * Returns the page fields declared by this signature; the collection
     * contains {@code WebPage.Field.CONTENT}.
     */
    @Override
    public Collection<WebPage.Field> getFields() {
        return FIELDS;
    }

    /**
     * Splits {@code text} into lower-cased letter/digit runs and counts each
     * run that is strictly longer than {@code minTokenLen} into {@code tokens}.
     *
     * @return the highest count reached by any single token (0 if none qualified)
     */
    private static int countTokens(String text, int minTokenLen, HashMap<String, Token> tokens) {
        StringBuilder current = new StringBuilder();
        int maxFreq = 0;
        for (int i = 0; i < text.length(); ++i) {
            char c = text.charAt(i);
            if (Character.isLetterOrDigit(c)) {
                current.append(Character.toLowerCase(c));
                continue;
            }
            // Any other character terminates the token being collected.
            maxFreq = Math.max(maxFreq, flushToken(current, minTokenLen, tokens));
        }
        // The text may end in the middle of a token; flush it the same way.
        return Math.max(maxFreq, flushToken(current, minTokenLen, tokens));
    }

    /**
     * Registers the buffered token, if it is non-empty and strictly longer than
     * {@code minTokenLen}, then clears the buffer.
     *
     * @return the token's updated count, or 0 when nothing was registered
     */
    private static int flushToken(StringBuilder current, int minTokenLen, HashMap<String, Token> tokens) {
        int count = 0;
        if (current.length() > 0 && current.length() > minTokenLen) {
            String value = current.toString();
            Token tok = tokens.get(value);
            if (tok == null) {
                tok = new Token(0, value);
                tokens.put(value, tok);
            }
            count = ++tok.cnt;
        }
        current.setLength(0);
        return count;
    }

    /**
     * Rounds every token count down to a multiple of {@code quant}, discards
     * tokens that fall below {@code quant}, and returns the survivors sorted
     * with {@link TokenComparator}.
     */
    private static ArrayList<Token> buildProfile(Collection<Token> tokens, int quant) {
        ArrayList<Token> profile = new ArrayList<Token>();
        for (Token t : tokens) {
            t.cnt = t.cnt / quant * quant;
            if (t.cnt < quant) {
                continue;
            }
            profile.add(t);
        }
        Collections.sort(profile, new TokenComparator());
        return profile;
    }

    /**
     * Orders tokens by decreasing count, then by token text.
     *
     * <p>The secondary key matters: the tokens come out of a {@code HashMap},
     * whose iteration order is unspecified, so without it tokens sharing a
     * count could be emitted in differing orders and the same profile could
     * produce different digests.
     */
    private static class TokenComparator
    implements Comparator<Token> {
        private TokenComparator() {
        }

        @Override
        public int compare(Token t1, Token t2) {
            if (t1.cnt != t2.cnt) {
                return t1.cnt < t2.cnt ? 1 : -1;
            }
            return t1.val.compareTo(t2.val);
        }
    }

    /** A token of the text together with its (possibly quantized) count. */
    private static class Token {
        public int cnt;
        public String val;

        public Token(int cnt, String val) {
            this.cnt = cnt;
            this.val = val;
        }

        /** Renders the token as {@code "<val> <cnt>"}, the line format that gets hashed. */
        @Override
        public String toString() {
            return this.val + " " + this.cnt;
        }
    }
}

