/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.WordTag;
import edu.stanford.nlp.parser.lexparser.IntTaggedWord;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.trees.Tree;
import java.io.Serializable;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
public class ChineseUnknownWordModel
implements Serializable {
    private static final String encoding = "GB18030";
    private static final boolean VERBOSE = false;
    private boolean useFirst = true;
    private boolean useGT = false;
    boolean useUnicodeType = false;
    private static final String unknown = "UNK";
    private static final String dateMatch = ".*[\u5e74\u6708\u65e5\u53f7]";
    private static final String numberMatch = ".*[\uff10\uff11\uff12\uff13\uff14\uff15\uff16\uff17\uff18\uff19\uff11\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e\u5343\u4e07\u4ebf].*";
    private static final String ordinalMatch = "\u7b2c.*";
    private static final String properNameMatch = ".*\u00b7.*";
    private Map<String, Counter<String>> tagHash = new HashMap<String, Counter<String>>();
    private Set seenFirst = new HashSet();
    private Map unknownGT = new HashMap();
    private static final long serialVersionUID = 221L;

    void useGoodTuring() {
        this.useGT = true;
        this.useFirst = false;
    }

    public double score(IntTaggedWord itw) {
        return this.score(itw.toTaggedWord());
    }

    /*
     * Enabled force condition propagation
     * Lifted jumps to return sites
     */
    public double score(TaggedWord tw) {
        String word = tw.word();
        String tag = tw.tag();
        if (word.matches(dateMatch)) {
            if (!tag.equals("NT")) return Double.NEGATIVE_INFINITY;
            return 0.0;
        }
        if (word.matches(numberMatch)) {
            if (tag.equals("CD") && !word.matches(ordinalMatch)) {
                return 0.0;
            }
            if (!tag.equals("OD")) return Double.NEGATIVE_INFINITY;
            if (!word.matches(ordinalMatch)) return Double.NEGATIVE_INFINITY;
            return 0.0;
        }
        if (word.matches(properNameMatch)) {
            if (!tag.equals("NR")) return Double.NEGATIVE_INFINITY;
            return 0.0;
        }
        if (this.useFirst) {
            Counter<String> wordProbs;
            char ch;
            int type;
            String first = word.substring(0, 1);
            if (this.useUnicodeType && (type = Character.getType(ch = word.charAt(0))) != 5) {
                first = Integer.toString(type);
            }
            if (!this.seenFirst.contains(first)) {
                if (this.useGT) {
                    return this.scoreGT(tag);
                }
                first = unknown;
            }
            if ((wordProbs = this.tagHash.get(tag)) == null) {
                return Double.NEGATIVE_INFINITY;
            }
            if (!wordProbs.containsKey(first)) return wordProbs.getCount(unknown);
            return wordProbs.getCount(first);
        }
        if (!this.useGT) return Double.NEGATIVE_INFINITY;
        return this.scoreGT(tag);
    }

    private double scoreGT(String tag) {
        double logProb = this.unknownGT.containsKey(tag) ? (Double)this.unknownGT.get(tag) : Double.NEGATIVE_INFINITY;
        return logProb;
    }

    public void train(Collection<Tree> trees) {
        if (this.useFirst) {
            System.err.println("ChineseUWM: treating unknown word as the average of their equivalents by first-character identity. useUnicodeType: " + this.useUnicodeType);
        }
        if (this.useGT) {
            System.err.println("ChineseUWM: using Good-Turing smoothing for unknown words.");
        }
        this.trainUnknownGT(trees);
        HashMap c = new HashMap();
        Counter<String> tc = new Counter<String>();
        for (Tree t : trees) {
            Sentence words = t.taggedYield();
            for (TaggedWord tw : words) {
                String tag;
                char ch;
                int type;
                String word = tw.word();
                String first = tw.word().substring(0, 1);
                if (this.useUnicodeType && (type = Character.getType(ch = word.charAt(0))) != 5) {
                    first = Integer.toString(type);
                }
                if (!c.containsKey(tag = tw.tag())) {
                    c.put(tag, new Counter());
                }
                ((Counter)c.get(tag)).incrementCount(first);
                tc.incrementCount(tag);
                this.seenFirst.add(first);
            }
        }
        for (String tag : c.keySet()) {
            Counter wc = (Counter)c.get(tag);
            if (!this.tagHash.containsKey(tag)) {
                this.tagHash.put(tag, new Counter());
            }
            tc.incrementCount(tag);
            wc.setCount(unknown, 1.0);
            for (String first : wc.keySet()) {
                double prob = Math.log(wc.getCount(first) / tc.getCount(tag));
                this.tagHash.get(tag).setCount(first, prob);
            }
        }
    }

    private void trainUnknownGT(Collection<Tree> trees) {
        Counter<TaggedWord> twCount = new Counter<TaggedWord>();
        Counter<WordTag> wtCount = new Counter<WordTag>();
        Counter<String> tagCount = new Counter<String>();
        Counter<String> r1 = new Counter<String>();
        Counter<String> r0 = new Counter<String>();
        HashSet<String> seenWords = new HashSet<String>();
        int tokens = 0;
        for (Tree t : trees) {
            Sentence words = t.taggedYield();
            Iterator j = words.iterator();
            while (j.hasNext()) {
                ++tokens;
                TaggedWord tw = (TaggedWord)j.next();
                WordTag wt = ChineseUnknownWordModel.toWordTag(tw);
                String word = wt.word();
                String tag = wt.tag();
                wtCount.incrementCount(wt);
                twCount.incrementCount(tw);
                tagCount.incrementCount(tag);
                boolean alreadySeen = seenWords.add(word);
            }
        }
        System.err.println("Total tokens: " + tokens + " [num words + numSent (boundarySymbols)]");
        System.err.println("Total WordTag types: " + wtCount.keySet().size());
        System.err.println("Total TaggedWord types: " + twCount.keySet().size() + " [should equal word types!]");
        System.err.println("Total tag types: " + tagCount.keySet().size());
        System.err.println("Total word types: " + seenWords.size());
        for (WordTag wt : wtCount.keySet()) {
            if (wtCount.getCount(wt) != 1.0) continue;
            r1.incrementCount(wt.tag());
        }
        for (String tag : tagCount.keySet()) {
            for (String word : seenWords) {
                WordTag wt = new WordTag(word, tag);
                if (wtCount.containsKey(wt)) continue;
                r0.incrementCount(tag);
            }
        }
        for (String tag : tagCount.keySet()) {
            double logprob = Math.log(r1.getCount(tag) / (tagCount.getCount(tag) * r0.getCount(tag)));
            this.unknownGT.put(tag, new Double(logprob));
        }
    }

    public static void main(String[] args) {
        System.out.println("Testing unknown matching");
        String s = "\u5218\u00b7\u9769\u547d";
        if (s.matches(properNameMatch)) {
            System.out.println("hooray names!");
        } else {
            System.out.println("Uh-oh names!");
        }
        String s1 = "\uff13\uff10\uff10\uff10";
        if (s1.matches(numberMatch)) {
            System.out.println("hooray numbers!");
        } else {
            System.out.println("Uh-oh numbers!");
        }
        String s11 = "\u767e\u5206\u4e4b\u56db\u5341\u4e09\u70b9\u4e8c";
        if (s1.matches(numberMatch)) {
            System.out.println("hooray numbers!");
        } else {
            System.out.println("Uh-oh numbers!");
        }
        String s12 = "\u767e\u5206\u4e4b\u4e09\u5341\u516b\u70b9\u516d";
        if (s1.matches(numberMatch)) {
            System.out.println("hooray numbers!");
        } else {
            System.out.println("Uh-oh numbers!");
        }
        String s2 = "\u4e09\u6708";
        if (s2.matches(dateMatch)) {
            System.out.println("hooray dates!");
        } else {
            System.out.println("Uh-oh dates!");
        }
        System.out.println("Testing tagged word");
        Counter<TaggedWord> c = new Counter<TaggedWord>();
        TaggedWord tw1 = new TaggedWord("w", "t");
        c.incrementCount(tw1);
        TaggedWord tw2 = new TaggedWord("w", "t2");
        System.out.println(c.containsKey(tw2));
        System.out.println(tw1.equals(tw2));
        WordTag wt1 = ChineseUnknownWordModel.toWordTag(tw1);
        WordTag wt2 = ChineseUnknownWordModel.toWordTag(tw2);
        WordTag wt3 = new WordTag("w", "t2");
        System.out.println(wt1.equals(wt2));
        System.out.println(wt2.equals(wt3));
    }

    private static WordTag toWordTag(TaggedWord tw) {
        return new WordTag(tw.word(), tw.tag());
    }
}

