/*
 * Decompiled with CFR 0.152.
 */
package weka.filters.unsupervised.attribute;

import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.LexedTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.StringUtils;
import java.io.File;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.List;
import java.util.Vector;
import java.util.regex.Pattern;
import weka.core.Capabilities;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.RevisionUtils;
import weka.core.Utils;
import weka.core.stopwords.Null;
import weka.core.stopwords.StopwordsHandler;
import weka.core.tokenizers.Tokenizer;
import weka.core.tokenizers.WhiteSpaceTokenizer;
import weka.core.tokenizers.WordTokenizer;
import weka.filters.Filter;
import weka.filters.SimpleStreamFilter;

/**
 * Stream filter that rewrites the selected string attributes using the
 * Stanford lexicalized parser.
 *
 * <p>Each selected, non-missing string value is split into sentences, every
 * sentence is tokenized with the configured {@link Tokenizer}, parsed with the
 * loaded {@link LexicalizedParser} model, and the leaves of the resulting parse
 * tree are written back as {@code label:word} tokens (or just {@code word} when
 * label prefixes are suppressed). Words rejected by the stopwords handler or
 * whose label does not match the label regular expression are dropped.
 */
public class PartOfSpeechTagging extends SimpleStreamFilter {

    /** For serialization. */
    private static final long serialVersionUID = 5180866251200474411L;

    /** Shared, lazily created factory used for splitting documents into sentences. */
    protected static TokenizerFactory m_TokenizerFactory;

    /** The range of attributes to process. */
    protected Range m_AttributeIndices = new Range("first-last");

    /**
     * The tokenizer applied to each sentence before parsing.
     * NOTE(review): the initial value is a WhiteSpaceTokenizer whereas
     * {@link #setOptions(String[])} falls back to a WordTokenizer when no
     * tokenizer option is supplied - confirm which default is intended.
     */
    protected Tokenizer m_Tokenizer = new WhiteSpaceTokenizer();

    /** The Stanford model file to load. */
    protected File m_Model = new File(".");

    /** Additional options handed to the parser when the model is loaded. */
    protected String m_AdditionalOptions = "";

    /** The parser instance; created in {@link #determineOutputFormat(Instances)}, cleared by {@link #reset()}. */
    protected LexicalizedParser m_Parser = null;

    /** Whether to omit the {@code label:} prefix in front of each word. */
    protected boolean m_SuppressLabelPrefixes = false;

    /** Regular expression a label must match for its word to be kept. */
    protected String m_RegExpLabels = ".*";

    /** The stopwords handler consulted for every word of the parse tree. */
    protected StopwordsHandler m_Stopwords = new Null();

    /**
     * Returns a short description of this filter.
     *
     * @return the description text
     */
    public String globalInfo() {
        return "Performs part-of-speech tagging using the Stanford parser and the user-specified model.";
    }

    /**
     * Lists the command-line options of this filter followed by the options
     * of the superclass.
     *
     * @return an enumeration of all available options
     */
    public Enumeration<Option> listOptions() {
        // Must be Vector<Option>: elements() has to yield Enumeration<Option>.
        Vector<Option> result = new Vector<Option>();
        result.addElement(new Option(
            "\tThe attribute range to work on.\nThis is a comma separated list of attribute indices, with \"first\" and \"last\" valid values.\n\tSpecify an inclusive range with \"-\".\n\tE.g: \"first-3,5,6-10,last\".\n\t(default: first-last)",
            "R", 1, "-R <range>"));
        result.addElement(new Option(
            "\tInverts the attribute selection range.\n\t(default: off)",
            "V", 0, "-V"));
        result.addElement(new Option(
            "\tThe tokenizing algorihtm (classname plus parameters) to use.\n\t(default: " + WordTokenizer.class.getName() + ")",
            "tokenizer", 1, "-tokenizer <spec>"));
        result.addElement(new Option(
            "\tThe stanford model file to use.\n\t(default: .",
            "model", 1, "-model <file>"));
        result.addElement(new Option(
            "\tThe additional options for the parser.\n\t(default: ",
            "additional", 1, "-additional <options>"));
        result.addElement(new Option(
            "\tWhether to suppress label prefixes (like VP or NP).\n\t(default: off)",
            "suppress-label-prefixes", 0, "-suppress-label-prefixes"));
        result.addElement(new Option(
            "\tThe regular expression for the labels (like NV or VP) to keep.\n\tWords which labels don't match this expression get dropped.\n\t(default: .*",
            "regexp-labels", 1, "-regexp-labels <expression>"));
        result.addElement(new Option(
            "\tThe stopwords algorihtm (classname plus parameters) to use.\n\t(default: " + Null.class.getName() + ")",
            "stopwords", 1, "-stopwords <spec>"));
        result.addAll(Collections.list(super.listOptions()));
        return result.elements();
    }

    /**
     * Parses the given options and configures this filter accordingly.
     * Options that are absent are reset to their defaults. If an input format
     * has already been set it is re-applied, then the remaining options are
     * passed on to the superclass.
     *
     * @param options the options to parse
     * @throws Exception if a tokenizer/stopwords specification is empty, a
     *                   class cannot be instantiated, or unknown options remain
     */
    public void setOptions(String[] options) throws Exception {
        String tmpStr = Utils.getOption("R", options);
        setAttributeIndices(tmpStr.isEmpty() ? "first-last" : tmpStr);

        setInvertSelection(Utils.getFlag("V", options));

        tmpStr = Utils.getOption("tokenizer", options);
        if (tmpStr.isEmpty()) {
            setTokenizer(new WordTokenizer());
        } else {
            String[] classOptions = Utils.splitOptions(tmpStr);
            if (classOptions.length == 0) {
                throw new Exception("Invalid tokenizer specification string");
            }
            // First element is the class name; blank it so only the class's own options remain.
            String className = classOptions[0];
            classOptions[0] = "";
            Tokenizer tokenizer = (Tokenizer) Class.forName(className).getDeclaredConstructor().newInstance();
            tokenizer.setOptions(classOptions);
            setTokenizer(tokenizer);
        }

        tmpStr = Utils.getOption("model", options);
        setModel(new File(tmpStr.isEmpty() ? "." : tmpStr));

        // getOption yields an empty string when the option is absent, which is also the default.
        setAdditionalOptions(Utils.getOption("additional", options));

        setSuppressLabelPrefixes(Utils.getFlag("suppress-label-prefixes", options));

        tmpStr = Utils.getOption("regexp-labels", options);
        setRegExpLabels(tmpStr.isEmpty() ? ".*" : tmpStr);

        tmpStr = Utils.getOption("stopwords", options);
        if (tmpStr.isEmpty()) {
            setStopwords(new Null());
        } else {
            String[] classOptions = Utils.splitOptions(tmpStr);
            if (classOptions.length == 0) {
                throw new Exception("Invalid stopwords specification string");
            }
            String className = classOptions[0];
            classOptions[0] = "";
            StopwordsHandler stopwords =
                (StopwordsHandler) Class.forName(className).getDeclaredConstructor().newInstance();
            // Only handlers that accept options are configured with the remaining options.
            if (stopwords instanceof OptionHandler) {
                ((OptionHandler) stopwords).setOptions(classOptions);
            }
            setStopwords(stopwords);
        }

        if (getInputFormat() != null) {
            setInputFormat(getInputFormat());
        }
        super.setOptions(options);
        Utils.checkForRemainingOptions(options);
    }

    /**
     * Returns the current configuration as command-line options, followed by
     * the options of the superclass.
     *
     * @return the options array
     */
    public String[] getOptions() {
        List<String> result = new ArrayList<String>();
        result.add("-R");
        result.add(getAttributeIndices());
        if (getInvertSelection()) {
            result.add("-V");
        }
        result.add("-tokenizer");
        result.add(Utils.toCommandLine(getTokenizer()));
        result.add("-model");
        result.add("" + getModel());
        if (!getAdditionalOptions().isEmpty()) {
            result.add("-additional");
            result.add(getAdditionalOptions());
        }
        if (getSuppressLabelPrefixes()) {
            result.add("-suppress-label-prefixes");
        }
        result.add("-regexp-labels");
        result.add(getRegExpLabels());
        result.add("-stopwords");
        result.add(Utils.toCommandLine(getStopwords()));
        Collections.addAll(result, super.getOptions());
        return result.toArray(new String[result.size()]);
    }

    /**
     * Sets the range of attributes to process.
     *
     * @param value the range string, e.g. {@code "first-3,5,6-10,last"}
     */
    public void setAttributeIndices(String value) {
        m_AttributeIndices.setRanges(value);
    }

    /**
     * Returns the range of attributes to process.
     *
     * @return the range string
     */
    public String getAttributeIndices() {
        return m_AttributeIndices.getRanges();
    }

    /**
     * Returns the tip text for the attribute indices property.
     *
     * @return the tip text
     */
    public String attributeIndicesTipText() {
        return "Specify range of attributes to act on; this is a comma separated list of attribute indices, with \"first\" and \"last\" valid values; specify an inclusive range with \"-\"; eg: \"first-3,5,6-10,last\".";
    }

    /**
     * Sets whether the attribute range is inverted.
     *
     * @param value true to invert the selection
     */
    public void setInvertSelection(boolean value) {
        m_AttributeIndices.setInvert(value);
    }

    /**
     * Returns whether the attribute range is inverted.
     *
     * @return true if the selection is inverted
     */
    public boolean getInvertSelection() {
        return m_AttributeIndices.getInvert();
    }

    /**
     * Returns the tip text for the invert selection property.
     *
     * @return the tip text
     */
    public String invertSelectionTipText() {
        return "If set to true, the selection will be inverted; eg: the attribute indices '2-4' then mean everything apart from '2-4'.";
    }

    /**
     * Sets the tokenizer applied to each sentence.
     *
     * @param value the tokenizer to use
     */
    public void setTokenizer(Tokenizer value) {
        m_Tokenizer = value;
    }

    /**
     * Returns the tokenizer applied to each sentence.
     *
     * @return the tokenizer in use
     */
    public Tokenizer getTokenizer() {
        return m_Tokenizer;
    }

    /**
     * Returns the tip text for the tokenizer property.
     *
     * @return the tip text
     */
    public String tokenizerTipText() {
        return "The tokenizing algorithm to use on the strings.";
    }

    /**
     * Sets the Stanford model file.
     *
     * @param value the model file
     */
    public void setModel(File value) {
        m_Model = value;
    }

    /**
     * Returns the Stanford model file.
     *
     * @return the model file
     */
    public File getModel() {
        return m_Model;
    }

    /**
     * Returns the tip text for the model property.
     *
     * @return the tip text
     */
    public String modelTipText() {
        return "The Stanford model file to use.";
    }

    /**
     * Sets the additional options passed to the parser when loading the model.
     *
     * @param value the options string
     */
    public void setAdditionalOptions(String value) {
        m_AdditionalOptions = value;
    }

    /**
     * Returns the additional options passed to the parser.
     *
     * @return the options string
     */
    public String getAdditionalOptions() {
        return m_AdditionalOptions;
    }

    /**
     * Returns the tip text for the additional options property.
     *
     * @return the tip text
     */
    public String additionalOptionsTipText() {
        return "The additional options for the stanford parser.";
    }

    /**
     * Sets whether the {@code label:} prefix is omitted from the output.
     *
     * @param value true to output only the words
     */
    public void setSuppressLabelPrefixes(boolean value) {
        m_SuppressLabelPrefixes = value;
    }

    /**
     * Returns whether the {@code label:} prefix is omitted from the output.
     *
     * @return true if only the words are output
     */
    public boolean getSuppressLabelPrefixes() {
        return m_SuppressLabelPrefixes;
    }

    /**
     * Returns the tip text for the suppress label prefixes property.
     *
     * @return the tip text
     */
    public String suppressLabelPrefixesTipText() {
        return "If set to true, the prefix labels (like 'NP' and 'VP') get suppressed.";
    }

    /**
     * Sets the regular expression that labels must match.
     *
     * @param value the expression; {@code null} or empty is replaced by {@code ".*"}
     */
    public void setRegExpLabels(String value) {
        m_RegExpLabels = (value == null || value.isEmpty()) ? ".*" : value;
    }

    /**
     * Returns the regular expression that labels must match.
     *
     * @return the expression
     */
    public String getRegExpLabels() {
        return m_RegExpLabels;
    }

    /**
     * Returns the tip text for the label regular expression property.
     *
     * @return the tip text
     */
    public String regExpLabelsTipText() {
        return "The regular expression that prefix labels must match in order for their associated words to make it into the output.";
    }

    /**
     * Sets the stopwords handler.
     *
     * @param value the handler to use
     */
    public void setStopwords(StopwordsHandler value) {
        m_Stopwords = value;
    }

    /**
     * Returns the stopwords handler.
     *
     * @return the handler in use
     */
    public StopwordsHandler getStopwords() {
        return m_Stopwords;
    }

    /**
     * Returns the tip text for the stopwords property.
     *
     * @return the tip text
     */
    public String stopwordsTipText() {
        return "The stopwrods algorithm to apply after the parsing.";
    }

    /**
     * Returns the capabilities of this filter: all attribute and class types,
     * missing values, missing class values and data without a class.
     *
     * @return the capabilities
     */
    public Capabilities getCapabilities() {
        Capabilities result = super.getCapabilities();
        result.disableAll();
        result.enableAllAttributes();
        result.enable(Capabilities.Capability.MISSING_VALUES);
        result.enableAllClasses();
        result.enable(Capabilities.Capability.MISSING_CLASS_VALUES);
        result.enable(Capabilities.Capability.NO_CLASS);
        return result;
    }

    /**
     * Resets the filter and discards the loaded parser.
     */
    protected void reset() {
        super.reset();
        m_Parser = null;
    }

    /**
     * Validates the model file, loads the parser from it and fixes the upper
     * bound of the attribute range. The output format is an empty copy of the
     * input format.
     *
     * @param inputFormat the format of the incoming data
     * @return the output format
     * @throws IllegalStateException if the model file is missing or is a directory
     * @throws Exception if the additional options cannot be split or the model fails to load
     */
    protected Instances determineOutputFormat(Instances inputFormat) throws Exception {
        if (!m_Model.exists()) {
            throw new IllegalStateException("Model file does not exist: " + m_Model);
        }
        if (m_Model.isDirectory()) {
            throw new IllegalStateException("Model file points to a directory: " + m_Model);
        }
        m_Parser = LexicalizedParser.loadModel(
            m_Model.getAbsolutePath(), Utils.splitOptions(m_AdditionalOptions));
        m_AttributeIndices.setUpper(inputFormat.numAttributes() - 1);
        return new Instances(inputFormat, 0);
    }

    /**
     * Truncates a string for display purposes.
     *
     * @param s the string to shorten
     * @param max the maximum number of characters to keep
     * @return {@code s} unchanged if it has at most {@code max} characters,
     *         otherwise its first {@code max} characters followed by {@code "..."}
     */
    protected String shorten(String s, int max) {
        return (s.length() > max) ? s.substring(0, max) + "..." : s;
    }

    /**
     * Prints the message to stdout, prefixed with the class name, if debugging is enabled.
     *
     * @param msg the message to print
     */
    protected void debug(String msg) {
        if (getDebug()) {
            System.out.println(getClass().getName() + ": " + msg);
        }
    }

    /**
     * Walks the parse tree depth-first and appends every accepted leaf word to
     * the buffer. A leaf's label is the label of its direct parent node. A word
     * is skipped if the stopwords handler rejects it or if a pattern is given
     * and the label does not match it. Accepted words are separated by a blank.
     *
     * @param parentTree the (sub-)tree to traverse
     * @param content the buffer collecting the output
     * @param pattern the label pattern, or {@code null} to accept all labels
     */
    protected void traverseTree(Tree parentTree, StringBuilder content, Pattern pattern) {
        Tree[] children = parentTree.children();
        for (Tree childTree : children) {
            if (!childTree.isLeaf()) {
                traverseTree(childTree, content, pattern);
                continue;
            }
            String label = parentTree.label().value();
            String word = childTree.label().value();
            if (m_Stopwords.isStopword(word)
                || (pattern != null && !pattern.matcher(label).matches())) {
                continue;
            }
            if (content.length() > 0) {
                content.append(" ");
            }
            if (!m_SuppressLabelPrefixes) {
                content.append(label).append(":");
            }
            content.append(word);
        }
    }

    /**
     * Splits a document into sentences using the Stanford document
     * preprocessor and the shared tokenizer factory.
     *
     * @param doc the text to split
     * @return the sentences, each re-joined with its original whitespace
     */
    @SuppressWarnings({"rawtypes", "unchecked"}) // raw List: sentence element type is fixed by the shared raw factory
    protected List<String> getSentences(String doc) {
        List<String> result = new ArrayList<String>();
        DocumentPreprocessor preProcessor = new DocumentPreprocessor(new StringReader(doc));
        preProcessor.setTokenizerFactory(getTokenizerFactory());
        for (List sentence : preProcessor) {
            result.add(StringUtils.joinWithOriginalWhiteSpace(sentence));
        }
        return result;
    }

    /**
     * Processes a single instance. Attributes that are not strings or not in
     * the selected range are copied unchanged; missing values stay missing.
     * Every other string value is split into sentences, each sentence is
     * tokenized and parsed, and the accepted words of all sentences are
     * concatenated, with {@code ". "} appended after each sentence.
     *
     * @param instance the instance to process
     * @return a new instance with the same weight and the tagged string values
     * @throws Exception if the label expression is invalid or parsing fails
     */
    protected Instance process(Instance instance) throws Exception {
        int numAttributes = instance.numAttributes();
        double[] values = new double[numAttributes];
        double[] current = instance.toDoubleArray();
        // ".*" accepts everything, so the matching step is skipped altogether.
        Pattern pattern = m_RegExpLabels.equals(".*") ? null : Pattern.compile(m_RegExpLabels);
        List<String> words = new ArrayList<String>();

        // Every attribute must be visited: a skipped slot would keep the array
        // default of 0.0 and silently overwrite the original value.
        for (int i = 0; i < numAttributes; i++) {
            if (!instance.attribute(i).isString() || !m_AttributeIndices.isInRange(i)) {
                values[i] = current[i];
                continue;
            }
            if (instance.isMissing(i)) {
                values[i] = Utils.missingValue();
                continue;
            }

            String text = instance.stringValue(i);
            List<String> sentences = getSentences(text);
            if (getDebug()) {
                debug(sentences.size() + " sentence(s) [" + text.length() + "]: " + shorten(text, 40));
            }

            StringBuilder tagged = new StringBuilder();
            for (String sentence : sentences) {
                if (getDebug()) {
                    debug("    " + shorten(sentence, 30) + " [" + sentence.length() + "]");
                }
                words.clear();
                m_Tokenizer.tokenize(sentence);
                while (m_Tokenizer.hasMoreElements()) {
                    words.add(m_Tokenizer.nextElement());
                }
                Tree tree = m_Parser.apply(Sentence.toWordList(words.toArray(new String[words.size()])));
                traverseTree(tree, tagged, pattern);
                tagged.append(". ");
            }
            values[i] = getOutputFormat().attribute(i).addStringValue(tagged.toString().trim());
        }
        return new DenseInstance(instance.weight(), values);
    }

    /**
     * Returns the revision string.
     *
     * @return the revision
     */
    public String getRevision() {
        return RevisionUtils.extract("$Revision: -1 $");
    }

    /**
     * Returns the shared PTB tokenizer factory, creating it on first use.
     * Synchronized so that the static factory is created only once.
     *
     * @return the tokenizer factory
     */
    protected static synchronized TokenizerFactory getTokenizerFactory() {
        if (m_TokenizerFactory == null) {
            m_TokenizerFactory = PTBTokenizer.factory(
                new CoreLabelTokenFactory(),
                "normalizeParentheses=false,normalizeOtherBrackets=false,invertible=true");
        }
        return m_TokenizerFactory;
    }

    /**
     * Runs the filter from the command line.
     *
     * @param args the command-line arguments
     */
    public static void main(String[] args) {
        runFilter(new PartOfSpeechTagging(), args);
    }
}

