/*
 * Decompiled with CFR 0.152.
 */
package weka.core.tokenizers;

import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Vector;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.WekaOptionUtils;
import weka.core.tokenizers.Tokenizer;
import weka.core.tokenizers.WordTokenizer;
import weka.core.tokenizers.cleaners.PassThrough;
import weka.core.tokenizers.cleaners.TokenCleaner;

/**
 * Tokenizer that cleans tokens before the actual tokenization takes place.
 *
 * <p>Processing performed by {@link #tokenize(String)}:
 * <ol>
 *   <li>the pre-tokenizer splits the input into tokens,</li>
 *   <li>each token is passed through the token cleaner,</li>
 *   <li>the tokens the cleaner did not reject ({@code null}) are joined
 *       back into one string, separated by single blanks,</li>
 *   <li>the post-tokenizer tokenizes that string; its tokens are the ones
 *       this tokenizer hands out.</li>
 * </ol>
 */
public class PreCleanedTokenizer
extends Tokenizer {
    /** For serialization. */
    private static final long serialVersionUID = 6094968316580671771L;

    /** Option name for the pre-tokenizer. */
    public static final String PRE_TOKENIZER = "pre-tokenizer";

    /** Option name for the token cleaner. */
    public static final String CLEANER = "cleaner";

    /** Option name for the post-tokenizer. */
    public static final String POST_TOKENIZER = "post-tokenizer";

    /** Tokenizer producing the tokens that get cleaned. */
    protected Tokenizer m_PreTokenizer = getDefaultPreTokenizer();

    /** Cleaner applied to every token of the pre-tokenizer. */
    protected TokenCleaner m_Cleaner = getDefaultCleaner();

    /** Tokenizer applied to the re-assembled, cleaned string. */
    protected Tokenizer m_PostTokenizer = getDefaultPostTokenizer();

    /**
     * Returns a description of this tokenizer.
     *
     * @return the description text
     */
    public String globalInfo() {
        return "Allows the cleaning of tokens before actual tokenization.\n"
            + "Process:\n"
            + "- apply pre-tokenizer to obtain tokens to clean\n"
            + "- apply token cleaner to tokens\n"
            + "- combine cleaned tokens back into a string (separated by blanks)\n"
            + "- apply post-tokenizer to produce final set of tokens";
    }

    /**
     * Lists the options of this tokenizer (pre-tokenizer, cleaner,
     * post-tokenizer) followed by the options of the superclass.
     *
     * @return an enumeration over all options
     */
    public Enumeration listOptions() {
        Vector options = new Vector();

        String preTip = preTokenizerTipText();
        String preDefault = getDefaultPreTokenizer().getClass().getName();
        WekaOptionUtils.addOption(options, preTip, preDefault, PRE_TOKENIZER);

        String cleanerTip = cleanerTipText();
        String cleanerDefault = getDefaultCleaner().getClass().getName();
        WekaOptionUtils.addOption(options, cleanerTip, cleanerDefault, CLEANER);

        String postTip = postTokenizerTipText();
        String postDefault = getDefaultPostTokenizer().getClass().getName();
        WekaOptionUtils.addOption(options, postTip, postDefault, POST_TOKENIZER);

        WekaOptionUtils.add(options, super.listOptions());
        return WekaOptionUtils.toEnumeration(options);
    }

    /**
     * Parses the pre-tokenizer, cleaner and post-tokenizer from the given
     * options (each with its default instance as fallback argument) and then
     * hands the options to the superclass.
     *
     * @param options the options to parse
     * @throws Exception if parsing the options fails
     */
    public void setOptions(String[] options) throws Exception {
        Tokenizer pre = (Tokenizer)WekaOptionUtils.parse(options, PRE_TOKENIZER, (OptionHandler)getDefaultPreTokenizer());
        setPreTokenizer(pre);

        TokenCleaner cleaner = (TokenCleaner)WekaOptionUtils.parse(options, CLEANER, (OptionHandler)getDefaultCleaner());
        setCleaner(cleaner);

        Tokenizer post = (Tokenizer)WekaOptionUtils.parse(options, POST_TOKENIZER, (OptionHandler)getDefaultPostTokenizer());
        setPostTokenizer(post);

        super.setOptions(options);
    }

    /**
     * Returns the current option settings: pre-tokenizer, cleaner and
     * post-tokenizer, followed by the options of the superclass.
     *
     * @return the options as string array
     */
    public String[] getOptions() {
        ArrayList<String> options = new ArrayList<String>();
        WekaOptionUtils.add(options, PRE_TOKENIZER, (OptionHandler)getPreTokenizer());
        WekaOptionUtils.add(options, CLEANER, (OptionHandler)getCleaner());
        WekaOptionUtils.add(options, POST_TOKENIZER, (OptionHandler)getPostTokenizer());
        WekaOptionUtils.add(options, super.getOptions());
        return WekaOptionUtils.toArray(options);
    }

    /**
     * Returns the default pre-tokenizer.
     *
     * @return a new {@link WordTokenizer} instance
     */
    protected Tokenizer getDefaultPreTokenizer() {
        return new WordTokenizer();
    }

    /**
     * Sets the tokenizer used for the initial tokenization.
     *
     * @param value the pre-tokenizer
     */
    public void setPreTokenizer(Tokenizer value) {
        m_PreTokenizer = value;
    }

    /**
     * Returns the tokenizer used for the initial tokenization.
     *
     * @return the pre-tokenizer
     */
    public Tokenizer getPreTokenizer() {
        return m_PreTokenizer;
    }

    /**
     * Returns the tip text for the pre-tokenizer property.
     *
     * @return the tip text
     */
    public String preTokenizerTipText() {
        return "The tokenizer to use for the initial tokenization (before cleaning).";
    }

    /**
     * Returns the default token cleaner.
     *
     * @return a new {@link PassThrough} instance
     */
    protected TokenCleaner getDefaultCleaner() {
        return new PassThrough();
    }

    /**
     * Sets the cleaner applied to the tokens of the pre-tokenizer.
     *
     * @param value the cleaner
     */
    public void setCleaner(TokenCleaner value) {
        m_Cleaner = value;
    }

    /**
     * Returns the cleaner applied to the tokens of the pre-tokenizer.
     *
     * @return the cleaner
     */
    public TokenCleaner getCleaner() {
        return m_Cleaner;
    }

    /**
     * Returns the tip text for the cleaner property.
     *
     * @return the tip text
     */
    public String cleanerTipText() {
        return "The cleaner to use for cleaning the tokens from the initial tokenization.";
    }

    /**
     * Returns the default post-tokenizer.
     *
     * @return a new {@link WordTokenizer} instance
     */
    protected Tokenizer getDefaultPostTokenizer() {
        return new WordTokenizer();
    }

    /**
     * Sets the tokenizer used for the final tokenization.
     *
     * @param value the post-tokenizer
     */
    public void setPostTokenizer(Tokenizer value) {
        m_PostTokenizer = value;
    }

    /**
     * Returns the tokenizer used for the final tokenization.
     *
     * @return the post-tokenizer
     */
    public Tokenizer getPostTokenizer() {
        return m_PostTokenizer;
    }

    /**
     * Returns the tip text for the post-tokenizer property.
     *
     * @return the tip text
     */
    public String postTokenizerTipText() {
        return "The tokenizer to use for the final tokenization (after cleaning).";
    }

    /**
     * Checks for further tokens; the answer comes straight from the
     * post-tokenizer.
     *
     * @return whatever the post-tokenizer reports
     */
    public boolean hasMoreElements() {
        return m_PostTokenizer.hasMoreElements();
    }

    /**
     * Returns the next token; the token comes straight from the
     * post-tokenizer.
     *
     * @return the next token of the post-tokenizer
     */
    public String nextElement() {
        return m_PostTokenizer.nextElement();
    }

    /**
     * Tokenizes the string: pre-tokenizes it, cleans every token, joins the
     * surviving tokens with single blanks and feeds the result to the
     * post-tokenizer.
     *
     * @param s the string to tokenize
     */
    public void tokenize(String s) {
        m_PreTokenizer.tokenize(s);

        StringBuilder joined = new StringBuilder();
        while (m_PreTokenizer.hasMoreElements()) {
            String cleaned = m_Cleaner.clean(m_PreTokenizer.nextElement());
            // a null result means the cleaner rejected the token
            if (cleaned != null) {
                if (joined.length() > 0) {
                    joined.append(" ");
                }
                joined.append(cleaned);
            }
        }

        m_PostTokenizer.tokenize(joined.toString());
    }

    /**
     * Returns the revision string.
     *
     * @return the revision extracted by {@link RevisionUtils}
     */
    public String getRevision() {
        return RevisionUtils.extract("$Revision: 10824 $");
    }
}

