/*
 * Decompiled with CFR 0.152.
 */
package weka.filters.unsupervised.attribute;

import java.io.File;
import java.io.Serializable;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.Vector;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.SparseInstance;
import weka.core.Stopwords;
import weka.core.Tag;
import weka.core.Utils;
import weka.core.stemmers.NullStemmer;
import weka.core.stemmers.Stemmer;
import weka.core.tokenizers.Tokenizer;
import weka.core.tokenizers.WordTokenizer;
import weka.filters.Filter;
import weka.filters.UnsupervisedFilter;

public class StringToWordVector
extends Filter
implements UnsupervisedFilter,
OptionHandler {
    /** Explicit serialization version identifier of this filter. */
    static final long serialVersionUID = 8249106275278565424L;
    /** Range of attributes the filter works on; initially every attribute ("first-last"). */
    protected Range m_SelectedRange = new Range("first-last");
    /** Maps each kept word to the index of its attribute in the output format. */
    private TreeMap m_Dictionary = new TreeMap();
    /** If true, word counts are output instead of 0/1 presence indicators. */
    private boolean m_OutputCounts = false;
    /** Prefix put in front of every generated word attribute name. */
    private String m_Prefix = "";
    /** Per output attribute index: number of first-batch documents containing that word. */
    private int[] m_DocsCounts;
    /** Number of instances in the first batch; -1 until the dictionary has been built. */
    private int m_NumInstances = -1;
    /** Average length of the first-batch document vectors; -1.0 until computed. */
    private double m_AvgDocLength = -1.0;
    /** Approximate number of words to keep (per dictionary). */
    private int m_WordsToKeep = 1000;
    /** Periodic pruning rate as a percentage of the dataset; values that round to zero or less disable it. */
    private double m_PeriodicPruningRate = -1.0;
    /** If true, word frequencies are transformed into log(1 + frequency). */
    private boolean m_TFTransform;
    /** Selected document-length normalization mode; one of the FILTER_* constants. */
    protected int m_filterType = 0;
    /** Normalization mode: do not normalize. */
    public static final int FILTER_NONE = 0;
    /** Normalization mode: normalize all data. */
    public static final int FILTER_NORMALIZE_ALL = 1;
    /** Normalization mode: normalize test data only. */
    public static final int FILTER_NORMALIZE_TEST_ONLY = 2;
    /** Tags describing the available normalization modes. */
    public static final Tag[] TAGS_FILTER = new Tag[]{new Tag(0, "No normalization"), new Tag(1, "Normalize all data"), new Tag(2, "Normalize test data only")};
    /** If true, word frequencies are multiplied by log(number of documents / documents containing the word). */
    private boolean m_IDFTransform;
    /** If true, tokens are lower-cased before the dictionary lookup or insertion. */
    private boolean m_lowerCaseTokens;
    /** If true, words found in the stoplist are ignored. */
    private boolean m_useStoplist;
    /** Stemmer applied to every token; a NullStemmer by default. */
    private Stemmer m_Stemmer = new NullStemmer();
    /** Minimum term frequency a word needs to be kept. */
    private int m_minTermFreq = 1;
    /** If true, word limits are applied over all documents instead of per class value. */
    private boolean m_doNotOperateOnPerClassBasis = false;
    /** Stopwords file; when this points to a directory no stopwords file is read. */
    private File m_Stopwords = new File(System.getProperty("user.dir"));
    /** Tokenizer used to split string values into words. */
    private Tokenizer m_Tokenizer = new WordTokenizer();

    /**
     * Creates a filter whose options all keep their field defaults.
     */
    public StringToWordVector() {
        super();
    }

    /**
     * Describes every command-line option understood by this filter.
     *
     * @return an enumeration of {@link Option} objects, in a fixed order
     */
    @Override
    public Enumeration listOptions() {
        Vector<Option> opts = new Vector<Option>();

        opts.add(new Option("\tOutput word counts rather than boolean word presence.\n", "C", 0, "-C"));
        opts.add(new Option("\tSpecify list of string attributes to convert to words (as weka Range).\n\t(default: select all string attributes)", "R", 1, "-R <index1,index2-index4,...>"));
        opts.add(new Option("\tInvert matching sense of column indexes.", "V", 0, "-V"));
        opts.add(new Option("\tSpecify a prefix for the created attribute names.\n\t(default: \"\")", "P", 1, "-P <attribute name prefix>"));
        opts.add(new Option("\tSpecify approximate number of word fields to create.\n\tSurplus words will be discarded..\n\t(default: 1000)", "W", 1, "-W <number of words to keep>"));
        opts.add(new Option("\tSpecify the rate (e.g., every 10% of the input dataset) at which to periodically prune the dictionary.\n\t-W prunes after creating a full dictionary. You may not have enough memory for this approach.\n\t(default: no periodic pruning)", "prune-rate", 1, "-prune-rate <rate as a percentage of dataset>"));
        opts.add(new Option("\tTransform the word frequencies into log(1+fij)\n\twhere fij is the frequency of word i in jth document(instance).\n", "T", 0, "-T"));
        opts.add(new Option("\tTransform each word frequency into:\n\tfij*log(num of Documents/num of documents containing word i)\n\t  where fij if frequency of word i in jth document(instance)", "I", 0, "-I"));
        opts.add(new Option("\tWhether to 0=not normalize/1=normalize all data/2=normalize test data only\n\tto average length of training documents (default 0=don't normalize).", "N", 1, "-N"));
        opts.add(new Option("\tConvert all tokens to lowercase before adding to the dictionary.", "L", 0, "-L"));
        opts.add(new Option("\tIgnore words that are in the stoplist.", "S", 0, "-S"));
        opts.add(new Option("\tThe stemmering algorihtm (classname plus parameters) to use.", "stemmer", 1, "-stemmer <spec>"));
        opts.add(new Option("\tThe minimum term frequency (default = 1).", "M", 1, "-M <int>"));
        opts.add(new Option("\tIf this is set, the maximum number of words and the \n\tminimum term frequency is not enforced on a per-class \n\tbasis but based on the documents in all the classes \n\t(even if a class attribute is set).", "O", 0, "-O"));
        opts.add(new Option("\tA file containing stopwords to override the default ones.\n\tUsing this option automatically sets the flag ('-S') to use the\n\tstoplist if the file exists.\n\tFormat: one stopword per line, lines starting with '#'\n\tare interpreted as comments and ignored.", "stopwords", 1, "-stopwords <file>"));
        opts.add(new Option("\tThe tokenizing algorihtm (classname plus parameters) to use.\n\t(default: " + WordTokenizer.class.getName() + ")", "tokenizer", 1, "-tokenizer <spec>"));

        return opts.elements();
    }

    /**
     * Configures the filter from a command-line style option array. Every option that is
     * absent is reset to its default value.
     *
     * @param options the options to parse
     * @throws Exception if a value cannot be parsed or a stemmer/tokenizer specification is
     *         empty or cannot be instantiated
     */
    @Override
    public void setOptions(String[] options) throws Exception {
        String rangeSpec = Utils.getOption('R', options);
        setSelectedRange(rangeSpec.length() == 0 ? "first-last" : rangeSpec);
        setInvertSelection(Utils.getFlag('V', options));

        String prefix = Utils.getOption('P', options);
        setAttributeNamePrefix(prefix.length() == 0 ? "" : prefix);

        String wordsSpec = Utils.getOption('W', options);
        setWordsToKeep(wordsSpec.length() == 0 ? 1000 : Integer.parseInt(wordsSpec));

        String pruneSpec = Utils.getOption("prune-rate", options);
        setPeriodicPruning(pruneSpec.length() > 0 ? Double.parseDouble(pruneSpec) : -1.0);

        String minFreqSpec = Utils.getOption('M', options);
        setMinTermFreq(minFreqSpec.length() == 0 ? 1 : Integer.parseInt(minFreqSpec));

        setOutputWordCounts(Utils.getFlag('C', options));
        setTFTransform(Utils.getFlag('T', options));
        setIDFTransform(Utils.getFlag('I', options));
        setDoNotOperateOnPerClassBasis(Utils.getFlag('O', options));

        String normSpec = Utils.getOption('N', options);
        int normMode = normSpec.length() == 0 ? 0 : Integer.parseInt(normSpec);
        setNormalizeDocLength(new SelectedTag(normMode, TAGS_FILTER));

        setLowerCaseTokens(Utils.getFlag('L', options));
        setUseStoplist(Utils.getFlag('S', options));

        // Stemmer: "<classname> [options]"; no specification means the null stemmer.
        String stemmerString = Utils.getOption("stemmer", options);
        if (stemmerString.length() != 0) {
            String[] stemmerSpec = Utils.splitOptions(stemmerString);
            if (stemmerSpec.length == 0) {
                throw new Exception("Invalid stemmer specification string");
            }
            String stemmerClass = stemmerSpec[0];
            stemmerSpec[0] = "";
            Stemmer stemmer = (Stemmer) Class.forName(stemmerClass).newInstance();
            if (stemmer instanceof OptionHandler) {
                ((OptionHandler) stemmer).setOptions(stemmerSpec);
            }
            setStemmer(stemmer);
        } else {
            setStemmer(null);
        }

        // Parsed after -S on purpose: an existing stopwords file switches the stoplist on.
        String stopwordsPath = Utils.getOption("stopwords", options);
        setStopwords(stopwordsPath.length() == 0 ? null : new File(stopwordsPath));

        // Tokenizer: "<classname> [options]"; no specification means a WordTokenizer.
        String tokenizerString = Utils.getOption("tokenizer", options);
        if (tokenizerString.length() != 0) {
            String[] tokenizerSpec = Utils.splitOptions(tokenizerString);
            if (tokenizerSpec.length == 0) {
                throw new Exception("Invalid tokenizer specification string");
            }
            String tokenizerClass = tokenizerSpec[0];
            tokenizerSpec[0] = "";
            Tokenizer tokenizer = (Tokenizer) Class.forName(tokenizerClass).newInstance();
            if (tokenizer instanceof OptionHandler) {
                tokenizer.setOptions(tokenizerSpec);
            }
            setTokenizer(tokenizer);
        } else {
            setTokenizer(new WordTokenizer());
        }
    }

    /**
     * Returns the current configuration as a command-line style option array.
     *
     * @return the options describing the current settings
     */
    @Override
    public String[] getOptions() {
        Vector<String> opts = new Vector<String>();

        opts.add("-R");
        opts.add(getSelectedRange().getRanges());
        if (getInvertSelection()) {
            opts.add("-V");
        }

        String prefix = getAttributeNamePrefix();
        if (!"".equals(prefix)) {
            opts.add("-P");
            opts.add(prefix);
        }

        opts.add("-W");
        opts.add(String.valueOf(getWordsToKeep()));
        opts.add("-prune-rate");
        opts.add(String.valueOf(getPeriodicPruning()));

        if (getOutputWordCounts()) {
            opts.add("-C");
        }
        if (getTFTransform()) {
            opts.add("-T");
        }
        if (getIDFTransform()) {
            opts.add("-I");
        }

        opts.add("-N");
        opts.add(String.valueOf(m_filterType));

        if (getLowerCaseTokens()) {
            opts.add("-L");
        }
        if (getUseStoplist()) {
            opts.add("-S");
        }

        Stemmer stemmer = getStemmer();
        if (stemmer != null) {
            opts.add("-stemmer");
            String stemmerSpec = stemmer.getClass().getName();
            if (stemmer instanceof OptionHandler) {
                stemmerSpec = stemmerSpec + " " + Utils.joinOptions(((OptionHandler) stemmer).getOptions());
            }
            opts.add(stemmerSpec.trim());
        }

        opts.add("-M");
        opts.add(String.valueOf(getMinTermFreq()));
        if (getDoNotOperateOnPerClassBasis()) {
            opts.add("-O");
        }

        // A directory stands for "no stopwords file configured".
        File stopwordsFile = getStopwords();
        if (!stopwordsFile.isDirectory()) {
            opts.add("-stopwords");
            opts.add(stopwordsFile.getAbsolutePath());
        }

        opts.add("-tokenizer");
        Tokenizer tokenizer = getTokenizer();
        String tokenizerSpec = tokenizer.getClass().getName();
        if (tokenizer instanceof OptionHandler) {
            tokenizerSpec = tokenizerSpec + " " + Utils.joinOptions(tokenizer.getOptions());
        }
        opts.add(tokenizerSpec.trim());

        return opts.toArray(new String[opts.size()]);
    }

    /**
     * Creates a filter that tries to keep the given number of words.
     *
     * @param wordsToKeep the number of words to attempt to keep
     */
    public StringToWordVector(int wordsToKeep) {
        m_WordsToKeep = wordsToKeep;
    }

    /**
     * Returns the capabilities of this filter: all attribute types and all class types
     * (including no class), with missing values allowed in both.
     *
     * @return the capabilities of this filter
     */
    @Override
    public Capabilities getCapabilities() {
        Capabilities caps = super.getCapabilities();
        caps.disableAll();

        // attributes
        caps.enableAllAttributes();
        caps.enable(Capabilities.Capability.MISSING_VALUES);

        // class
        caps.enableAllClasses();
        caps.enable(Capabilities.Capability.MISSING_CLASS_VALUES);
        caps.enable(Capabilities.Capability.NO_CLASS);

        return caps;
    }

    /**
     * Sets the format of the input instances and resets the statistics gathered from a
     * previous first batch.
     *
     * @param instanceInfo the structure of the incoming instances
     * @return always false: the output format is only known after the first batch
     * @throws Exception if the superclass rejects the format
     */
    @Override
    public boolean setInputFormat(Instances instanceInfo) throws Exception {
        super.setInputFormat(instanceInfo);
        int lastIndex = instanceInfo.numAttributes() - 1;
        m_SelectedRange.setUpper(lastIndex);
        m_AvgDocLength = -1.0;
        m_NumInstances = -1;
        return false;
    }

    /**
     * Feeds one instance to the filter. During the first batch the instance is only
     * buffered; afterwards it is converted immediately with the existing dictionary.
     *
     * @param instance the instance to process
     * @return true if a converted instance is available for collection
     * @throws IllegalStateException if no input format has been defined
     * @throws Exception if normalization fails
     */
    @Override
    public boolean input(Instance instance) throws Exception {
        if (getInputFormat() == null) {
            throw new IllegalStateException("No input instance format defined");
        }
        if (m_NewBatch) {
            resetQueue();
            m_NewBatch = false;
        }

        if (!isFirstBatchDone()) {
            // The dictionary is not built yet: keep the instance for batchFinished().
            bufferInput(instance);
            return false;
        }

        FastVector holder = new FastVector();
        int firstCopy = convertInstancewoDocNorm(instance, holder);
        Instance converted = (Instance) holder.elementAt(0);
        if (m_filterType != 0) {
            normalizeInstance(converted, firstCopy);
        }
        push(converted);
        return true;
    }

    /**
     * Signals the end of a batch. For the first batch this builds the dictionary, converts
     * all buffered instances, computes the average document length when normalization is
     * enabled, normalizes the instances when all data is to be normalized, and pushes the
     * results to the output queue.
     *
     * @return true if there are instances pending output
     * @throws IllegalStateException if no input format has been defined
     * @throws Exception if conversion or normalization fails
     */
    @Override
    public boolean batchFinished() throws Exception {
        if (getInputFormat() == null) {
            throw new IllegalStateException("No input instance format defined");
        }

        if (!isFirstBatchDone()) {
            determineDictionary();

            // Convert every buffered instance without document-length normalization.
            FastVector converted = new FastVector();
            int firstCopy = 0;
            for (int n = 0; n < m_NumInstances; n++) {
                firstCopy = convertInstancewoDocNorm(getInputFormat().instance(n), converted);
            }

            if (m_filterType != 0) {
                // Average Euclidean length over the word attributes (index >= firstCopy).
                m_AvgDocLength = 0.0;
                for (int n = 0; n < converted.size(); n++) {
                    Instance doc = (Instance) converted.elementAt(n);
                    double squaredLength = 0.0;
                    for (int k = 0; k < doc.numValues(); k++) {
                        if (doc.index(k) >= firstCopy) {
                            squaredLength += doc.valueSparse(k) * doc.valueSparse(k);
                        }
                    }
                    m_AvgDocLength += Math.sqrt(squaredLength);
                }
                m_AvgDocLength /= (double) m_NumInstances;
            }

            if (m_filterType == 1) {
                for (int n = 0; n < converted.size(); n++) {
                    normalizeInstance((Instance) converted.elementAt(n), firstCopy);
                }
            }

            for (int n = 0; n < converted.size(); n++) {
                push((Instance) converted.elementAt(n));
            }
        }

        flushInput();
        m_NewBatch = true;
        m_FirstBatchDone = true;
        return numPendingOutput() != 0;
    }

    /**
     * Returns a short description of this filter.
     *
     * @return the description text
     */
    public String globalInfo() {
        return "Converts String attributes into a set of attributes representing word occurrence (depending on the tokenizer) information from the text contained in the strings. The set of words (attributes) is determined by the first batch filtered (typically training data).";
    }

    /**
     * @return true if word counts are output instead of 0/1 presence values
     */
    public boolean getOutputWordCounts() {
        return m_OutputCounts;
    }

    /**
     * @param outputWordCounts true to output word counts instead of 0/1 presence values
     */
    public void setOutputWordCounts(boolean outputWordCounts) {
        m_OutputCounts = outputWordCounts;
    }

    /**
     * @return the tip text for the outputWordCounts property
     */
    public String outputWordCountsTipText() {
        return "Output word counts rather than boolean 0 or 1(indicating presence or absence of a word).";
    }

    /**
     * @return the range object describing the attributes the filter works on
     */
    public Range getSelectedRange() {
        return m_SelectedRange;
    }

    /**
     * Replaces the current range object with a new one built from the given range string.
     *
     * @param newSelectedRange the range specification, e.g. "first-last"
     */
    public void setSelectedRange(String newSelectedRange) {
        Range replacement = new Range(newSelectedRange);
        m_SelectedRange = replacement;
    }

    /**
     * @return the tip text for the attributeIndices property
     */
    public String attributeIndicesTipText() {
        return "Specify range of attributes to act on. This is a comma separated list of attribute indices, with \"first\" and \"last\" valid values. Specify an inclusive range with \"-\". E.g: \"first-3,5,6-10,last\".";
    }

    /**
     * @return the current attribute range as a range-list string
     */
    public String getAttributeIndices() {
        return m_SelectedRange.getRanges();
    }

    /**
     * Updates the existing range object with a new range list.
     *
     * @param rangeList the range list, e.g. "first-3,5,6-10,last"
     */
    public void setAttributeIndices(String rangeList) {
        m_SelectedRange.setRanges(rangeList);
    }

    /**
     * Sets the attributes to work on from an array of indices.
     *
     * @param attributes the attribute indices, converted with {@code Range.indicesToRangeList}
     */
    public void setAttributeIndicesArray(int[] attributes) {
        String rangeList = Range.indicesToRangeList(attributes);
        setAttributeIndices(rangeList);
    }

    /**
     * @return the tip text for the invertSelection property
     */
    public String invertSelectionTipText() {
        return "Set attribute selection mode. If false, only selected attributes in the range will be worked on; if true, only non-selected attributes will be processed.";
    }

    /**
     * @return true if the selected range is inverted
     */
    public boolean getInvertSelection() {
        return m_SelectedRange.getInvert();
    }

    /**
     * @param invert true to invert the sense of the selected range
     */
    public void setInvertSelection(boolean invert) {
        m_SelectedRange.setInvert(invert);
    }

    /**
     * @return the prefix used for generated attribute names
     */
    public String getAttributeNamePrefix() {
        return m_Prefix;
    }

    /**
     * @param newPrefix the prefix to put in front of generated attribute names
     */
    public void setAttributeNamePrefix(String newPrefix) {
        m_Prefix = newPrefix;
    }

    /**
     * @return the tip text for the attributeNamePrefix property
     */
    public String attributeNamePrefixTipText() {
        return "Prefix for the created attribute names. (default: \"\")";
    }

    /**
     * @return the number of words the filter attempts to keep
     */
    public int getWordsToKeep() {
        return m_WordsToKeep;
    }

    /**
     * @param newWordsToKeep the number of words the filter should attempt to keep
     */
    public void setWordsToKeep(int newWordsToKeep) {
        m_WordsToKeep = newWordsToKeep;
    }

    /**
     * @return the tip text for the wordsToKeep property
     */
    public String wordsToKeepTipText() {
        return "The number of words (per class if there is a class attribute assigned) to attempt to keep.";
    }

    /**
     * @return the periodic pruning rate, as a percentage of the dataset
     */
    public double getPeriodicPruning() {
        return m_PeriodicPruningRate;
    }

    /**
     * @param newPeriodicPruning the pruning rate as a percentage of the dataset
     */
    public void setPeriodicPruning(double newPeriodicPruning) {
        m_PeriodicPruningRate = newPeriodicPruning;
    }

    /**
     * @return the tip text for the periodicPruning property
     */
    public String periodicPruningTipText() {
        return "Specify the rate (x% of the input dataset) at which to periodically prune the dictionary. wordsToKeep prunes after creating a full dictionary. You may not have enough memory for this approach.";
    }

    /**
     * @return true if word frequencies are transformed into log(1 + frequency)
     */
    public boolean getTFTransform() {
        return m_TFTransform;
    }

    /**
     * @param TFTransform true to transform word frequencies into log(1 + frequency)
     */
    public void setTFTransform(boolean TFTransform) {
        m_TFTransform = TFTransform;
    }

    /**
     * @return the tip text for the TFTransform property
     */
    public String TFTransformTipText() {
        return "Sets whether if the word frequencies should be transformed into:\n    log(1+fij) \n       where fij is the frequency of word i in document (instance) j.";
    }

    /**
     * @return true if the inverse-document-frequency transform is applied
     */
    public boolean getIDFTransform() {
        return m_IDFTransform;
    }

    /**
     * @param IDFTransform true to apply the inverse-document-frequency transform
     */
    public void setIDFTransform(boolean IDFTransform) {
        m_IDFTransform = IDFTransform;
    }

    /**
     * @return the tip text for the IDFTransform property
     */
    public String IDFTransformTipText() {
        return "Sets whether if the word frequencies in a document should be transformed into: \n   fij*log(num of Docs/num of Docs with word i) \n      where fij is the frequency of word i in document (instance) j.";
    }

    /**
     * @return the current normalization mode wrapped as a tag of {@code TAGS_FILTER}
     */
    public SelectedTag getNormalizeDocLength() {
        SelectedTag current = new SelectedTag(m_filterType, TAGS_FILTER);
        return current;
    }

    /**
     * Sets the normalization mode. The tag is ignored unless it was built from
     * {@code TAGS_FILTER} (checked by reference).
     *
     * @param newType the tag selecting the normalization mode
     */
    public void setNormalizeDocLength(SelectedTag newType) {
        if (newType.getTags() != TAGS_FILTER) {
            return;
        }
        m_filterType = newType.getSelectedTag().getID();
    }

    /**
     * @return the tip text for the normalizeDocLength property
     */
    public String normalizeDocLengthTipText() {
        return "Sets whether if the word frequencies for a document (instance) should be normalized or not.";
    }

    /**
     * @return true if tokens are converted to lower case
     */
    public boolean getLowerCaseTokens() {
        return m_lowerCaseTokens;
    }

    /**
     * @param downCaseTokens true to convert tokens to lower case
     */
    public void setLowerCaseTokens(boolean downCaseTokens) {
        m_lowerCaseTokens = downCaseTokens;
    }

    /**
     * @return the tip text for the doNotOperateOnPerClassBasis property
     */
    public String doNotOperateOnPerClassBasisTipText() {
        return "If this is set, the maximum number of words and the minimum term frequency is not enforced on a per-class basis but based on the documents in all the classes (even if a class attribute is set).";
    }

    /**
     * @return true if word limits are applied over all documents rather than per class
     */
    public boolean getDoNotOperateOnPerClassBasis() {
        return m_doNotOperateOnPerClassBasis;
    }

    /**
     * @param newDoNotOperateOnPerClassBasis true to apply word limits over all documents
     *        rather than per class
     */
    public void setDoNotOperateOnPerClassBasis(boolean newDoNotOperateOnPerClassBasis) {
        m_doNotOperateOnPerClassBasis = newDoNotOperateOnPerClassBasis;
    }

    /**
     * @return the tip text for the minTermFreq property
     */
    public String minTermFreqTipText() {
        return "Sets the minimum term frequency. This is enforced on a per-class basis.";
    }

    /**
     * @return the minimum term frequency
     */
    public int getMinTermFreq() {
        return m_minTermFreq;
    }

    /**
     * @param newMinTermFreq the minimum term frequency
     */
    public void setMinTermFreq(int newMinTermFreq) {
        m_minTermFreq = newMinTermFreq;
    }

    /**
     * @return the tip text for the lowerCaseTokens property
     */
    public String lowerCaseTokensTipText() {
        return "If set then all the word tokens are converted to lower case before being added to the dictionary.";
    }

    /**
     * @return true if words on the stoplist are ignored
     */
    public boolean getUseStoplist() {
        return m_useStoplist;
    }

    /**
     * @param useStoplist true to ignore words on the stoplist
     */
    public void setUseStoplist(boolean useStoplist) {
        m_useStoplist = useStoplist;
    }

    /**
     * @return the tip text for the useStoplist property
     */
    public String useStoplistTipText() {
        return "Ignores all the words that are on the stoplist, if set to true.";
    }

    /**
     * Sets the stemmer; {@code null} selects a {@link NullStemmer}.
     *
     * @param value the stemmer to use, or null for none
     */
    public void setStemmer(Stemmer value) {
        if (value == null) {
            m_Stemmer = new NullStemmer();
        } else {
            m_Stemmer = value;
        }
    }

    /**
     * @return the stemmer currently in use
     */
    public Stemmer getStemmer() {
        return m_Stemmer;
    }

    /**
     * @return the tip text for the stemmer property
     */
    public String stemmerTipText() {
        return "The stemming algorithm to use on the words.";
    }

    /**
     * Sets the stopwords file. {@code null} is replaced by the current working directory.
     * If the resulting path is an existing regular file, the stoplist is switched on.
     *
     * @param value the stopwords file, or null
     */
    public void setStopwords(File value) {
        File target = value;
        if (target == null) {
            target = new File(System.getProperty("user.dir"));
        }
        m_Stopwords = target;
        boolean usableFile = target.exists() && target.isFile();
        if (usableFile) {
            setUseStoplist(true);
        }
    }

    /**
     * @return the configured stopwords file (a directory when none is set)
     */
    public File getStopwords() {
        return m_Stopwords;
    }

    /**
     * @return the tip text for the stopwords property
     */
    public String stopwordsTipText() {
        return "The file containing the stopwords (if this is a directory then the default ones are used).";
    }

    /**
     * @param value the tokenizer used to split strings into words
     */
    public void setTokenizer(Tokenizer value) {
        m_Tokenizer = value;
    }

    /**
     * @return the tokenizer currently in use
     */
    public Tokenizer getTokenizer() {
        return m_Tokenizer;
    }

    /**
     * @return the tip text for the tokenizer property
     */
    public String tokenizerTipText() {
        return "The tokenizing algorithm to use on the strings.";
    }

    /**
     * Sorts the given array in place into ascending order.
     *
     * <p>The previous hand-written shell sort was written for 1-based arrays and never
     * compared or moved {@code array[0]}, so the result was not fully sorted. The standard
     * library sort covers the whole array, including the empty and single-element cases.
     *
     * @param array the values to sort; must not be null
     */
    private static void sortArray(int[] array) {
        // Fully qualified so that no additional import is required.
        java.util.Arrays.sort(array);
    }

    /**
     * Narrows the selected range to the string attributes inside it. If no range object
     * exists, one covering all string attributes is created first.
     */
    private void determineSelectedRange() {
        Instances format = getInputFormat();
        int attCount = format.numAttributes();

        if (m_SelectedRange == null) {
            // No range at all: start from every string attribute (type code 2).
            StringBuffer allStrings = new StringBuffer();
            for (int att = 0; att < attCount; att++) {
                if (format.attribute(att).type() == 2) {
                    allStrings.append(att + 1 + ",");
                }
            }
            m_SelectedRange = new Range(allStrings.toString());
        }
        m_SelectedRange.setUpper(format.numAttributes() - 1);

        // Keep only those attributes of the range that are string attributes.
        StringBuffer selectedStrings = new StringBuffer();
        for (int att = 0; att < format.numAttributes(); att++) {
            if (m_SelectedRange.isInRange(att) && format.attribute(att).type() == 2) {
                selectedStrings.append(att + 1 + ",");
            }
        }
        m_SelectedRange.setRanges(selectedStrings.toString());
        m_SelectedRange.setUpper(format.numAttributes() - 1);
    }

    /**
     * Builds the word dictionary from the buffered first batch and defines the output format.
     *
     * <p>Steps: (1) optionally read the stopwords file; (2) tokenize every selected string
     * attribute of every buffered instance and collect term and document counts per word,
     * kept in one dictionary per class value unless per-class operation is disabled or no
     * class is set; (3) optionally drop words with a count of at most one at regular
     * intervals; (4) derive a minimum count per dictionary from {@code m_WordsToKeep} and
     * {@code m_minTermFreq}; (5) create the output attributes: copies of the non-converted
     * attributes followed by one attribute per surviving word; (6) store the per-word
     * document counts, the dictionary and the number of instances.
     */
    private void determineDictionary() {
        Stopwords stopwords = new Stopwords();
        if (this.getUseStoplist()) {
            try {
                if (this.getStopwords().exists() && !this.getStopwords().isDirectory()) {
                    stopwords.read(this.getStopwords());
                }
            }
            catch (Exception e) {
                // Best effort: an unreadable stopwords file is reported but does not abort.
                e.printStackTrace();
            }
        }

        // One dictionary per class value, or a single one when not operating per class.
        int classInd = this.getInputFormat().classIndex();
        int values = 1;
        if (!this.m_doNotOperateOnPerClassBasis && classInd != -1) {
            values = this.getInputFormat().attribute(classInd).numValues();
        }
        TreeMap[] dictionaryArr = new TreeMap[values];
        for (int i = 0; i < values; ++i) {
            dictionaryArr[i] = new TreeMap();
        }

        this.determineSelectedRange();

        // Number of instances between two pruning passes; <= 0 disables periodic pruning.
        long pruneRate = Math.round(this.m_PeriodicPruningRate / 100.0 * (double)this.getInputFormat().numInstances());
        for (int i = 0; i < this.getInputFormat().numInstances(); ++i) {
            Instance instance = this.getInputFormat().instance(i);
            int vInd = 0;
            if (!this.m_doNotOperateOnPerClassBasis && classInd != -1) {
                vInd = (int)instance.classValue();
            }

            // Distinct words of this document, used to update document counts once per word.
            Hashtable<String, Integer> h = new Hashtable<String, Integer>();
            for (int j = 0; j < instance.numAttributes(); ++j) {
                if (!this.m_SelectedRange.isInRange(j) || instance.isMissing(j)) continue;
                this.m_Tokenizer.tokenize(instance.stringValue(j));
                while (this.m_Tokenizer.hasMoreElements()) {
                    String word = ((String)this.m_Tokenizer.nextElement()).intern();
                    if (this.m_lowerCaseTokens) {
                        word = word.toLowerCase();
                    }
                    word = this.m_Stemmer.stem(word);
                    if (this.m_useStoplist && stopwords.is(word)) continue;
                    // containsKey, not contains: Hashtable.contains looks at the values.
                    if (!h.containsKey(word)) {
                        h.put(word, Integer.valueOf(0));
                    }
                    Count termCount = (Count)dictionaryArr[vInd].get(word);
                    if (termCount == null) {
                        dictionaryArr[vInd].put(word, new Count(1));
                    } else {
                        ++termCount.count;
                    }
                }
            }

            // Every distinct word of the document counts once towards its document count.
            Enumeration<String> docWords = h.keys();
            while (docWords.hasMoreElements()) {
                String word = docWords.nextElement();
                Count c = (Count)dictionaryArr[vInd].get(word);
                if (c != null) {
                    ++c.docCount;
                    continue;
                }
                System.err.println("Warning: A word should definitely be in the dictionary.Please check the code");
            }

            // Periodic pruning: every pruneRate instances drop words seen at most once.
            if (pruneRate <= 0L || (long)i % pruneRate != 0L || i <= 0) continue;
            for (int z = 0; z < values; ++z) {
                Vector<String> d = new Vector<String>(1000);
                for (Object key : dictionaryArr[z].keySet()) {
                    Count seen = (Count)dictionaryArr[z].get(key);
                    if (seen.count > 1) continue;
                    d.add((String)key);
                }
                // Removal happens after the iteration to avoid modifying the map while iterating.
                for (String rareWord : d) {
                    dictionaryArr[z].remove(rareWord);
                }
            }
        }

        // Determine, per dictionary, the minimum count a word needs to be kept.
        int totalsize = 0;
        int[] prune = new int[values];
        for (int z = 0; z < values; ++z) {
            totalsize += dictionaryArr[z].size();
            int[] array = new int[dictionaryArr[z].size()];
            int pos = 0;
            for (Object key : dictionaryArr[z].keySet()) {
                Count wordCount = (Count)dictionaryArr[z].get(key);
                array[pos] = wordCount.count;
                ++pos;
            }
            StringToWordVector.sortArray(array);
            // NOTE(review): a non-positive m_WordsToKeep indexes past the end of the array here.
            prune[z] = array.length < this.m_WordsToKeep ? this.m_minTermFreq : Math.max(this.m_minTermFreq, array[array.length - this.m_WordsToKeep]);
        }

        // Copy all attributes that are not converted, remembering where the class ends up.
        FastVector<Attribute> attributes = new FastVector<Attribute>(totalsize + this.getInputFormat().numAttributes());
        int classIndex = -1;
        for (int i = 0; i < this.getInputFormat().numAttributes(); ++i) {
            if (this.m_SelectedRange.isInRange(i)) continue;
            if (this.getInputFormat().classIndex() == i) {
                classIndex = attributes.size();
            }
            attributes.addElement((Attribute)this.getInputFormat().attribute(i).copy());
        }

        // Add one attribute per surviving word; a word kept by several dictionaries is added once.
        TreeMap<String, Integer> newDictionary = new TreeMap<String, Integer>();
        int index = attributes.size();
        for (int z = 0; z < values; ++z) {
            for (Object key : dictionaryArr[z].keySet()) {
                String keptWord = (String)key;
                Count wordCount = (Count)dictionaryArr[z].get(keptWord);
                if (wordCount.count < prune[z] || newDictionary.get(keptWord) != null) continue;
                newDictionary.put(keptWord, Integer.valueOf(index++));
                attributes.addElement(new Attribute(this.m_Prefix + keptWord));
            }
        }

        // Document count of a word = sum of its document counts over all dictionaries.
        this.m_DocsCounts = new int[attributes.size()];
        for (String word : newDictionary.keySet()) {
            int idx = newDictionary.get(word);
            int docsCount = 0;
            for (int j = 0; j < values; ++j) {
                Count c = (Count)dictionaryArr[j].get(word);
                if (c == null) continue;
                docsCount += c.docCount;
            }
            this.m_DocsCounts[idx] = docsCount;
        }

        attributes.trimToSize();
        this.m_Dictionary = newDictionary;
        this.m_NumInstances = this.getInputFormat().numInstances();
        Instances outputFormat = new Instances(this.getInputFormat().relationName(), attributes, 0);
        outputFormat.setClassIndex(classIndex);
        this.setOutputFormat(outputFormat);
    }

    /**
     * Converts one instance into a sparse word-vector instance, without document-length
     * normalization, and appends it to the given vector.
     *
     * @param instance the instance to convert
     * @param v the vector the converted instance is appended to
     * @return the index of the first word attribute in the output format, i.e. the number
     *         of attributes copied over unchanged
     */
    private int convertInstancewoDocNorm(Instance instance, FastVector v) {
        // Sparse representation under construction: output attribute index -> value.
        TreeMap<Integer, Double> sparse = new TreeMap<Integer, Double>();

        // Pass 1: copy every attribute that is not converted into words.
        int firstCopy = 0;
        for (int att = 0; att < getInputFormat().numAttributes(); att++) {
            if (m_SelectedRange.isInRange(att)) {
                continue;
            }
            boolean isString = getInputFormat().attribute(att).type() == 2;
            if (!isString) {
                if (instance.value(att) != 0.0) {
                    sparse.put(Integer.valueOf(firstCopy), Double.valueOf(instance.value(att)));
                }
            } else if (instance.isMissing(att)) {
                sparse.put(Integer.valueOf(firstCopy), Double.valueOf(Utils.missingValue()));
            } else {
                // Reserve string index 0 with a dummy so a real value never has index 0.
                if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
                    outputFormatPeek().attribute(firstCopy).addStringValue("Hack to defeat SparseInstance bug");
                }
                int stringIndex = outputFormatPeek().attribute(firstCopy).addStringValue(instance.stringValue(att));
                sparse.put(Integer.valueOf(firstCopy), Double.valueOf(stringIndex));
            }
            firstCopy++;
        }

        // Pass 2: tokenize the selected string attributes and record dictionary words.
        for (int att = 0; att < instance.numAttributes(); att++) {
            if (m_SelectedRange.isInRange(att) && !instance.isMissing(att)) {
                m_Tokenizer.tokenize(instance.stringValue(att));
                while (m_Tokenizer.hasMoreElements()) {
                    String token = (String) m_Tokenizer.nextElement();
                    if (m_lowerCaseTokens) {
                        token = token.toLowerCase();
                    }
                    token = m_Stemmer.stem(token);
                    Integer wordIndex = (Integer) m_Dictionary.get(token);
                    if (wordIndex == null) {
                        continue;
                    }
                    Double soFar = m_OutputCounts ? sparse.get(wordIndex) : null;
                    if (soFar != null) {
                        sparse.put(wordIndex, Double.valueOf(soFar.doubleValue() + 1.0));
                    } else {
                        sparse.put(wordIndex, Double.valueOf(1.0));
                    }
                }
            }
        }

        // TF transform: log(1 + frequency), applied to word attributes only.
        if (m_TFTransform) {
            for (Integer key : sparse.keySet()) {
                if (key.intValue() >= firstCopy) {
                    double freq = sparse.get(key).doubleValue();
                    sparse.put(key, Double.valueOf(Math.log(freq + 1.0)));
                }
            }
        }

        // IDF transform: frequency * log(#documents / #documents containing the word).
        if (m_IDFTransform) {
            for (Integer key : sparse.keySet()) {
                if (key.intValue() >= firstCopy) {
                    double freq = sparse.get(key).doubleValue();
                    double idf = Math.log((double) m_NumInstances / (double) m_DocsCounts[key.intValue()]);
                    sparse.put(key, Double.valueOf(freq * idf));
                }
            }
        }

        // Flatten the sorted map into the parallel arrays a SparseInstance expects.
        int stored = sparse.size();
        double[] values = new double[stored];
        int[] indices = new int[stored];
        int slot = 0;
        for (Integer key : sparse.keySet()) {
            indices[slot] = key.intValue();
            values[slot] = sparse.get(key).doubleValue();
            slot++;
        }

        SparseInstance result = new SparseInstance(instance.weight(), values, indices, outputFormatPeek().numAttributes());
        result.setDataset(outputFormatPeek());
        v.addElement(result);
        return firstCopy;
    }

    /**
     * Scales the word values of an instance so that its document vector has the average
     * document length of the first batch.
     *
     * <p>Only values at attribute indices {@code >= firstCopy} are touched. A document whose
     * vector has length zero is left unchanged: dividing by that length would turn its
     * stored zero values into NaN.
     *
     * @param inst the instance to normalize in place
     * @param firstCopy the index of the first word attribute
     * @throws Exception if the average document length has not been computed yet
     */
    private void normalizeInstance(Instance inst, int firstCopy) throws Exception {
        if (this.m_AvgDocLength < 0.0) {
            throw new Exception("Average document length not set.");
        }

        // Euclidean length of the word part of the document vector.
        double docLength = 0.0;
        for (int j = 0; j < inst.numValues(); ++j) {
            if (inst.index(j) < firstCopy) continue;
            docLength += inst.valueSparse(j) * inst.valueSparse(j);
        }
        docLength = Math.sqrt(docLength);

        if (docLength == 0.0) {
            // Nothing to scale, and 0 * avg / 0 would produce NaN for stored zero values.
            return;
        }

        for (int j = 0; j < inst.numValues(); ++j) {
            if (inst.index(j) < firstCopy) continue;
            double val = inst.valueSparse(j) * this.m_AvgDocLength / docLength;
            inst.setValueSparse(j, val);
            if (val != 0.0) continue;
            System.err.println("setting value " + inst.index(j) + " to zero.");
            // Revisit the same position after a value has been set to zero.
            // NOTE(review): presumably because the sparse instance drops the zero entry — confirm.
            --j;
        }
    }

    /**
     * Returns the revision string of this class.
     *
     * @return the revision extracted from the embedded revision keyword
     */
    @Override
    public String getRevision() {
        String revision = RevisionUtils.extract("$Revision: 7871 $");
        return revision;
    }

    /**
     * Runs the filter from the command line.
     *
     * @param argv the command-line options, passed on to {@code runFilter}
     */
    public static void main(String[] argv) {
        StringToWordVector filter = new StringToWordVector();
        StringToWordVector.runFilter(filter, argv);
    }

    private class Count
    implements Serializable,
    RevisionHandler {
        static final long serialVersionUID = 2157223818584474321L;
        public int count;
        public int docCount;

        public Count(int c) {
            this.count = c;
        }

        @Override
        public String getRevision() {
            return RevisionUtils.extract("$Revision: 7871 $");
        }
    }
}

