/*
 * Decompiled with CFR 0.152.
 */
package jsat.text;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicIntegerArray;
import jsat.DataSet;
import jsat.SimpleDataSet;
import jsat.classifiers.CategoricalData;
import jsat.classifiers.DataPoint;
import jsat.linear.SparseVector;
import jsat.linear.Vec;
import jsat.text.HashedTextVectorCreator;
import jsat.text.TextVectorCreator;
import jsat.text.tokenizer.Tokenizer;
import jsat.text.wordweighting.WordWeighting;
import jsat.utils.IntList;

/**
 * Base class for loading a collection of text documents into sparse vectors
 * using the hashing trick: every token is mapped to one of
 * {@code dimensionSize} buckets derived from its {@link String#hashCode()}.
 * <p>
 * Sub-classes implement {@link #initialLoad()} and feed each raw document to
 * {@link #addOriginalDocument(String)}. Once {@link #finishAdding()} has run,
 * the configured {@link WordWeighting} has been applied to every stored vector
 * and no further documents may be added.
 */
public abstract class HashedTextDataLoader
implements TextVectorCreator {
    private static final long serialVersionUID = 8513621180409278670L;
    /** Number of hash buckets used when no explicit dimension is supplied (2^22). */
    private static final int DEFAULT_DIMENSION_SIZE = 0x400000;
    private final int dimensionSize;
    private Tokenizer tokenizer;
    private WordWeighting weighting;
    /** One vector per added document, in insertion order. */
    protected List<SparseVector> vectors;
    /** Per-bucket totals accumulated while documents are added; released by {@link #finishAdding()}. */
    private AtomicIntegerArray termDocumentFrequencys;
    /** Set to {@code true} by {@link #finishAdding()}; after that no documents can be added. */
    protected boolean noMoreAdding;
    private volatile int documents;
    /** Per-thread scratch buffer handed to the tokenizer. */
    protected ThreadLocal<StringBuilder> workSpace;
    /** Per-thread list that receives the tokens of the current document. */
    protected ThreadLocal<List<String>> storageSpace;
    /** Per-thread token-to-count map for the current document. */
    protected ThreadLocal<Map<String, Integer>> wordCounts;
    private TextVectorCreator tvc;

    /**
     * Creates a loader with the default number of hash buckets.
     *
     * @param tokenizer the tokenizer used to split documents into tokens
     * @param weighting the weighting applied to the vectors once loading is finished
     */
    public HashedTextDataLoader(Tokenizer tokenizer, WordWeighting weighting) {
        this(DEFAULT_DIMENSION_SIZE, tokenizer, weighting);
    }

    /**
     * Creates a loader with an explicit number of hash buckets.
     *
     * @param dimensionSize the length of every produced vector (number of hash buckets)
     * @param tokenizer the tokenizer used to split documents into tokens
     * @param weighting the weighting applied to the vectors once loading is finished
     */
    public HashedTextDataLoader(int dimensionSize, Tokenizer tokenizer, WordWeighting weighting) {
        this.dimensionSize = dimensionSize;
        this.tokenizer = tokenizer;
        this.weighting = weighting;
        this.termDocumentFrequencys = new AtomicIntegerArray(dimensionSize);
        this.vectors = new ArrayList<SparseVector>();
        this.tvc = new HashedTextVectorCreator(dimensionSize, tokenizer, weighting);
        this.noMoreAdding = false;
        this.workSpace = new ThreadLocal<StringBuilder>();
        this.storageSpace = new ThreadLocal<List<String>>();
        this.wordCounts = new ThreadLocal<Map<String, Integer>>();
    }

    /**
     * Loads the initial documents. Invoked by {@link #getDataSet()} when
     * loading has not been finished yet; implementations are expected to pass
     * each document to {@link #addOriginalDocument(String)}.
     */
    protected abstract void initialLoad();

    /**
     * Tokenizes {@code text}, counts its tokens, and stores the resulting
     * hashed count vector. Scratch buffers are kept per thread, the shared
     * frequency table is updated atomically, and the append to
     * {@link #vectors} is synchronized on that list.
     *
     * @param text the raw document text
     * @return the value of the document counter before this document was
     *         added (0 for the first document)
     * @throws IllegalStateException if {@link #finishAdding()} has already run
     */
    protected int addOriginalDocument(String text) {
        if (this.noMoreAdding) {
            throw new IllegalStateException("Initial data set has been finalized");
        }
        StringBuilder localWorkSpace = this.workSpace.get();
        List<String> localStorageSpace = this.storageSpace.get();
        Map<String, Integer> localWordCounts = this.wordCounts.get();
        if (localWorkSpace == null) {
            // First call on this thread: create and register the scratch objects.
            localWorkSpace = new StringBuilder();
            localStorageSpace = new ArrayList<String>();
            localWordCounts = new LinkedHashMap<String, Integer>();
            this.workSpace.set(localWorkSpace);
            this.storageSpace.set(localStorageSpace);
            this.wordCounts.set(localWordCounts);
        }
        localWorkSpace.setLength(0);
        localStorageSpace.clear();
        // Defensive: a previous call on this thread that failed part-way must
        // not leak its counts into this document.
        localWordCounts.clear();

        SparseVector vec;
        try {
            this.tokenizer.tokenize(text, localWorkSpace, localStorageSpace);
            for (String word : localStorageSpace) {
                Integer count = localWordCounts.get(word);
                localWordCounts.put(word, count == null ? 1 : count + 1);
            }

            vec = new SparseVector(this.dimensionSize, localWordCounts.size());
            for (Map.Entry<String, Integer> entry : localWordCounts.entrySet()) {
                // Reduce modulo the dimension BEFORE taking the absolute value:
                // Math.abs(Integer.MIN_VALUE) is still negative, so the reverse
                // order could yield a negative index. For every other hash code
                // both orders produce the same bucket.
                int index = Math.abs(entry.getKey().hashCode() % this.dimensionSize);
                int occurrences = entry.getValue().intValue();
                // NOTE(review): distinct words that share a bucket overwrite each
                // other here rather than being summed - confirm this matches the
                // behaviour of the TextVectorCreator used for new text.
                vec.set(index, occurrences);
                // NOTE(review): this adds the in-document count, not 1 per
                // document, to the frequency table - confirm that is intended.
                this.termDocumentFrequencys.addAndGet(index, occurrences);
            }
        } finally {
            // Leave the per-thread map empty for the next document.
            localWordCounts.clear();
        }

        synchronized (this.vectors) {
            this.vectors.add(vec);
            return this.documents++;
        }
    }

    /**
     * Ends the loading phase: blocks further additions, drops the per-thread
     * scratch holders, hands the accumulated per-bucket frequencies to the
     * {@link WordWeighting}, and applies that weighting to every stored vector.
     */
    protected void finishAdding() {
        this.noMoreAdding = true;
        this.workSpace = null;
        this.storageSpace = null;
        this.wordCounts = null;

        // Snapshot the atomic counters into a plain array for the weighting.
        int[] frqs = new int[this.dimensionSize];
        for (int i = 0; i < this.termDocumentFrequencys.length(); ++i) {
            frqs[i] = this.termDocumentFrequencys.get(i);
        }
        this.weighting.setWeight(this.vectors, IntList.unmodifiableView(frqs, this.dimensionSize));
        for (SparseVector vec : this.vectors) {
            this.weighting.applyTo(vec);
        }
        this.termDocumentFrequencys = null;
    }

    /**
     * Returns the loaded documents as a data set, performing the initial load
     * and finishing step first if that has not happened yet.
     *
     * @return a data set holding one data point per stored vector, each with
     *         no categorical values
     */
    public DataSet getDataSet() {
        if (!this.noMoreAdding) {
            this.initialLoad();
            this.finishAdding();
        }
        ArrayList<DataPoint> dataPoints = new ArrayList<DataPoint>(this.vectors.size());
        for (SparseVector vec : this.vectors) {
            dataPoints.add(new DataPoint(vec, new int[0], new CategoricalData[0]));
        }
        return new SimpleDataSet(dataPoints);
    }

    /**
     * Converts new text into a vector by delegating to
     * {@link #getTextVectorCreator()}.
     *
     * @param input the text to convert
     * @return the vector produced by the underlying creator
     * @throws IllegalStateException if the initial documents have not been loaded
     */
    @Override
    public Vec newText(String input) {
        return this.getTextVectorCreator().newText(input);
    }

    /**
     * Converts new text into a vector using caller-supplied scratch objects,
     * delegating to {@link #getTextVectorCreator()}.
     *
     * @param input the text to convert
     * @param workSpace scratch buffer passed through to the underlying creator
     * @param storageSpace scratch token list passed through to the underlying creator
     * @return the vector produced by the underlying creator
     * @throws IllegalStateException if the initial documents have not been loaded
     */
    @Override
    public Vec newText(String input, StringBuilder workSpace, List<String> storageSpace) {
        return this.getTextVectorCreator().newText(input, workSpace, storageSpace);
    }

    /**
     * Returns the creator used to vectorize text that was not part of the
     * initial load.
     *
     * @return the text vector creator built in the constructor
     * @throws IllegalStateException if {@link #finishAdding()} has not run yet
     */
    public TextVectorCreator getTextVectorCreator() {
        if (!this.noMoreAdding) {
            throw new IllegalStateException("Initial documents have not yet loaded");
        }
        return this.tvc;
    }
}

