/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    SequenceLearner.java
 *    Copyright (C) 2010 Stefan Mutter
 *
 */
package weka.classifiers.sequence;

import java.io.Serializable;
import java.util.Enumeration;
import java.util.List;
import java.util.Vector;

import weka.classifiers.AbstractClassifier;
import weka.classifiers.sequence.core.Alphabet;
import weka.classifiers.sequence.core.IllegalSymbolException;
import weka.classifiers.sequence.core.ImpossibleStateProbabilityException;
import weka.classifiers.sequence.core.InvalidStructureException;
import weka.classifiers.sequence.core.InvalidViterbiPathException;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.Utils;

/**
<!-- globalinfo-start -->
* Abstract class that offers common functionality for sequence learners.
* <p/>
<!-- globalinfo-end -->
*
<!-- options-start -->
* Valid options are: <p/>
*
* <pre> -X &lt;num&gt;
*  specify the number of columns in a PHMM.
*  (when not specified, average length of sequences)</pre>
*
* <pre> -L &lt;num&gt;
*  use only the first L symbols of each sequence (or fewer if it is shorter).
*  (when not specified, sequences are not trimmed)</pre>
*
* <pre> -A classpath;
*  specifies the alphabet to be used, e.g. weka.classifiers.sequence.core.ExtendedProteinAlphabet</pre>
*
* <pre> -C &lt;double&gt;
*  threshold to stop training
*  (default: 0.0001)</pre>
*
* <pre> -B &lt;num&gt;
*  threshold option for Baum-Welch learning: (0) overall difference, (1) difference averaged over sequence number, (2) difference averaged over all residues in all sequences.
*  (default: 2)</pre>
*
* <pre> -F &lt;num&gt;
*  stop training after F iterations instead of difference in threshold.
*  (when not specified,training ends when difference is smaller than threshold)</pre>
*
* <pre> -N
*  use null model.</pre>
*
* <pre> -E
*  disables log space calculations for emissions (not recommended).</pre>
*
* <pre> -I
*  enables learning of emission probabilities in insert states.</pre>
*
* <pre> -S
*  more memory efficient calculation of the Baum-Welch algorithm.</pre>
*
* <pre> -J &lt;num&gt;
*  sets the kind of null model to be used. UNI uses a uniform null model, POS uses the background distribution of the symbols in the positive class, NEG does likewise for the negative class and BOTH uses both classes
*  (default: UNI)</pre>
*
<!-- options-end -->
*
* @author Stefan Mutter (pHMM4weka@gmail.com)
* @author Peter Sestoft, Royal Veterinary and Agricultural University, Copenhagen, Denmark (logplus method)
* @version $Revision: 6 $
*/
public abstract class SequenceLearner extends AbstractClassifier implements Serializable{

	/** for serialization */
	private static final long serialVersionUID = 6502780192411755341L;

	/** Value of {@link #stopAfterIteration} that stands for "no iteration cap". */
	private static final int NO_ITERATION_LIMIT = Integer.MAX_VALUE;

	/** Option -L: maximum number of symbols kept per sequence; -1 means "not restricted". */
	protected int restrictSequenceLength = -1;

	/** Option -X: number of match columns; -1 means "not restricted". */
	protected int restrictMatchColumns = -1;

	/** Option -A: the alphabet in use. */
	private Alphabet alphabet = new weka.classifiers.sequence.core.ExtendedProteinAlphabet();

	/**
	 * Option -C: log likelihood threshold.
	 * NOTE(review): the class documentation advertises a default of 0.0001, but this
	 * field starts at 0.0 and setOptions() leaves it untouched when -C is absent.
	 * Confirm the intended default before changing it.
	 */
	protected double logLikelihoodThreshold;

	/** Not used in this class; available to subclasses. */
	protected int sequenceIndex;

	/** Option -N: whether a null model is used. */
	protected boolean useNullModel;

	/** Option -E: when set, emission and transition probabilities are not kept in log space. */
	protected boolean transitionsEmissionsNotInLog;

	/** Option -I: whether emission probabilities in insert states are subject to learning. */
	protected boolean learnInsertEmissions;

	/**
	 * Option -B: log likelihood option for Baum-Welch (0, 1 or 2).
	 * NOTE(review): the option help names 2 as the default, the tip text names 0;
	 * the value actually in effect when -B is never given is 0. Confirm which is intended.
	 */
	private int baumWelchOption;

	/** Option -S: memory sensitive execution of Baum-Welch. */
	private boolean memorySensitive;

	/** Only exposed through its accessors here; interpreted by subclasses. */
	protected boolean viterbiProb;

	/** Only exposed through its accessors here; interpreted by subclasses. */
	protected boolean fwdProb;

	/** Only exposed through its accessors here; interpreted by subclasses. */
	protected boolean allProb;

	/** Only exposed through its accessors here; interpreted by subclasses. */
	protected boolean allProbOnly;

	/** Only exposed through its accessors here; interpreted by subclasses. */
	protected boolean noBasic;

	/** Only exposed through its accessors here; interpreted by subclasses. */
	protected boolean noPathLogScores;

	/**
	 * Option -F: iteration cap for training. Starts at the "no limit" value so that a
	 * freshly constructed learner agrees with what the setter and setOptions() store
	 * for "-1" / "not specified".
	 */
	private int stopAfterIteration = NO_ITERATION_LIMIT;

	/** Never assigned in this class; subclasses are expected to fill it. */
	protected List<Boolean> converged;

	/** Only exposed through its accessors here; interpreted by subclasses. */
	protected int sampleMethod = -1;

	/** background distribution: uniform */
	public static final int UniformBackgroundDist = -1;

	/** background distribution: taken from the positive instances */
	public static final int Pos4BackgroundDist = 0;

	/** background distribution: taken from the negative instances */
	public static final int Neg4BackgroundDist = 1;

	/** background distribution: taken from all instances */
	public static final int All4BackgroundDist = 2;

	/** the tags selectable through option -J */
	public static final Tag[] TAGS_BackDist = {
		new Tag(UniformBackgroundDist, "UNI", "use uniform distribution as background null distribution"),
		new Tag(Pos4BackgroundDist, "POS", "use distribution of postives inst. as background null distribution"),
		new Tag(Neg4BackgroundDist, "NEG", "use distribution of negatives inst. as background null distribution"),
		new Tag(All4BackgroundDist, "BOTH", "use distribution of all inst. as background null distribution")
	};

	/**
	 * Option -J: ID of the selected background distribution. Initialised to UNI, the
	 * default that the option help documents and that setOptions() applies when -J is absent.
	 */
	protected int backDist = UniformBackgroundDist;


	/**
	 * Gets the iteration cap.
	 *
	 * @return the maximum number of iterations; Integer.MAX_VALUE when no cap is set
	 */
	public int getStopAfterIteration() {
		return stopAfterIteration;
	}

	/**
	 * Sets the iteration cap.
	 *
	 * @param stopAfterIteration the maximum number of iterations; -1 removes the cap
	 */
	public void setStopAfterIteration(int stopAfterIteration) {
		if(stopAfterIteration == -1){
			this.stopAfterIteration = NO_ITERATION_LIMIT;
		}
		else{
			this.stopAfterIteration = stopAfterIteration;
		}
	}

	/**
	 * @return the current value of the noBasic flag
	 */
	public boolean isNoBasic() {
		return noBasic;
	}

	/**
	 * @param noBasic the new value of the noBasic flag
	 */
	public void setNoBasic(boolean noBasic) {
		this.noBasic = noBasic;
	}

	/**
	 * Turns a set of sequence instances into a propositional representation.
	 * The concrete representation is defined by the implementing subclass.
	 *
	 * @param instances the instances to transform
	 * @return the transformed instances
	 */
	public abstract Instances propositionalise(Instances instances) throws IllegalSymbolException, InvalidStructureException, InvalidViterbiPathException, ImpossibleStateProbabilityException;

	/**
	 * Turns a single test instance into a propositional representation.
	 * The concrete representation is defined by the implementing subclass.
	 *
	 * @param instance the instance to transform
	 * @return the transformed instance
	 */
	public abstract Instance propositionaliseTestInstance(Instance instance) throws IllegalSymbolException, InvalidStructureException, InvalidViterbiPathException,ImpossibleStateProbabilityException;

	/**
	 * Returns default capabilities of the classifier: string attributes only,
	 * a nominal class without missing class values, and at least one instance.
	 *
	 * @return      the capabilities of this classifier
	 */
	public Capabilities getCapabilities() {
		Capabilities result = super.getCapabilities();

		// attributes
		result.disableAllAttributes();
		result.disableAllAttributeDependencies();
		result.enable(Capability.STRING_ATTRIBUTES);

		// class
		result.disableAllClasses();
		result.disableAllClassDependencies();
		result.disable(Capability.MISSING_CLASS_VALUES);

		result.enable(Capability.NOMINAL_CLASS);
		result.enableDependency(Capability.NOMINAL_CLASS);

		// instances
		result.setMinimumNumberInstances(1);

		return result;
	}


	/**
	 * Returns an enumeration describing the available options: the options of the
	 * superclass followed by the ones defined in this class.
	 *
	 * @return an enumeration of all the available options.
	 */
	public Enumeration listOptions() {
		// Element type is Object because the inherited enumeration is copied as-is.
		Vector<Object> result = new Vector<Object>();

		Enumeration<?> inherited = super.listOptions();
		while (inherited.hasMoreElements()) {
			result.addElement(inherited.nextElement());
		}

		result.addElement(new Option(
				"\tCuts all sequences after specified length.",
				"L", 1, "-L " ));

		result.addElement(new Option(
				"\tRestricts the number of match states to the specified values.",
				"X", 1, "-X " ));

		result.addElement(new Option(
				"\tFull class name of Alphabet to use",
				"A", 1, "-A <Alphabet>"));

		result.addElement(new Option(
				"\tLog likelihood threshold",
				"C", 1, "-C " ));

		result.addElement(new Option(
				"\tLog likelihood option for Baum-Welch: (0) overall loglikelihood, (1) average over sequence number, (2) average of all residues in all sequences (default)",
				"B", 1, "-B " ));

		result.addElement(new Option(
				"\tStop after the number of specified iterations instead of training to full convergence determined by log likelihood threshold",
				"F", 1, "-F " ));

		result.addElement(new Option(
				"\tUse NullModel",
				"N", 0, "-N " ));

		result.addElement(new Option(
				"\tDon't use logs for emission and transition probabilities",
				"E", 0, "-E " ));

		result.addElement(new Option(
				"\tFlag that indicate whether or not emission probabilities in insert states are subject to learning",
				"I", 0, "-I " ));

		result.addElement(new Option(
				"\tMore memory sensitive, but more time-consuming execution of BaumWelch",
				"S", 0, "-S " ));

		result.addElement(new Option(
				"\tThe background dist to use as null model\n"
				+ "\t(default: UNI)",
				"J", 1, "-J " + Tag.toOptionList(TAGS_BackDist)));

		return result.elements();
	}

	/**
	 * Gets the current settings. Options whose value equals "not specified"
	 * (-L, -X and -F) are left out, so that feeding the result back into
	 * setOptions() restores the same state.
	 *
	 * @return an array of strings suitable for passing to setOptions()
	 */
	public String [] getOptions() {
		Vector<String> result = new Vector<String>();

		String[] inherited = super.getOptions();
		for (int i = 0; i < inherited.length; i++) {
			result.add(inherited[i]);
		}

		if(getRestrictSequenceLength() != -1){
			result.add("-L");
			result.add(String.valueOf(getRestrictSequenceLength()));
		}

		if(getRestrictMatchColumns() != -1){
			result.add("-X");
			result.add(String.valueOf(getRestrictMatchColumns()));
		}

		result.add("-A");
		result.add(getAlphabetSpec());

		result.add("-C");
		result.add(String.valueOf(getLogLikelihoodThreshold()));

		result.add("-B");
		result.add(String.valueOf(getBaumWelchOption()));

		// "no cap" is what setOptions() assumes when -F is missing, so only a real cap is written
		if(getStopAfterIteration() != NO_ITERATION_LIMIT){
			result.add("-F");
			result.add(String.valueOf(getStopAfterIteration()));
		}

		if(useNullModel){
			result.add("-N");
		}

		if(transitionsEmissionsNotInLog){
			result.add("-E");
		}

		if(learnInsertEmissions){
			result.add("-I");
		}

		if(memorySensitive){
			result.add("-S");
		}

		// write the tag's ID string (UNI, POS, ...) explicitly instead of relying on toString()
		result.add("-J");
		result.add(getBackDist().getSelectedTag().getIDStr());

		return result.toArray(new String[result.size()]);
	}


	/**
	 * Builds the value of option -A: the class name of the current alphabet,
	 * followed by the alphabet's own options if it is an OptionHandler.
	 *
	 * @return the alphabet specification string
	 */
	private String getAlphabetSpec() {
		Alphabet alpha = getAlphabet();
		String className = alpha.getClass().getName();
		if (alpha instanceof OptionHandler) {
			return className + " "
			+ Utils.joinOptions(((OptionHandler)alpha).getOptions());
		}
		return className;
	}

	/**
	 * Parses a given list of options. <p/>
	 *
	 * Option -A is mandatory. -L, -X, -F and -J fall back to their "not specified"
	 * value when absent, -C and -B keep their current value when absent, and a -B
	 * value outside 0..2 is replaced by 2. The remaining options are handed to the
	 * superclass.
	 *
	 * @param options the list of options as an array of strings
	 * @throws Exception if an option is not supported, a number cannot be parsed
	 * or no valid alphabet is specified
	 */
	public void setOptions(String[] options) throws Exception {
		String value = Utils.getOption('L', options);
		restrictSequenceLength = (value.length() != 0) ? Integer.parseInt(value) : -1;

		value = Utils.getOption('X', options);
		restrictMatchColumns = (value.length() != 0) ? Integer.parseInt(value) : -1;

		String alphabetString = Utils.getOption('A', options);
		if (alphabetString.length() == 0) {
			throw new IllegalArgumentException(
					"No alphabet specified: option -A <full class name of Alphabet> is required");
		}
		String [] alphabetSpec = Utils.splitOptions(alphabetString);
		if (alphabetSpec.length == 0) {
			throw new IllegalArgumentException("Invalid Alphabet specification string");
		}
		// first token is the class name, the rest are the alphabet's own options
		String alphabetName = alphabetSpec[0];
		alphabetSpec[0] = "";
		setAlphabet((Alphabet)Utils.forName(Alphabet.class, alphabetName, alphabetSpec));

		value = Utils.getOption('C', options);
		if (value.length() != 0) {
			logLikelihoodThreshold = Double.parseDouble(value);
		}

		value = Utils.getOption('B', options);
		if (value.length() != 0) {
			baumWelchOption = Integer.parseInt(value);
		}
		if(baumWelchOption < 0 || baumWelchOption > 2){
			baumWelchOption = 2;
		}

		value = Utils.getOption('F', options);
		if (value.length() != 0) {
			int parsed = Integer.parseInt(value);
			stopAfterIteration = (parsed == -1) ? NO_ITERATION_LIMIT : parsed;
		}
		else{
			stopAfterIteration = NO_ITERATION_LIMIT;
		}

		useNullModel = Utils.getFlag('N', options);

		transitionsEmissionsNotInLog = Utils.getFlag('E', options);

		learnInsertEmissions = Utils.getFlag('I', options);

		memorySensitive = Utils.getFlag('S', options);

		value = Utils.getOption('J', options);
		if (value.length() != 0) {
			setBackDist(new SelectedTag(value, TAGS_BackDist));
		}
		else {
			setBackDist(new SelectedTag(UniformBackgroundDist, TAGS_BackDist));
		}

		super.setOptions(options);
	}


	/**
	 * Gets the background distribution used as null model.
	 *
	 * @return the current selection, wrapped in a SelectedTag over TAGS_BackDist
	 */
	public SelectedTag getBackDist() {
		return new SelectedTag(backDist, TAGS_BackDist);
	}

	/**
	 * Sets the background distribution used as null model. A selection that is
	 * not based on TAGS_BackDist is ignored.
	 *
	 * @param newBackDist the new selection
	 */
	public void setBackDist(SelectedTag newBackDist) {
		if (newBackDist.getTags() == TAGS_BackDist) {
			backDist = newBackDist.getSelectedTag().getID();
		}
	}

	/**
	 * @return the maximum sequence length, -1 if not restricted
	 */
	public int getRestrictSequenceLength(){
		return restrictSequenceLength;
	}

	/**
	 * @param value the maximum sequence length, -1 for no restriction
	 */
	public void setRestrictSequenceLength(int value){
		restrictSequenceLength = value;
	}

	/**
	 * @return the alphabet in use
	 */
	public Alphabet getAlphabet() {
		return alphabet;
	}

	/**
	 * @param alphabet the alphabet to use
	 */
	public void setAlphabet(Alphabet alphabet) {
		this.alphabet = alphabet;
	}

	/**
	 * @return tip text for the alphabet property
	 */
	public String alphabetTipText() {
		return "The alphabet to use.";
	}

	/**
	 * @return the log likelihood threshold
	 */
	public double getLogLikelihoodThreshold() {
		return logLikelihoodThreshold;
	}

	/**
	 * @param logLikelihoodThreshold the new log likelihood threshold
	 */
	public void setLogLikelihoodThreshold(double logLikelihoodThreshold) {
		this.logLikelihoodThreshold = logLikelihoodThreshold;
	}

	/**
	 * @return tip text for the logLikelihoodThreshold property
	 */
	public String logLikelihoodThresholdTipText() {
		return "The threshold for convergence for the log likelihood using the Baum-Welch algorithm.";
	}

	/**
	 * @return whether a null model is used
	 */
	public boolean isUseNullModel() {
		return useNullModel;
	}

	/**
	 * @param useNullModel whether a null model is to be used
	 */
	public void setUseNullModel(boolean useNullModel) {
		this.useNullModel = useNullModel;
	}

	/**
	 * @return tip text for the useNullModel property
	 */
	public String useNullModelTipText() {
		return "All scores are log-odds. A NullModel is used";
	}

	/**
	 * @return true if emission and transition probabilities are not kept in log space
	 */
	public boolean getTransitionsEmissionsNotInLog() {
		return transitionsEmissionsNotInLog;
	}

	/**
	 * @param transitionsEmissionsNotInLog true to switch off log space for emission
	 * and transition probabilities
	 */
	public void setTransitionsEmissionsNotInLog(boolean transitionsEmissionsNotInLog) {
		this.transitionsEmissionsNotInLog = transitionsEmissionsNotInLog;
	}

	/**
	 * @return whether emission probabilities in insert states are learned
	 */
	public boolean isLearnInsertEmissions() {
		return learnInsertEmissions;
	}

	/**
	 * @param learnInsertEmissions whether emission probabilities in insert states are learned
	 */
	public void setLearnInsertEmissions(boolean learnInsertEmissions) {
		this.learnInsertEmissions = learnInsertEmissions;
	}

	/**
	 * @return tip text for the learnInsertEmissions property
	 */
	public String learnInsertEmissionsTipText() {
		return "Flag that indicate whether or not emission probabilities in insert states are subject to learning";
	}


	/**
	 * Converts the given alignment strings into one new instance and appends it
	 * to <code>transformed</code>.
	 * <p/>
	 * The strings are scanned left to right and attribute values are written in
	 * scan order: a '-' yields the value "-" followed by the number 0; an upper-case
	 * character yields itself, followed by the length of the run of lower-case
	 * characters directly after it (0 if there is none). No count is written after
	 * a non-'-' character that is the last character of its string.
	 * Presumably upper-case characters are match columns and lower-case characters
	 * insert emissions - confirm against the code that produces the alignments.
	 * The layout of <code>transformed</code> has to match this scan order; that is
	 * the caller's responsibility and not checked here.
	 * <p/>
	 * Nothing in this method body throws the declared exceptions directly; the
	 * throws clause is kept so that existing callers and overrides still compile.
	 *
	 * @param allAlignment the alignment strings to convert
	 * @param transformed the dataset the new instance is added to
	 * @param classValue the class value of the new instance
	 * @return <code>transformed</code>, with the new instance appended
	 */
	protected Instances doPropositionalisation(String[] allAlignment, Instances transformed, double classValue) throws IllegalSymbolException, InvalidStructureException, InvalidViterbiPathException {

		Instance newInst = new DenseInstance(transformed.numAttributes());
		newInst.setDataset(transformed);
		newInst.setClassValue(classValue);

		int attributeCounter = 0;

		for(int k = 0; k < allAlignment.length; k++){
			String alignment = allAlignment[k];
			for(int i = 0; i < alignment.length(); i++){
				if(alignment.charAt(i)== '-'){
					// gap: symbol "-" plus a count of 0
					newInst.setValue(attributeCounter, "-");
					attributeCounter++;
					newInst.setValue(attributeCounter,0);
					attributeCounter++;
				}
				else{
					if(Character.isUpperCase(alignment.charAt(i))){
						newInst.setValue(attributeCounter, alignment.charAt(i)+"");
						attributeCounter++;
					}
					if(i < alignment.length()-1 && Character.isLowerCase(alignment.charAt(i+1))){
						// count the run of lower-case characters that starts at i+1
						i++;
						int count = 0;
						while(i < alignment.length() && Character.isLowerCase(alignment.charAt(i))){
							i++;
							count++;
						}
						// step back onto the last lower-case character; the for loop advances past it
						i--;
						newInst.setValue(attributeCounter,count);
						attributeCounter++;
					}
					else{
						if(i < alignment.length()-1){
							newInst.setValue(attributeCounter,0);
							attributeCounter++;
						}
					}
				}
			}
		}

		transformed.add(newInst);
		return transformed;
	}

	/**
	 * @return the log likelihood option for Baum-Welch
	 */
	public int getBaumWelchOption() {
		return baumWelchOption;
	}

	/**
	 * Sets the log likelihood option for Baum-Welch. Unlike setOptions(), this
	 * setter stores the value without range checking.
	 *
	 * @param baumWelchOption the new option
	 */
	public void setBaumWelchOption(int baumWelchOption) {
		this.baumWelchOption = baumWelchOption;
	}

	/**
	 * @return tip text for the baumWelchOption property
	 */
	public String baumWelchOptionTipText() {
		return "Log likelihood option for Baum-Welch: (0) overall loglikelihood (default), (1) average over sequence number, (2) average of all residues in all sequences";
	}

	/**
	 * @return whether the memory sensitive Baum-Welch execution is selected
	 */
	public boolean isMemorySensitive() {
		return memorySensitive;
	}

	/**
	 * @param memorySensitive whether to select the memory sensitive Baum-Welch execution
	 */
	public void setMemorySensitive(boolean memorySensitive) {
		this.memorySensitive = memorySensitive;
	}

	/**
	 * @return tip text for the memorySensitive property
	 */
	public String memorySensitiveTipText(){
		return"More memory sensitive, but more time-consuming execution of BaumWelch";
	}

	/**
	 * @return the current value of the allProb flag
	 */
	public boolean isAllProb() {
		return allProb;
	}

	/**
	 * @param allProb the new value of the allProb flag
	 */
	public void setAllProb(boolean allProb) {
		this.allProb = allProb;
	}

	/**
	 * @return the current value of the fwdProb flag
	 */
	public boolean isFwdProb() {
		return fwdProb;
	}

	/**
	 * @param fwdProb the new value of the fwdProb flag
	 */
	public void setFwdProb(boolean fwdProb) {
		this.fwdProb = fwdProb;
	}

	/**
	 * @return the current value of the viterbiProb flag
	 */
	public boolean isViterbiProb() {
		return viterbiProb;
	}

	/**
	 * @param viterbiProb the new value of the viterbiProb flag
	 */
	public void setViterbiProb(boolean viterbiProb) {
		this.viterbiProb = viterbiProb;
	}

	/**
	 * @return the current value of the allProbOnly flag
	 */
	public boolean isAllProbOnly() {
		return allProbOnly;
	}

	/**
	 * @param allProbOnly the new value of the allProbOnly flag
	 */
	public void setAllProbOnly(boolean allProbOnly) {
		this.allProbOnly = allProbOnly;
	}

	/**
	 * @return the converged list; null unless a subclass has assigned it
	 */
	public List<Boolean> getConverged() {
		return converged;
	}

	/**
	 * compute log(p+q) from plog = log p and qlog = log q, using that
	 * log (p + q) = log (p(1 + q/p)) = log p + log(1 + q/p)
	 *  = log p + log(1 + exp(log q - log p)) = plog + log(1 + exp(logq - logp))
	 *  and that log(1 + exp(d)) < 1E-17 for d < -37.
	 *  This method is taken from http://www.itu.dk/~sestoft/bsa/Match3.java
	 *  For Licence agreement see licenceMatch3.txt
	 * @param plog plog = log p
	 * @param qlog qlog = log q
	 * @return log(p+q)
	 */
	protected static double logplus(double plog, double qlog) {
		double max, diff;
		if (plog > qlog) {
			if (qlog == Double.NEGATIVE_INFINITY)
				return plog;
			else {
				max = plog; diff = qlog - plog;
			}
		} else {
			if (plog == Double.NEGATIVE_INFINITY)
				return qlog;
			else {
				max = qlog; diff = plog - qlog;
			}
		}
		// Now diff <= 0 so Math.exp(diff) will not overflow
		return max + (diff < -37 ? 0 : Math.log(1 + Math.exp(diff)));
	}

	/**
	 * Adds two values given in log space (i.e. multiplies the underlying values).
	 * If either argument is negative infinity, the result is negative infinity.
	 *
	 * @param alog the first summand
	 * @param blog the second summand
	 * @return alog + blog, or negative infinity if either argument is negative infinity
	 */
	protected static double logsum(double alog, double blog) {
		if(alog == Double.NEGATIVE_INFINITY || blog == Double.NEGATIVE_INFINITY){
			return Double.NEGATIVE_INFINITY;
		}
		else{
			return alog+blog;
		}
	}

	/**
	 * @return the current value of the noPathLogScores flag
	 */
	public boolean isNoPathLogScores() {
		return noPathLogScores;
	}

	/**
	 * @param noPathLogScores the new value of the noPathLogScores flag
	 */
	public void setNoPathLogScores(boolean noPathLogScores) {
		this.noPathLogScores = noPathLogScores;
	}

	/**
	 * @return the number of match columns, -1 if not restricted
	 */
	public int getRestrictMatchColumns() {
		return restrictMatchColumns;
	}

	/**
	 * @param numberOfMatchColumns the number of match columns, -1 for no restriction
	 */
	public void setRestrictMatchColumns(int numberOfMatchColumns) {
		restrictMatchColumns = numberOfMatchColumns;
	}

	/**
	 * @return the current sample method value
	 */
	public int getSampleMethod() {
		return sampleMethod;
	}

	/**
	 * @param sampleMethod the new sample method value
	 */
	public void setSampleMethod(int sampleMethod) {
		this.sampleMethod = sampleMethod;
	}

}

