/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * WekaInstancesMerge.java
 * Copyright (C) 2009-2011 University of Waikato, Hamilton, New Zealand
 */

package adams.flow.transformer;

import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Vector;
import java.util.regex.Pattern;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.unsupervised.attribute.Remove;
import adams.core.io.PlaceholderFile;
import adams.flow.core.Token;
import adams.flow.provenance.ActorType;
import adams.flow.provenance.Provenance;
import adams.flow.provenance.ProvenanceContainer;
import adams.flow.provenance.ProvenanceInformation;
import adams.flow.provenance.ProvenanceSupporter;

/**
 <!-- globalinfo-start -->
 * Merges multiple datasets.<br/>
 * If no 'ID' attribute is named, then all datasets must contain the same number of rows.<br/>
 * Attributes can be excluded from ending up in the final dataset via a regular expression. They can also be prefixed with name and/or index.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- flow-summary-start -->
 * Input/output:<br/>
 * - accepts:<br/>
 * &nbsp;&nbsp;&nbsp;java.lang.String<br/>
 * &nbsp;&nbsp;&nbsp;java.lang.String[]<br/>
 * &nbsp;&nbsp;&nbsp;java.io.File<br/>
 * &nbsp;&nbsp;&nbsp;java.io.File[]<br/>
 * - generates:<br/>
 * &nbsp;&nbsp;&nbsp;weka.core.Instances<br/>
 * <p/>
 <!-- flow-summary-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre>-D (property: debug)
 * &nbsp;&nbsp;&nbsp;If set to true, scheme may output additional info to the console.
 * </pre>
 *
 * <pre>-name &lt;java.lang.String&gt; (property: name)
 * &nbsp;&nbsp;&nbsp;The name of the actor.
 * &nbsp;&nbsp;&nbsp;default: InstancesMerge
 * </pre>
 *
 * <pre>-annotation &lt;adams.core.base.BaseText&gt; (property: annotations)
 * &nbsp;&nbsp;&nbsp;The annotations to attach to this actor.
 * &nbsp;&nbsp;&nbsp;default:
 * </pre>
 *
 * <pre>-skip (property: skip)
 * &nbsp;&nbsp;&nbsp;If set to true, transformation is skipped and the input token is just forwarded
 * &nbsp;&nbsp;&nbsp;as it is.
 * </pre>
 *
 * <pre>-use-prefix (property: usePrefix)
 * &nbsp;&nbsp;&nbsp;Whether to prefix the attribute names of each dataset with an index and
 * &nbsp;&nbsp;&nbsp;an optional string.
 * </pre>
 *
 * <pre>-add-index (property: addIndex)
 * &nbsp;&nbsp;&nbsp;Whether to add the index of the dataset to the prefix.
 * </pre>
 *
 * <pre>-prefix &lt;java.lang.String&gt; (property: prefix)
 * &nbsp;&nbsp;&nbsp;The optional prefix string to prefix the index number with (in case prefixes
 * &nbsp;&nbsp;&nbsp;are used); '&#64;' is a placeholder for the relation name.
 * &nbsp;&nbsp;&nbsp;default: dataset
 * </pre>
 *
 * <pre>-prefix-separator &lt;java.lang.String&gt; (property: prefixSeparator)
 * &nbsp;&nbsp;&nbsp;The separator string between the generated prefix and the original attribute
 * &nbsp;&nbsp;&nbsp;name.
 * &nbsp;&nbsp;&nbsp;default: -
 * </pre>
 *
 * <pre>-exclude-atts &lt;java.lang.String&gt; (property: excludedAttributes)
 * &nbsp;&nbsp;&nbsp;The regular expression used on the attribute names, to determine whether
 * &nbsp;&nbsp;&nbsp;an attribute should be excluded or not (matching sense can be inverted);
 * &nbsp;&nbsp;&nbsp;leave empty to include all attributes.
 * &nbsp;&nbsp;&nbsp;default:
 * </pre>
 *
 * <pre>-invert (property: invertMatchingSense)
 * &nbsp;&nbsp;&nbsp;Whether to invert the matching sense of excluding attributes, ie, the regular
 * &nbsp;&nbsp;&nbsp;expression is used for including attributes.
 * </pre>
 *
 * <pre>-unique-id &lt;java.lang.String&gt; (property: uniqueID)
 * &nbsp;&nbsp;&nbsp;The name of the attribute (string/numeric) used for uniquely identifying
 * &nbsp;&nbsp;&nbsp;rows among the datasets.
 * &nbsp;&nbsp;&nbsp;default:
 * </pre>
 *
 <!-- options-end -->
 *
 * @author  fracpete (fracpete at waikato dot ac dot nz)
 * @version $Revision: 4584 $
 */
public class WekaInstancesMerge
  extends AbstractTransformer
  implements ProvenanceSupporter {

  /** for serialization. */
  private static final long serialVersionUID = -2923715594018710295L;

  /** whether to prefix the attribute names of each dataset with an index. */
  protected boolean m_UsePrefix;

  /** whether to add the index to the prefix. */
  protected boolean m_AddIndex;

  /** the additional prefix name to use, apart from the index. */
  protected String m_Prefix;

  /** the separator between index and actual attribute name. */
  protected String m_PrefixSeparator;

  /** regular expression for excluding attributes from the datasets. */
  protected String m_ExcludedAttributes;

  /** whether to invert the matching sense for excluding attributes. */
  protected boolean m_InvertMatchingSense;

  /** the string or numeric attribute to use as unique identifier for rows. */
  protected String m_UniqueID;

  /** the attribute type of the ID attribute (-1 while still undetermined). */
  protected int m_AttType;

  /**
   * Returns a string describing the object.
   *
   * @return 			a description suitable for displaying in the gui
   */
  public String globalInfo() {
    return
        "Merges multiple datasets.\n"
      + "If no 'ID' attribute is named, then all datasets must contain the same number of rows.\n"
      + "Attributes can be excluded from ending up in the final dataset via "
      + "a regular expression. They can also be prefixed with name and/or index.";
  }

  /**
   * Adds options to the internal list of options.
   */
  public void defineOptions() {
    super.defineOptions();

    m_OptionManager.add(
        "use-prefix", "usePrefix",
        false);

    m_OptionManager.add(
        "add-index", "addIndex",
        false);

    m_OptionManager.add(
        "prefix", "prefix",
        "dataset");

    m_OptionManager.add(
        "prefix-separator", "prefixSeparator",
        "-");

    m_OptionManager.add(
        "exclude-atts", "excludedAttributes",
        "");

    m_OptionManager.add(
        "invert", "invertMatchingSense",
        false);

    m_OptionManager.add(
        "unique-id", "uniqueID",
        "");
  }

  /**
   * Sets whether to use prefixes.
   *
   * @param value	if true then the attributes will get prefixed
   */
  public void setUsePrefix(boolean value) {
    m_UsePrefix = value;
    reset();
  }

  /**
   * Returns whether to use prefixes.
   *
   * @return		true if the attributes will get prefixed
   */
  public boolean getUsePrefix() {
    return m_UsePrefix;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the GUI or for listing the options.
   */
  public String usePrefixTipText() {
    return
        "Whether to prefix the attribute names of each dataset with an index "
      + "and an optional string.";
  }

  /**
   * Sets whether to add the dataset index number to the prefix.
   *
   * @param value	if true then the index will be used in the prefix
   */
  public void setAddIndex(boolean value) {
    m_AddIndex = value;
    reset();
  }

  /**
   * Returns whether to add the dataset index number to the prefix.
   *
   * @return		true if the index will be used in the prefix
   */
  public boolean getAddIndex() {
    return m_AddIndex;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the GUI or for listing the options.
   */
  public String addIndexTipText() {
    return "Whether to add the index of the dataset to the prefix.";
  }

  /**
   * Sets the optional prefix string.
   *
   * @param value	the optional prefix string
   */
  public void setPrefix(String value) {
    m_Prefix = value;
    reset();
  }

  /**
   * Returns the optional prefix string.
   *
   * @return		the optional prefix string
   */
  public String getPrefix() {
    return m_Prefix;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the GUI or for listing the options.
   */
  public String prefixTipText() {
    return
        "The optional prefix string to prefix the index number with (in "
      + "case prefixes are used); '@' is a placeholder for the relation name.";
  }

  /**
   * Sets the prefix separator string.
   *
   * @param value	the prefix separator string
   */
  public void setPrefixSeparator(String value) {
    m_PrefixSeparator = value;
    reset();
  }

  /**
   * Returns the prefix separator string.
   *
   * @return		the prefix separator string
   */
  public String getPrefixSeparator() {
    return m_PrefixSeparator;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the GUI or for listing the options.
   */
  public String prefixSeparatorTipText() {
    return
        "The separator string between the generated prefix and the original "
      + "attribute name.";
  }

  /**
   * Sets the regular expression for excluding attributes.
   *
   * @param value	the regular expression
   */
  public void setExcludedAttributes(String value) {
    m_ExcludedAttributes = value;
    reset();
  }

  /**
   * Returns the regular expression for excluding attributes.
   *
   * @return		the regular expression
   */
  public String getExcludedAttributes() {
    return m_ExcludedAttributes;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the GUI or for listing the options.
   */
  public String excludedAttributesTipText() {
    return
        "The regular expression used on the attribute names, to determine whether "
      + "an attribute should be excluded or not (matching sense can be inverted); "
      + "leave empty to include all attributes.";
  }

  /**
   * Sets whether to invert the matching sense.
   *
   * @param value	if true then matching sense gets inverted
   */
  public void setInvertMatchingSense(boolean value) {
    m_InvertMatchingSense = value;
    reset();
  }

  /**
   * Returns whether to invert the matching sense.
   *
   * @return		true if the matching sense gets inverted
   */
  public boolean getInvertMatchingSense() {
    return m_InvertMatchingSense;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the GUI or for listing the options.
   */
  public String invertMatchingSenseTipText() {
    return
        "Whether to invert the matching sense of excluding attributes, ie, "
      + "the regular expression is used for including attributes.";
  }

  /**
   * Sets the attribute (string/numeric) to use for uniquely identifying rows.
   *
   * @param value	the attribute name
   */
  public void setUniqueID(String value) {
    m_UniqueID = value;
    reset();
  }

  /**
   * Returns the attribute (string/numeric) to use for uniquely identifying rows.
   *
   * @return		the attribute name
   */
  public String getUniqueID() {
    return m_UniqueID;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the GUI or for listing the options.
   */
  public String uniqueIDTipText() {
    return
        "The name of the attribute (string/numeric) used for uniquely "
      + "identifying rows among the datasets.";
  }

  /**
   * Returns the class that the consumer accepts.
   *
   * @return		<!-- flow-accepts-start -->java.lang.String.class, java.lang.String[].class, java.io.File.class, java.io.File[].class<!-- flow-accepts-end -->
   */
  public Class[] accepts() {
    return new Class[]{String.class, String[].class, File.class, File[].class};
  }

  /**
   * Returns the class of objects that it generates.
   *
   * @return		<!-- flow-generates-start -->weka.core.Instances.class<!-- flow-generates-end -->
   */
  public Class[] generates() {
    return new Class[]{Instances.class};
  }

  /**
   * Excludes attributes from the data. All attributes whose name fully
   * matches the regular expression are handed to a {@link Remove} filter;
   * the "invert matching sense" flag is forwarded as the filter's
   * invert-selection setting. If filtering fails, the error is printed and
   * the unfiltered data is returned.
   *
   * @param inst	the data to process
   * @param index	the index of the dataset (currently not used)
   * @return		the processed data
   */
  protected Instances excludeAttributes(Instances inst, int index) {
    Instances		result;
    StringBuilder	atts;
    int			i;
    Remove		filter;
    Pattern		pattern;

    // compile once, rather than re-parsing the expression per attribute
    pattern = Pattern.compile(m_ExcludedAttributes);

    // determine attribute indices (1-based, comma-separated)
    atts = new StringBuilder();
    for (i = 0; i < inst.numAttributes(); i++) {
      if (pattern.matcher(inst.attribute(i).name()).matches()) {
        if (atts.length() > 0)
          atts.append(",");
        atts.append((i+1));
      }
    }

    // filter data
    try {
      filter = new Remove();
      filter.setAttributeIndices(atts.toString());
      filter.setInvertSelection(m_InvertMatchingSense);
      filter.setInputFormat(inst);
      result = weka.filters.Filter.useFilter(inst, filter);
    }
    catch (Exception e) {
      // best effort: fall back to the unfiltered data
      result = inst;
      getSystemErr().println("Error filtering data:");
      getSystemErr().printStackTrace(e);
    }

    return result;
  }

  /**
   * Prefixes the attributes. The prefix is either the relation name (if the
   * prefix option is exactly '@') or the prefix option itself, optionally
   * followed by separator and 1-based dataset index, and always terminated
   * by the separator.
   *
   * @param inst	the data to process
   * @param index	the 0-based index of the dataset
   * @return		the processed data (a copy with renamed attributes)
   */
  protected Instances prefixAttributes(Instances inst, int index) {
    Instances			result;
    String			prefix;
    ArrayList<Attribute>	atts;
    int				i;

    // generate prefix
    if (m_Prefix.equals("@"))
      prefix = inst.relationName();
    else
      prefix = m_Prefix;
    if (m_AddIndex)
      prefix += m_PrefixSeparator + (index + 1);
    prefix += m_PrefixSeparator;

    // header
    atts = new ArrayList<Attribute>();
    for (i = 0; i < inst.numAttributes(); i++)
      atts.add(inst.attribute(i).copy(prefix + inst.attribute(i).name()));

    // data
    result = new Instances(inst.relationName(), atts, inst.numInstances());
    result.setClassIndex(inst.classIndex());
    for (i = 0; i < inst.numInstances(); i++)
      result.add((Instance) inst.instance(i).copy());

    return result;
  }

  /**
   * Prepares the data, prefixing attributes, removing columns, etc, before
   * merging it. Exclusion is applied first, prefixing second, so the regular
   * expression is always matched against the original attribute names.
   *
   * @param inst	the data to process
   * @param index	the 0-based index of the dataset being processed
   * @return		the prepared data
   */
  protected Instances prepareData(Instances inst, int index) {
    Instances	result;

    result = inst;

    // exclude attributes
    if (m_ExcludedAttributes.length() > 0)
      result = excludeAttributes(result, index);

    // prefix; must operate on the already filtered data, otherwise the
    // exclusion step above would be discarded
    if (m_UsePrefix)
      result = prefixAttributes(result, index);

    return result;
  }

  /**
   * Updates the IDs in the hashset with the ones stored in the ID attribute
   * of the provided dataset. The first dataset processed determines the
   * expected type of the ID attribute (numeric or string); all subsequent
   * datasets must use the same type.
   *
   * @param inst	the dataset to obtain the IDs from
   * @param ids		the hashset to store the IDs in
   * @throws IllegalStateException	if the ID attribute is missing, or is
   * 					neither numeric nor string, or differs
   * 					in type from previous datasets
   */
  protected void updateIDs(Instances inst, HashSet ids) {
    Attribute	att;
    int		i;

    att = inst.attribute(m_UniqueID);
    if (att == null)
      throw new IllegalStateException(
          "Attribute '" + m_UniqueID + "' not found in relation '" + inst.relationName() + "'!");

    // determine/check type
    if (m_AttType == -1) {
      if ((att.type() == Attribute.NUMERIC) || (att.type() == Attribute.STRING))
        m_AttType = att.type();
      else
        throw new IllegalStateException(
            "Attribute '" + m_UniqueID + "' must be either NUMERIC or STRING!");
    }
    else {
      if (m_AttType != att.type())
        throw new IllegalStateException(
            "Attribute '" + m_UniqueID + "' must have same attribute type in all the datasets!");
    }

    // get IDs
    for (i = 0; i < inst.numInstances(); i++) {
      if (m_AttType == Attribute.NUMERIC)
        ids.add(inst.instance(i).value(att));
      else
        ids.add(inst.instance(i).stringValue(att));
    }
  }

  /**
   * Merges the datasets based on the collected IDs. The merged header is the
   * concatenation of all attributes of the processed datasets; one row is
   * created per ID (initially all missing), with rows ordered by sorted ID.
   * <p/>
   * The ID of a row is always read from the original dataset, since the ID
   * attribute may have been removed or moved to a different position in the
   * processed dataset. This relies on the processed dataset having the same
   * rows in the same order as its original.
   *
   * @param orig	the original datasets
   * @param inst	the processed datasets to merge into one
   * @param ids		the IDs for identifying the rows
   * @return		the merged dataset, null if the actor was stopped
   * @throws IllegalStateException	if a row's ID cannot be located or an
   * 					attribute type is not supported
   */
  protected Instances merge(Instances[] orig, Instances[] inst, HashSet ids) {
    Instances			result;
    ArrayList<Attribute>	atts;
    int				i;
    int				n;
    int				m;
    int				index;
    String			relation;
    Vector			sortedIDs;
    Attribute			att;
    int[]			indexStart;
    double			value;
    double[]			values;
    Instance			row;
    Instance			idRow;

    // create header
    if (isDebugOn())
      debug("Creating merged header...");
    atts       = new ArrayList<Attribute>();
    relation   = "";
    indexStart = new int[inst.length];
    for (i = 0; i < inst.length; i++) {
      // offset of this dataset's first attribute in the merged header
      indexStart[i] = atts.size();
      for (n = 0; n < inst[i].numAttributes(); n++)
        atts.add((Attribute) inst[i].attribute(n).copy());
      // assemble relation name
      if (i > 0)
        relation += "_";
      relation += inst[i].relationName();
    }
    result = new Instances(relation, atts, ids.size());

    // fill with missing values
    if (isDebugOn())
      debug("Filling with missing values...");
    for (i = 0; i < ids.size(); i++) {
      if (isStopped())
        return null;
      // progress
      if (isDebugOn() && ((i+1) % 1000 == 0))
        debug("" + (i+1));
      result.add(new DenseInstance(result.numAttributes()));
    }

    // sort IDs
    if (isDebugOn())
      debug("Sorting indices...");
    sortedIDs = new Vector(ids);
    Collections.sort(sortedIDs);

    // generate rows
    for (i = 0; i < inst.length; i++) {
      if (isStopped())
        return null;
      if (isDebugOn())
        debug("Adding file #" + (i+1));
      att = orig[i].attribute(m_UniqueID);
      for (n = 0; n < inst[i].numInstances(); n++) {
        // progress
        if (isDebugOn() && ((n+1) % 1000 == 0))
          debug("" + (n+1));

        row   = inst[i].instance(n);
        // 'att' belongs to the original header, hence the ID must be read
        // from the original row (attribute lookup is index-based)
        idRow = orig[i].instance(n);

        // determine index of row
        if (m_AttType == Attribute.NUMERIC)
          index = Collections.binarySearch(sortedIDs, idRow.value(att));
        else
          index = Collections.binarySearch(sortedIDs, idRow.stringValue(att));
        if (index < 0)
          throw new IllegalStateException(
              "Failed to determine index for row #" + (n+1) + " of dataset #" + (i+1) + "!");

        // use internal representation for faster access
        values = result.instance(index).toDoubleArray();

        // add attribute values
        for (m = 0; m < inst[i].numAttributes(); m++) {
          // missing value?
          if (row.isMissing(m))
            continue;

          switch (inst[i].attribute(m).type()) {
            case Attribute.NUMERIC:
            case Attribute.DATE:
            case Attribute.NOMINAL:
              values[indexStart[i] + m] = row.value(m);
              break;

            case Attribute.STRING:
              // string values must be registered with the merged header
              value = result.attribute(indexStart[i] + m).addStringValue(row.stringValue(m));
              values[indexStart[i] + m] = value;
              break;

            case Attribute.RELATIONAL:
              // relational values must be registered with the merged header
              value = result.attribute(indexStart[i] + m).addRelation(row.relationalValue(m));
              values[indexStart[i] + m] = value;
              break;

            default:
              throw new IllegalStateException(
                  "Unhandled attribute type: " + inst[i].attribute(m).type());
          }
        }

        // update row
        result.set(index, new DenseInstance(1.0, values));
      }
    }

    return result;
  }

  /**
   * Executes the flow item. Turns the input payload into a list of files,
   * loads them and merges them either side-by-side (no unique ID attribute
   * configured) or based on the values of the unique ID attribute.
   *
   * @return		null if everything is fine, otherwise error message
   */
  protected String doExecute() {
    String	result;
    Object	payload;
    String[]	filesStr;
    File[]	files;
    int		i;
    Instances	output;
    Instances[]	orig;
    Instances[]	inst;
    HashSet	ids;
    int		max;

    result = null;

    // get filenames
    payload = m_InputToken.getPayload();
    if (payload instanceof String) {
      files = new File[]{new PlaceholderFile((String) payload)};
    }
    else if (payload instanceof String[]) {
      filesStr = (String[]) payload;
      files    = new File[filesStr.length];
      for (i = 0; i < filesStr.length; i++)
        files[i] = new PlaceholderFile(filesStr[i]);
    }
    else if (payload instanceof File) {
      files = new File[]{(File) payload};
    }
    else if (payload instanceof File[]) {
      files = (File[]) payload;
    }
    else {
      throw new IllegalStateException("Unhandled input type: " + payload.getClass());
    }

    // nothing to merge; avoids generating a token without payload
    if (files.length == 0)
      return "No files provided!";

    try {
      output = null;

      // simple merge
      if (m_UniqueID.length() == 0) {
        inst = new Instances[1];
        for (i = 0; i < files.length; i++) {
          if (isStopped())
            break;
          inst[0] = DataSource.read(files[i].getAbsolutePath());
          inst[0] = prepareData(inst[0], i);
          if (i == 0) {
            output = inst[0];
          }
          else {
            if (isDebugOn())
              debug("Merging with file #" + (i+1) + ": " + files[i]);
            output = Instances.mergeInstances(output, inst[0]);
          }
        }
      }
      // merge based on row IDs
      else {
        orig      = new Instances[files.length];
        inst      = new Instances[files.length];
        m_AttType = -1;
        max       = 0;
        for (i = 0; i < files.length; i++) {
          if (isStopped())
            break;
          if (isDebugOn())
            debug("Loading file #" + (i+1) + ": " + files[i]);
          orig[i] = DataSource.read(files[i].getAbsolutePath());
          max     = Math.max(max, orig[i].numInstances());
        }
        ids = new HashSet(max);
        for (i = 0; i < files.length; i++) {
          if (isStopped())
            break;
          if (isDebugOn())
            debug("Updating IDs #" + (i+1));
          updateIDs(orig[i], ids);
          if (isDebugOn())
            debug("Preparing dataset #" + (i+1));
          inst[i] = prepareData(orig[i], i);
        }
        // when stopped, the arrays are only partially filled and must not
        // be merged
        if (!isStopped())
          output = merge(orig, inst, ids);
      }

      if (!isStopped() && (output != null)) {
        m_OutputToken = new Token(output);
        updateProvenance(m_OutputToken);
      }
    }
    catch (Exception e) {
      getSystemErr().printStackTrace(e);
      result = e.toString();
    }

    return result;
  }

  /**
   * Updates the provenance information in the provided container, provided
   * that provenance is enabled.
   *
   * @param cont	the provenance container to update
   */
  public void updateProvenance(ProvenanceContainer cont) {
    if (Provenance.getSingleton().isEnabled())
      cont.addProvenance(new ProvenanceInformation(ActorType.DATAGENERATOR, m_InputToken.getPayload().getClass(), this, m_OutputToken.getPayload().getClass()));
  }
}
