/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * TwitterConverter.java
 * Copyright (C) 2010-2012 University of Waikato, Hamilton, New Zealand
 */

package adams.flow.transformer;

import java.util.ArrayList;
import java.util.Hashtable;

import twitter4j.Status;
import twitter4j.Tweet;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import adams.core.Constants;
import adams.core.DateFormat;
import adams.core.TechnicalInformation;
import adams.core.TechnicalInformationHandler;
import adams.core.Utils;
import adams.flow.core.Token;

/**
 <!-- globalinfo-start -->
 * Turns a Twitter tweet or status into different representation.<br/>
 * Please note that tweet and status objects differ in what fields are available:<br/>
 * - tweet-specific: LANGUAGE_CODE<br/>
 * - status-specific: PLACE, COUNTRY, COUNTRY_CODE<br/>
 * Also, GEO location data might not be available.<br/>
 * For more information on the date format, see:<br/>
 * Javadoc. java.text.SimpleDateFormat.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * &#64;misc{missing_id,
 *    author = {Javadoc},
 *    title = {java.text.SimpleDateFormat},
 *    HTTP = {http:&#47;&#47;download.oracle.com&#47;javase&#47;1,5.0&#47;docs&#47;api&#47;java&#47;text&#47;SimpleDateFormat.html}
 * }
 * </pre>
 * <p/>
 <!-- technical-bibtex-end -->
 *
 <!-- flow-summary-start -->
 * Input&#47;output:<br/>
 * - accepts:<br/>
 * &nbsp;&nbsp;&nbsp;twitter4j.Tweet<br/>
 * &nbsp;&nbsp;&nbsp;twitter4j.Status<br/>
 * - generates:<br/>
 * &nbsp;&nbsp;&nbsp;java.lang.String<br/>
 * <p/>
 <!-- flow-summary-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre>-D &lt;int&gt; (property: debugLevel)
 * &nbsp;&nbsp;&nbsp;The greater the number the more additional info the scheme may output to
 * &nbsp;&nbsp;&nbsp;the console (0 = off).
 * &nbsp;&nbsp;&nbsp;default: 0
 * &nbsp;&nbsp;&nbsp;minimum: 0
 * </pre>
 *
 * <pre>-name &lt;java.lang.String&gt; (property: name)
 * &nbsp;&nbsp;&nbsp;The name of the actor.
 * &nbsp;&nbsp;&nbsp;default: TwitterConverter
 * </pre>
 *
 * <pre>-annotation &lt;adams.core.base.BaseText&gt; (property: annotations)
 * &nbsp;&nbsp;&nbsp;The annotations to attach to this actor.
 * &nbsp;&nbsp;&nbsp;default:
 * </pre>
 *
 * <pre>-skip (property: skip)
 * &nbsp;&nbsp;&nbsp;If set to true, transformation is skipped and the input token is just forwarded
 * &nbsp;&nbsp;&nbsp;as it is.
 * </pre>
 *
 * <pre>-stop-flow-on-error (property: stopFlowOnError)
 * &nbsp;&nbsp;&nbsp;If set to true, the flow gets stopped in case this actor encounters an error;
 * &nbsp;&nbsp;&nbsp; useful for critical actors.
 * </pre>
 *
 * <pre>-output-type &lt;STRING|INSTANCE&gt; (property: outputType)
 * &nbsp;&nbsp;&nbsp;The type of output to generate.
 * &nbsp;&nbsp;&nbsp;default: STRING
 * </pre>
 *
 * <pre>-field &lt;ID|USER_ID|USER_NAME|SOURCE|TEXT|CREATED|GEO_LATITUDE|GEO_LONGITUDE|LANGUAGE_CODE|PLACE|COUNTRY|COUNTRY_CODE&gt; [-field ...] (property: fields)
 * &nbsp;&nbsp;&nbsp;The fields to use for generating the output.
 * &nbsp;&nbsp;&nbsp;default: TEXT
 * </pre>
 *
 * <pre>-separator &lt;java.lang.String&gt; (property: separator)
 * &nbsp;&nbsp;&nbsp;The separator to use when generating strings as output; tab, new line, carriage
 * &nbsp;&nbsp;&nbsp;return and backslash need to be escaped, ie, '\t', '\n', '\r', '\\'.
 * &nbsp;&nbsp;&nbsp;default: \t
 * </pre>
 *
 * <pre>-quote (property: quote)
 * &nbsp;&nbsp;&nbsp;If enabled all sub-strings are quoted if necessary when generating string
 * &nbsp;&nbsp;&nbsp;output.
 * </pre>
 *
 * <pre>-date-format &lt;java.lang.String&gt; (property: dateFormat)
 * &nbsp;&nbsp;&nbsp;The format for the dates.
 * &nbsp;&nbsp;&nbsp;default: yyyy-MM-dd HH:mm:ss
 * </pre>
 *
 <!-- options-end -->
 *
 * @author  fracpete (fracpete at waikato dot ac dot nz)
 * @version $Revision: 5867 $
 */
public class TwitterConverter
  extends AbstractTransformer
  implements TechnicalInformationHandler {

  /** for serialization. */
  private static final long serialVersionUID = -4249772734326614365L;

  /**
   * The output to generate.
   *
   * @author  fracpete (fracpete at waikato dot ac dot nz)
   * @version $Revision: 5867 $
   */
  public enum OutputType {
    /** simple string, fields joined by the separator. */
    STRING,
    /** Weka instance, one attribute per field. */
    INSTANCE
  }

  /**
   * The available fields for generating the output.
   *
   * @author  fracpete (fracpete at waikato dot ac dot nz)
   * @version $Revision: 5867 $
   */
  public enum TwitterField {
    /** the ID of the tweet/status. */
    ID,
    /** the user ID. */
    USER_ID,
    /** the user name. */
    USER_NAME,
    /** the source. */
    SOURCE,
    /** the text of the tweet/status. */
    TEXT,
    /** the creation date. */
    CREATED,
    /** the associated latitude. */
    GEO_LATITUDE,
    /** the associated longitude. */
    GEO_LONGITUDE,
    /** the language code (only tweet). */
    LANGUAGE_CODE,
    /** the place (only status). */
    PLACE,
    /** the country (only status). */
    COUNTRY,
    /** the country code (only status). */
    COUNTRY_CODE
  }

  /** the key for storing the current header in the backup. */
  public final static String BACKUP_HEADER = "header";

  /** the type of output to generate. */
  protected OutputType m_OutputType;

  /** the fields to generate the output from. */
  protected TwitterField[] m_Fields;

  /** the separator, when generating a string. */
  protected String m_Separator;

  /** whether to quote strings. */
  protected boolean m_Quote;

  /** the dataset header when generating Instance objects (created lazily). */
  protected Instances m_Header;

  /** the format for dates. */
  protected String m_DateFormat;

  /** for formatting/parsing dates (created lazily from the date format). */
  protected transient DateFormat m_DateFormatter;

  /**
   * Returns a string describing the object.
   *
   * @return 			a description suitable for displaying in the gui
   */
  @Override
  public String globalInfo() {
    return
        "Turns a Twitter tweet or status into different representation.\n"
      + "Please note that tweet and status objects differ in what fields are "
      + "available:\n"
      + "- tweet-specific: " + TwitterField.LANGUAGE_CODE + "\n"
      + "- status-specific: " + TwitterField.PLACE + ", " + TwitterField.COUNTRY + ", " + TwitterField.COUNTRY_CODE + "\n"
      + "Also, GEO location data might not be available.\n"
      + "For more information on the date format, see:\n"
      + getTechnicalInformation().toString();
  }

  /**
   * Returns an instance of a TechnicalInformation object, containing
   * detailed information about the technical background of this class,
   * e.g., paper reference or book this class is based on. Here, the
   * information is taken from a default {@link DateFormat} instance.
   *
   * @return the technical information about this class
   */
  public TechnicalInformation getTechnicalInformation() {
    return new DateFormat().getTechnicalInformation();
  }

  /**
   * Adds options to the internal list of options.
   */
  @Override
  public void defineOptions() {
    super.defineOptions();

    m_OptionManager.add(
        "output-type", "outputType",
        OutputType.STRING);

    m_OptionManager.add(
        "field", "fields",
        new TwitterField[]{TwitterField.TEXT});

    m_OptionManager.add(
        "separator", "separator",
        "\t");

    m_OptionManager.add(
        "quote", "quote",
        false);

    m_OptionManager.add(
        "date-format", "dateFormat",
        Constants.TIMESTAMP_FORMAT);
  }

  /**
   * Initializes the members: neither header nor date formatter exist yet,
   * both get created on demand.
   */
  @Override
  protected void initialize() {
    super.initialize();

    m_Header        = null;
    m_DateFormatter = null;
  }

  /**
   * Sets the type of output to generate.
   *
   * @param value	the type
   */
  public void setOutputType(OutputType value) {
    m_OutputType = value;
    reset();
  }

  /**
   * Returns the type of output to generate.
   *
   * @return		the type
   */
  public OutputType getOutputType() {
    return m_OutputType;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the GUI or for listing the options.
   */
  public String outputTypeTipText() {
    return "The type of output to generate.";
  }

  /**
   * Sets fields to generate the output from. Discards any cached dataset
   * header, since the header's attributes are derived from the fields.
   *
   * @param value	the fields
   */
  public void setFields(TwitterField[] value) {
    m_Fields = value;
    // a header built for the previous fields would silently drop new fields
    m_Header = null;
    reset();
  }

  /**
   * Returns the fields to generate the output from.
   *
   * @return		the fields
   */
  public TwitterField[] getFields() {
    return m_Fields;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the GUI or for listing the options.
   */
  public String fieldsTipText() {
    return "The fields to use for generating the output.";
  }

  /**
   * Sets the separator to use. \t, \n, \r, \\ must be quoted.
   *
   * @param value	the separator (in its escaped form)
   */
  public void setSeparator(String value) {
    m_Separator = Utils.unbackQuoteChars(value);
    reset();
  }

  /**
   * Returns the separator in use. \t, \r, \n, \\ get returned quoted.
   *
   * @return		the separator (in its escaped form)
   */
  public String getSeparator() {
    return Utils.backQuoteChars(m_Separator);
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the GUI or for listing the options.
   */
  public String separatorTipText() {
    // the backslashes are doubled so that the help text shows the escape
    // sequences themselves rather than embedding raw control characters
    return
        "The separator to use when generating strings as output; tab, "
      + "new line, carriage return and backslash need to be escaped, ie, "
      + "'\\t', '\\n', '\\r', '\\\\'.";
  }

  /**
   * Sets whether to quote the sub-strings when generating string output.
   *
   * @param value	if true then the sub-strings get quoted
   */
  public void setQuote(boolean value) {
    m_Quote = value;
    reset();
  }

  /**
   * Returns whether to quote the sub-strings when generating string output.
   *
   * @return		true if quoting enabled
   */
  public boolean getQuote() {
    return m_Quote;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the GUI or for listing the options.
   */
  public String quoteTipText() {
    return
        "If enabled all sub-strings are quoted if necessary when generating "
      + "string output.";
  }

  /**
   * Sets the date format. See
   * <a href="http://java.sun.com/j2se/1.6.0/docs/api/java/text/SimpleDateFormat.html"
   * target="_blank">SimpleDateFormat</a> for more information.
   * Discards the cached date formatter and dataset header, as both are
   * created from the date format.
   *
   * @param value	the date format
   */
  public void setDateFormat(String value) {
    m_DateFormat = value;
    // force getDateFormatter()/generateHeader() to pick up the new pattern
    m_DateFormatter = null;
    m_Header        = null;
    reset();
  }

  /**
   * Returns the current date format.
   *
   * @return		the date format
   */
  public String getDateFormat() {
    return m_DateFormat;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return 		tip text for this property suitable for
   * 			displaying in the GUI or for listing the options.
   */
  public String dateFormatTipText() {
    return "The format for the dates.";
  }

  /**
   * Returns a quick info about the actor, which will be displayed in the GUI:
   * the output type followed by the comma-separated list of fields.
   *
   * @return		the short info string
   */
  @Override
  public String getQuickInfo() {
    StringBuilder	result;
    int			i;

    result = new StringBuilder();
    result.append(m_OutputType).append(": ");
    for (i = 0; i < m_Fields.length; i++) {
      if (i > 0)
        result.append(", ");
      result.append(m_Fields[i].toString());
    }

    return result.toString();
  }

  /**
   * Returns the class that the consumer accepts.
   *
   * @return		<!-- flow-accepts-start -->twitter4j.Tweet.class, twitter4j.Status.class<!-- flow-accepts-end -->
   */
  public Class[] accepts() {
    return new Class[]{Tweet.class, Status.class};
  }

  /**
   * Returns the class of objects that it generates, which depends on the
   * current output type: java.lang.String.class for {@link OutputType#STRING}
   * and weka.core.Instance.class for {@link OutputType#INSTANCE}.
   *
   * @return		<!-- flow-generates-start -->java.lang.String.class<!-- flow-generates-end -->
   * 			(default output type) or weka.core.Instance.class
   * @throws IllegalStateException	if the output type is not handled
   */
  public Class[] generates() {
    switch (m_OutputType) {
      case STRING:
        return new Class[]{String.class};

      case INSTANCE:
        return new Class[]{Instance.class};

      default:
        throw new IllegalStateException("Unhandled output type: " + m_OutputType);
    }
  }

  /**
   * Removes entries from the backup.
   */
  @Override
  protected void pruneBackup() {
    super.pruneBackup();

    pruneBackup(BACKUP_HEADER);
  }

  /**
   * Backs up the current state of the actor before update the variables.
   * The dataset header is included, if one has been generated.
   *
   * @return		the backup
   */
  @Override
  protected Hashtable<String,Object> backupState() {
    Hashtable<String,Object>	result;

    result = super.backupState();

    // Hashtable does not accept null values, hence the check
    if (m_Header != null)
      result.put(BACKUP_HEADER, m_Header);

    return result;
  }

  /**
   * Restores the state of the actor before the variables got updated.
   * Takes the dataset header out of the backup (if present) before handing
   * the remaining state to the super class.
   *
   * @param state	the backup of the state to restore from
   */
  @Override
  protected void restoreState(Hashtable<String,Object> state) {
    if (state.containsKey(BACKUP_HEADER)) {
      m_Header = (Instances) state.get(BACKUP_HEADER);
      state.remove(BACKUP_HEADER);
    }

    super.restoreState(state);
  }

  /**
   * Returns the date formatter to use. The (non-lenient) formatter gets
   * created from the current date format on first use and is cached.
   *
   * @return		the formatter
   */
  protected DateFormat getDateFormatter() {
    if (m_DateFormatter == null) {
      m_DateFormatter = new DateFormat(m_DateFormat);
      m_DateFormatter.setLenient(false);
    }

    return m_DateFormatter;
  }

  /**
   * Generates the dataset header, if necessary. Does nothing if a header
   * is already present. Each field becomes one attribute, named after the
   * field: numeric for IDs and geo coordinates, string for textual fields
   * and a date attribute (using the current date format) for the creation
   * date. The dataset is named after the actor.
   */
  protected void generateHeader() {
    ArrayList<Attribute>	atts;
    String			name;

    if (m_Header != null)
      return;

    atts = new ArrayList<Attribute>(m_Fields.length);
    for (TwitterField field: m_Fields) {
      name = field.toString();
      switch (field) {
        case ID:
        case USER_ID:
        case GEO_LATITUDE:
        case GEO_LONGITUDE:
          // numeric attribute
          atts.add(new Attribute(name));
          break;

        case USER_NAME:
        case SOURCE:
        case TEXT:
        case LANGUAGE_CODE:
        case PLACE:
        case COUNTRY:
        case COUNTRY_CODE:
          // a null list of labels selects the string attribute constructor
          atts.add(new Attribute(name, (ArrayList<String>) null));
          break;

        case CREATED:
          atts.add(new Attribute(name, m_DateFormat));
          break;
      }
    }

    m_Header = new Instances(getName(), atts, 0);
  }

  /**
   * Stores the value under the given field, unless the value is null
   * (Hashtable rejects null values with a NullPointerException).
   *
   * @param target	the table to add the value to
   * @param field	the field to store the value under
   * @param value	the value to store, ignored if null
   */
  private static void putIfPresent(Hashtable<TwitterField,Object> target, TwitterField field, Object value) {
    if (value != null)
      target.put(field, value);
  }

  /**
   * Processes the specified tweet. Values that the tweet does not provide
   * (i.e., getters returning null) are simply left out of the result.
   *
   * @param tweet	the tweet to process
   * @return		the association between fields and tweet values
   */
  protected Hashtable<TwitterField,Object> processTweet(Tweet tweet) {
    Hashtable<TwitterField,Object>	result;

    result = new Hashtable<TwitterField,Object>();

    result.put(TwitterField.ID, tweet.getId());
    result.put(TwitterField.USER_ID, tweet.getFromUserId());
    putIfPresent(result, TwitterField.USER_NAME, tweet.getFromUser());
    putIfPresent(result, TwitterField.SOURCE, tweet.getSource());
    putIfPresent(result, TwitterField.TEXT, tweet.getText());
    if (tweet.getCreatedAt() != null)
      result.put(TwitterField.CREATED, getDateFormatter().format(tweet.getCreatedAt()));
    if (tweet.getGeoLocation() != null) {
      result.put(TwitterField.GEO_LATITUDE, tweet.getGeoLocation().getLatitude());
      result.put(TwitterField.GEO_LONGITUDE, tweet.getGeoLocation().getLongitude());
    }
    putIfPresent(result, TwitterField.LANGUAGE_CODE, tweet.getIsoLanguageCode());

    return result;
  }

  /**
   * Processes the specified status. Values that the status does not provide
   * (i.e., getters returning null) are simply left out of the result.
   *
   * @param status	the status to process
   * @return		the association between fields and status values
   */
  protected Hashtable<TwitterField,Object> processStatus(Status status) {
    Hashtable<TwitterField,Object>	result;

    result = new Hashtable<TwitterField,Object>();

    result.put(TwitterField.ID, status.getId());
    if (status.getUser() != null) {
      result.put(TwitterField.USER_ID, status.getUser().getId());
      putIfPresent(result, TwitterField.USER_NAME, status.getUser().getName());
    }
    putIfPresent(result, TwitterField.SOURCE, status.getSource());
    putIfPresent(result, TwitterField.TEXT, status.getText());
    if (status.getCreatedAt() != null)
      result.put(TwitterField.CREATED, getDateFormatter().format(status.getCreatedAt()));
    if (status.getGeoLocation() != null) {
      result.put(TwitterField.GEO_LATITUDE, status.getGeoLocation().getLatitude());
      result.put(TwitterField.GEO_LONGITUDE, status.getGeoLocation().getLongitude());
    }
    if (status.getPlace() != null) {
      putIfPresent(result, TwitterField.COUNTRY_CODE, status.getPlace().getCountryCode());
      putIfPresent(result, TwitterField.COUNTRY, status.getPlace().getCountry());
      putIfPresent(result, TwitterField.PLACE, status.getPlace().getName());
    }

    return result;
  }

  /**
   * Turns the field values into a single string: the values of the selected
   * fields in order, joined by the separator, with "?" standing in for
   * values that are not available. Values get quoted if quoting is enabled.
   *
   * @param fields	the data to use
   * @return		the generated string
   */
  private String fieldsToString(Hashtable<TwitterField,Object> fields) {
    StringBuilder	str;
    Object		obj;
    int			i;

    str = new StringBuilder();
    for (i = 0; i < m_Fields.length; i++) {
      if (i > 0)
        str.append(m_Separator);
      obj = fields.get(m_Fields[i]);
      if (obj == null)
        str.append("?");
      else if (m_Quote)
        str.append(Utils.quote(String.valueOf(obj)));
      else
        str.append(String.valueOf(obj));
    }

    return str.toString();
  }

  /**
   * Turns the field values into a Weka instance that belongs to the
   * (lazily generated) dataset header. Fields without a value or without
   * a matching attribute are left as missing values.
   *
   * @param fields	the data to use
   * @return		the generated instance
   */
  private Instance fieldsToInstance(Hashtable<TwitterField,Object> fields) {
    Instance	inst;
    Attribute	att;
    Object	obj;

    generateHeader();
    inst = new DenseInstance(m_Header.numAttributes());
    inst.setDataset(m_Header);
    for (TwitterField field: m_Fields) {
      att = m_Header.attribute(field.toString());
      obj = fields.get(field);
      if ((att == null) || (obj == null))
        continue;
      switch (field) {
        case ID:
        case USER_ID:
        case GEO_LATITUDE:
        case GEO_LONGITUDE:
          // Number rather than Long/Integer/Double: works regardless of the
          // primitive width that the twitter4j getters return.
          // NOTE(review): large IDs cannot be represented exactly as double.
          inst.setValue(att, ((Number) obj).doubleValue());
          break;

        case USER_NAME:
        case SOURCE:
        case TEXT:
        case LANGUAGE_CODE:
        case PLACE:
        case COUNTRY:
        case COUNTRY_CODE:
          inst.setValue(att, (String) obj);
          break;

        case CREATED:
          // the date was stored as formatted string, parse it back
          try {
            inst.setValue(att, (double) getDateFormatter().parse((String) obj).getTime());
          }
          catch (Exception e) {
            // value stays missing
            handleException(
                "Failed to parse date '" + obj + "' with pattern "
                + "'" + m_DateFormat + "':", e);
          }
          break;
      }
    }

    return inst;
  }

  /**
   * Creates an output token from the given fields, either wrapping a
   * string or a Weka instance, depending on the output type.
   *
   * @param fields	the data to use
   * @return		the generated token
   * @throws IllegalStateException	if the output type is not handled
   */
  protected Token processFields(Hashtable<TwitterField,Object> fields) {
    switch (m_OutputType) {
      case STRING:
        return new Token(fieldsToString(fields));

      case INSTANCE:
        return new Token(fieldsToInstance(fields));

      default:
        throw new IllegalStateException("Unhandled output type: " + m_OutputType);
    }
  }

  /**
   * Executes the flow item: extracts the field values from the incoming
   * tweet or status and turns them into the output token.
   *
   * @return		null if everything is fine, otherwise error message
   */
  @Override
  protected String doExecute() {
    String				result;
    Hashtable<TwitterField,Object>	fields;
    Object				payload;

    result = null;

    try {
      payload = m_InputToken.getPayload();
      // anything that is not a tweet is expected to be a status
      if (payload instanceof Tweet)
        fields = processTweet((Tweet) payload);
      else
        fields = processStatus((Status) payload);
      m_OutputToken = processFields(fields);
    }
    catch (Exception e) {
      result = handleException("Failed to process fields: " + Utils.arrayToString(m_Fields), e);
    }

    return result;
  }
}
