/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * ExcelSAXSpreadSheetReader.java
 * Copyright (C) 2010-2012 University of Waikato, Hamilton, New Zealand
 * Copyright (C) Apache Foundation
 */
package adams.data.io.input;

import java.io.File;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Iterator;

import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

import adams.core.License;
import adams.core.annotation.MixedCopyright;
import adams.data.spreadsheet.Cell;
import adams.data.spreadsheet.Cell.ContentType;
import adams.data.spreadsheet.Row;
import adams.data.spreadsheet.SpreadSheet;

/**
 <!-- globalinfo-start -->
 * Reads large MS Excel XML files (using SAX).<br/>
 * Caveat: date&#47;time columns don't get imported correctly.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 * 
 * <pre>-D &lt;int&gt; (property: debugLevel)
 * &nbsp;&nbsp;&nbsp;The greater the number the more additional info the scheme may output to 
 * &nbsp;&nbsp;&nbsp;the console (0 = off).
 * &nbsp;&nbsp;&nbsp;default: 0
 * &nbsp;&nbsp;&nbsp;minimum: 0
 * </pre>
 * 
 * <pre>-missing &lt;java.lang.String&gt; (property: missingValue)
 * &nbsp;&nbsp;&nbsp;The placeholder for missing values.
 * &nbsp;&nbsp;&nbsp;default: ?
 * </pre>
 * 
 * <pre>-sheet &lt;adams.core.Index&gt; (property: sheetIndex)
 * &nbsp;&nbsp;&nbsp;The index of the sheet to load; An index is a number starting with 1; the 
 * &nbsp;&nbsp;&nbsp;following placeholders can be used as well: first, second, third, last_2,
 * &nbsp;&nbsp;&nbsp; last_1, last
 * &nbsp;&nbsp;&nbsp;default: first
 * </pre>
 * 
 * <pre>-no-auto-extend-header (property: autoExtendHeader)
 * &nbsp;&nbsp;&nbsp;If enabled, the header gets automatically extended if rows have more cells 
 * &nbsp;&nbsp;&nbsp;than the header.
 * </pre>
 * 
 * <pre>-text-columns &lt;java.lang.String&gt; (property: textColumns)
 * &nbsp;&nbsp;&nbsp;The range of columns to treat as text; A range is a comma-separated list 
 * &nbsp;&nbsp;&nbsp;of single 1-based indices or sub-ranges of indices ('start-end'); 'inv(..
 * &nbsp;&nbsp;&nbsp;.)' inverts the range '...'; the following placeholders can be used as well:
 * &nbsp;&nbsp;&nbsp; first, second, third, last_2, last_1, last
 * &nbsp;&nbsp;&nbsp;default: 
 * </pre>
 * 
 <!-- options-end -->
 *
 * @author  fracpete (fracpete at waikato dot ac dot nz)
 * @version $Revision: 6291 $
 */
@MixedCopyright(
    copyright = "Apache Foundation",
    license = License.APACHE2,
    url = "http://poi.apache.org/spreadsheet/how-to.html#xssf_sax_api",
    note = "Adapted example from Apache website"
)
public class ExcelSAXSpreadSheetReader
  extends AbstractExcelSpreadSheetReader {

  /** for serialization. */
  private static final long serialVersionUID = 4755872204697328246L;

  /**
   * SAX handler that transfers the cells of a single worksheet XML stream
   * into a spreadsheet.
   * <p/>
   * For every <code>c</code> (cell) element the reference (attribute
   * <code>r</code>) and type (attribute <code>t</code>) are recorded; the
   * text of the nested <code>v</code> (value) element is then stored in the
   * spreadsheet. Type <code>s</code> is interpreted as index into the shared
   * strings table, type <code>n</code> or a missing type as numeric value.
   * Any other type is reported once via the owner's error stream and the
   * cell gets added without content.
   *
   * @author  Apache Foundation (POI)
   * @author  fracpete (fracpete at waikato dot ac dot nz)
   * @version $Revision: 6291 $
   */
  @MixedCopyright(
      copyright = "Apache Foundation",
      license = License.APACHE2,
      url = "http://poi.apache.org/spreadsheet/how-to.html#xssf_sax_api",
      note = "Adapted example from Apache website"
  )
  public static class SheetHandler extends DefaultHandler {

    /** the reader this handler belongs to. */
    protected ExcelSAXSpreadSheetReader m_Owner;

    /** the spreadsheet to add the content to. */
    protected SpreadSheet m_Sheet;

    /** the table for shared strings. */
    protected SharedStringsTable m_SST;

    /** the character data collected for the current element. */
    protected StringBuilder m_LastContents;

    /** what type the current cell is (valid from cell start to cell end). */
    protected ContentType m_ContentType;

    /** the reference of the current cell (value of the "r" attribute). */
    protected String m_Reference;

    /** the unhandled cell types that have been reported already. */
    protected HashSet<String> m_UnhandledCellTypes;

    /**
     * Initializes the SAX handler.
     *
     * @param owner	the reader this handler belongs to
     * @param sheet	the spreadsheet to add the content to
     * @param sst	the table for shared strings
     */
    public SheetHandler(ExcelSAXSpreadSheetReader owner, SpreadSheet sheet, SharedStringsTable sst) {
      m_Owner              = owner;
      m_Sheet              = sheet;
      m_SST                = sst;
      m_LastContents       = new StringBuilder();
      m_Reference          = "";
      m_ContentType        = ContentType.MISSING;
      m_UnhandledCellTypes = new HashSet<String>();
    }

    /**
     * Receive notification of the start of an element. For a cell element
     * ("c") the reference and content type get recorded; for every element
     * the character buffer gets emptied.
     *
     * @param uri The Namespace URI, or the empty string if the
     *        element has no Namespace URI or if Namespace
     *        processing is not being performed.
     * @param localName The local name (without prefix), or the
     *        empty string if Namespace processing is not being
     *        performed.
     * @param name The qualified name (with prefix), or the
     *        empty string if qualified names are not available.
     * @param attributes The attributes attached to the element.  If
     *        there are no attributes, it shall be an empty
     *        Attributes object.
     * @throws org.xml.sax.SAXException Any SAX exception, possibly wrapping another exception.
     */
    @Override
    public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
      String	cellType;

      // c => cell
      if (name.equals("c")) {
        m_Reference = attributes.getValue("r");
        cellType    = attributes.getValue("t");
        if (cellType == null) {
          // cells without explicit type are treated as numeric
          m_ContentType = ContentType.DOUBLE;
        }
        else if (cellType.equals("s")) {
          // value is an index into the shared strings table
          m_ContentType = ContentType.STRING;
        }
        else if (cellType.equals("n")) {
          m_ContentType = ContentType.DOUBLE;
        }
        else {
          m_ContentType = ContentType.MISSING;
          // add() only returns true the first time, i.e., each type is reported once
          if (m_UnhandledCellTypes.add(cellType))
            m_Owner.getSystemErr().println("Unhandled cell type: " + cellType);
        }
      }

      // every element starts with an empty buffer
      m_LastContents.setLength(0);
    }

    /**
     * Receive notification of the end of an element. At the end of a value
     * element ("v") the collected characters get stored in the spreadsheet,
     * at the end of a cell element ("c") the content type gets reset.
     * <p/>
     * The content type is deliberately kept until the cell is closed:
     * other children of the cell that precede the value (e.g., a formula
     * element) must not discard the type of the cell. Any error while
     * storing a value is reported via the owner's error stream and parsing
     * continues with the next element.
     *
     * @param uri The Namespace URI, or the empty string if the
     *        element has no Namespace URI or if Namespace
     *        processing is not being performed.
     * @param localName The local name (without prefix), or the
     *        empty string if Namespace processing is not being
     *        performed.
     * @param name The qualified name (with prefix), or the
     *        empty string if qualified names are not available.
     * @throws org.xml.sax.SAXException Any SAX exception, possibly wrapping another exception.
     */
    @Override
    public void endElement(String uri, String localName, String name) throws SAXException {
      int[]	loc;
      Row	row;
      Cell	cell;
      String	content;

      // v => contents of a cell
      // Processed only now, as characters() may be called more than once
      if (name.equals("v")) {
        try {
          content = m_LastContents.toString();
          // shared string: the collected text is the index into the table
          if (m_ContentType == ContentType.STRING)
            content = new XSSFRichTextString(m_SST.getEntryAt(Integer.parseInt(content))).toString();

          loc = SpreadSheet.getCellLocation(m_Reference);
          // fill in rows, if necessary
          while (m_Sheet.getRowCount() < loc[0])
            m_Sheet.addRow();

          if (loc[0] == 0) {
            // row 0 of the location ends up in the header row
            row = m_Sheet.getHeaderRow();
            row.addCell("" + loc[1]).setContent(content);
          }
          else {
            row  = m_Sheet.getRow(loc[0] - 1);
            cell = row.addCell("" + loc[1]);
            switch (m_ContentType) {
              case STRING:
                cell.setContentAsString(content);
                break;
              case DOUBLE:
                cell.setContent(Double.valueOf(content));
                break;
              default:
                // unhandled type: the cell stays without content
                break;
            }
          }
        }
        catch (Exception e) {
          m_Owner.getSystemErr().printStackTrace(
              "Failed to set cell content at " + m_Reference + ":", e);
        }
      }
      // c => the cell is complete, forget its type
      else if (name.equals("c")) {
        m_ContentType = ContentType.MISSING;
      }
    }

    /**
     * Receive notification of character data inside an element. The
     * characters get appended to the buffer, which gets evaluated when the
     * element ends.
     *
     * @param ch The characters.
     * @param start The start position in the character array.
     * @param length The number of characters to use from the
     *               character array.
     * @throws org.xml.sax.SAXException Any SAX exception, possibly
     *            wrapping another exception.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
      m_LastContents.append(ch, start, length);
    }
  }
	
  /**
   * Returns a short description of this reader, including its known
   * limitation regarding date/time columns.
   *
   * @return 			a description suitable for displaying in the gui
   */
  @Override
  public String globalInfo() {
    final String summary = "Reads large MS Excel XML files (using SAX).\n";
    final String caveat  = "Caveat: date/time columns don't get imported correctly.";

    return summary + caveat;
  }

  /**
   * Returns the description of the file format, as shown in the file chooser.
   *
   * @return 			the description of the format
   */
  @Override
  public String getFormatDescription() {
    final String description = "MS Excel spreadsheets (large XML)";

    return description;
  }

  /**
   * Returns the file extension(s) associated with the format.
   *
   * @return 			the extension(s), without the leading dot
   */
  @Override
  public String[] getFormatExtensions() {
    final String[] extensions = new String[1];

    extensions[0] = "xlsx";

    return extensions;
  }

  /**
   * Returns what kind of input this reader processes; this reader
   * reads from files.
   *
   * @return		the input type, always {@link InputType#FILE}
   */
  @Override
  protected InputType getInputType() {
    final InputType type = InputType.FILE;

    return type;
  }

  /**
   * Reads the spreadsheet content from the specified file.
   * <p/>
   * The file gets opened as OPC package and the data of its sheets is
   * iterated; each sheet whose position matches the sheet index gets
   * parsed with a {@link SheetHandler} that fills the spreadsheet.
   * The sheet streams and the package are released again, even if parsing
   * fails.
   *
   * @param file	the file to read from
   * @return		the spreadsheet; if an error occurs, the error gets
   * 			output and the spreadsheet contains whatever has
   * 			been read up to that point
   */
  @Override
  protected SpreadSheet doRead(File file) {
    SpreadSheet			result;
    int				count;
    OPCPackage 			pkg;
    XSSFReader 			reader;
    SharedStringsTable 		sst;
    XMLReader 			parser;
    ContentHandler 		handler;
    Iterator<InputStream> 	sheets;
    InputStream 		sheet;
    InputSource 		sheetSource;

    result = new SpreadSheet();
    result.setDataRowClass(m_DataRowType.getRowClass());

    pkg = null;
    try {
      pkg     = OPCPackage.open(file.getAbsolutePath());
      reader  = new XSSFReader(pkg);
      sst     = reader.getSharedStringsTable();
      parser  = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
      handler = new SheetHandler(this, result, sst);
      parser.setContentHandler(handler);
      sheets  = reader.getSheetsData();
      count   = 0;
      while (sheets.hasNext()) {
        if (m_Stopped)
          break;
        // NOTE(review): the maximum grows with every sheet, so a relative
        // index such as "last" presumably matches more than one sheet - confirm
        m_SheetIndex.setMax(count + 1);
        sheet = sheets.next();
        try {
          if (m_SheetIndex.getIntIndex() == count) {
            sheetSource = new InputSource(sheet);
            // TODO how to stop parsing?
            parser.parse(sheetSource);
          }
        }
        finally {
          // close the stream even if parsing failed
          sheet.close();
        }
        count++;
      }
    }
    catch (Exception e) {
      getSystemErr().printStackTrace("Failed to read spreadsheet:", e);
    }
    finally {
      if (pkg != null) {
        try {
          // revert() releases the package without saving it; the file is
          // only read here, so nothing must be written back
          pkg.revert();
        }
        catch (Exception e) {
          getSystemErr().printStackTrace("Failed to close spreadsheet file:", e);
        }
      }
    }

    return result;
  }
}
