Changeset 7655


Ignore:
Timestamp:
Mar 19, 2019, 11:09:07 AM (4 years ago)
Author:
Nicklas Nordborg
Message:

References #2161: Import data from Excel files

Moved auto-detection of Excel files into the FlatFileParser and added some methods for date/timestamp formatting with sensible defaults. This should make it possible for more or less all plug-ins or code that uses the FlatFileParser to import from Excel files (via auto-conversion to CSV) as long as the data is in the first sheet. To get access to other sheets the 'charset' must specify the sheet name or index instead.

Location:
trunk
Files:
1 added
5 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/core/net/sf/basedb/util/excel/XlsxToCsvUtil.java

    r7654 r7655  
    1010import java.util.List;
    1111
     12import org.apache.poi.poifs.filesystem.FileMagic;
    1213import org.apache.poi.ss.usermodel.Cell;
    1314import org.apache.poi.ss.usermodel.CellType;
     
    5758  }
    5859 
     60  /**
     61    Check if the given stream is an Excel file by looking at the first few bytes.
     62    The stream must support seek(). Call {@link #prepareForCheck(InputStream)}
     63    before calling this method if it is not sure.
     64  */
     65  public static boolean isAnExcelFile(InputStream in)
     66  {
     67    try
     68    {
     69      return FileMagic.valueOf(in) == FileMagic.OOXML;
     70    }
     71    catch (Exception ex)
     72    {}
     73    return false;
     74  }
     75 
     76  /**
     77    Prepare the input stream for checking with
     78    {@link #isAnExcelFile(InputStream)}. If the given stream
     79    supports seek() it is returned as it is, otherwise it is
     80    wrapped by a buffered stream.
     81  */
     82  public static InputStream prepareForCheck(InputStream in)
     83  {
     84    return FileMagic.prepareToCheckMagic(in);
     85  }
     86
    5987  private Charset charset;
    6088  private String columnSeparator;
     
    6492  private EncoderDecoder encoder;
    6593  private Formatter<Date> dateFormat;
     94  private Formatter<Date> timestampFormat;
    6695  private Formatter<Number> numberFormat;
    6796 
     
    192221    this.dateFormat = dateFormat;
    193222  }
     223 
     224  /**
     225    Get the date formatter that is used for formatting cells with timestamp-like values.
     226    If null, the default implementation is used (which may not be predicatable).
     227  */
     228  public Formatter<Date> getTimestampFormat()
     229  {
     230    return timestampFormat;
     231  }
     232 
     233  /**
     234    Set a date formatter to use for formatting cells with timestamp-like values.
     235    If null, the default implementation is used (which may not be predicatable).
     236  */
     237  public void setTimestampFormat(Formatter<Date> timestampFormat)
     238  {
     239    this.timestampFormat = timestampFormat;
     240  }
     241
    194242 
    195243  /**
     
    405453              if (isDate)
    406454              {
    407                 if (dateFormat != null)
     455                String formatString = cell.getCellStyle().getDataFormatString();
     456                boolean hasTimePart = hasTimePart(formatString);
     457                boolean hasDatePart = hasDatePart(formatString);         
     458                Date d = cell.getDateCellValue();
     459                if (hasDatePart && hasTimePart)
    408460                {
    409                   value = dateFormat.format(cell.getDateCellValue());
     461                  if (timestampFormat != null) value = timestampFormat.format(d);
     462                }
     463                else if (hasDatePart)
     464                {
     465                  if (dateFormat != null) value = dateFormat.format(d);
     466                }
     467                else
     468                {
     469                  // Only time part -- let it be handled by the DataFormat
    410470                }
    411471              }
     
    431491  }
    432492 
    433  
     493  private boolean hasTimePart(String formatString)
     494  {
     495    if (formatString == null) return false;
     496    return formatString.contains("S") || formatString.contains("H");
     497  }
     498 
     499  private boolean hasDatePart(String formatString)
     500  {
     501    if (formatString == null) return false;
     502    return formatString.contains("Y") || formatString.contains("D");
     503  }
     504
    434505}
  • trunk/src/core/net/sf/basedb/util/parser/FlatFileParser.java

    r7627 r7655  
    3434import java.util.regex.Matcher;
    3535import java.util.regex.PatternSyntaxException;
     36
     37
    3638import java.util.Arrays;
    3739import java.util.LinkedList;
     
    4244import java.util.Set;
    4345import java.util.Collections;
     46import java.util.Date;
    4447
    4548import net.sf.basedb.core.BaseException;
     
    4750import net.sf.basedb.core.signal.ThreadSignalHandler;
    4851import net.sf.basedb.util.InputStreamTracker;
     52import net.sf.basedb.util.NumberFormatUtil;
     53import net.sf.basedb.util.Values;
    4954import net.sf.basedb.util.charset.CharsetUtil;
     55import net.sf.basedb.util.excel.XlsxToCsvUtil;
     56import net.sf.basedb.util.formatter.DateFormatter;
     57import net.sf.basedb.util.formatter.Formatter;
     58import net.sf.basedb.util.formatter.NumberFormatFormatter;
    5059import net.sf.basedb.util.jep.JepFunction;
    5160
     
    270279 
    271280  /**
     281    The default date formatter to use when creating mappers.
     282  */
     283  private Formatter<Date> dateFormat = null;
     284 
     285  /**
     286    The default timestamp format to use when creating mappers.
     287  */
     288  private Formatter<Date> timestampFormat = null;
     289
     290  /**
    272291    The value that was captured by the bofMarker pattern.
    273292  */
     
    545564 
    546565  /**
    547     Set the input stream that will be parsed.
     566    Set the input stream that will be parsed. The stream can be either a
     567    text CSV-like stream or an Excel workbook (xlsx).
     568   
     569    If the stream is an Excel workbook the following apply:
     570   
     571    * If no date format has been specified, yyyy-MM-dd is used
     572    * If no timestamp format has been specified, yyyy-MM-dd hh:mm:ss is used
     573    * If no number format has been specified, 'dot' is used
     574    * The data splitter regular expression is changed to \\t
     575   
    548576    @param in The <code>InputStream</code>
    549     @param charsetName The name of the character set to use when parsing
    550       the file, or null to use the default charset specified by
     577    @param charsetOrSheetName If CSV, the name of the character set to
     578      use when parsing the file, or null to use the default charset specified by
    551579      {@link Config#getCharset()}
     580      If Excel, the name or index of the worksheet in the workbook, the default is
     581      to parse the first sheet (index=0)
    552582    @since 2.1.1
    553583  */
    554   public void setInputStream(InputStream in, String charsetName)
    555   {
     584  public void setInputStream(InputStream in, String charsetOrSheetName)
     585  {
     586    in = XlsxToCsvUtil.prepareForCheck(in);
     587    String charsetName = charsetOrSheetName;
     588    if (XlsxToCsvUtil.isAnExcelFile(in))
     589    {
     590      XlsxToCsvUtil xlsUtil = new XlsxToCsvUtil();
     591      if (dateFormat == null) dateFormat = new DateFormatter("yyyy-MM-dd");
     592      if (timestampFormat == null) timestampFormat = new DateFormatter("yyyy-MM-dd hh:mm:ss");
     593      if (numberFormat == null) numberFormat = NumberFormatUtil.getNumberFormat('.', (char)0);
     594      xlsUtil.setDateFormat(dateFormat);
     595      xlsUtil.setTimestampFormat(timestampFormat);
     596      xlsUtil.setNumberFormat(new NumberFormatFormatter(numberFormat));
     597      xlsUtil.setEvaluateFormulas(true);
     598      setDataSplitterRegexp(Pattern.compile("\\t"));
     599      charsetName = "UTF-8";
     600      try
     601      {
     602        xlsUtil.readWorkbook(in);
     603      }
     604      catch (IOException ex)
     605      {
     606        throw new RuntimeException(ex);
     607      }
     608      if (xlsUtil.getSheetNames().contains(charsetOrSheetName))
     609      {
     610        // Get the named sheet
     611        in = xlsUtil.getSheetAsCsv(charsetOrSheetName).parseToCsv();
     612      }
     613      else
     614      {
     615        // Or try to convert to index with 0 as default
     616        in = xlsUtil.getSheetAsCsv(Values.getInt(charsetOrSheetName)).parseToCsv();
     617      }
     618    }
    556619    Charset cs = CharsetUtil.getCharset(charsetName);
    557620    this.tracker = new InputStreamTracker(in);
     
    914977  {
    915978    return numberFormat;
     979  }
     980 
     981  /**
     982    Set the default date format to use when creating mappers.
     983    If null, xxx is used.
     984    @since 3.15
     985  */
     986  public void setDefaultDateFormat(Formatter<Date> dateFormat)
     987  {
     988    this.dateFormat = dateFormat;
     989  }
     990 
     991  /**
     992    Get the default date format.
     993    @since 3.15
     994  */
     995  public Formatter<Date> getDefaultDateFormat()
     996  {
     997    return dateFormat;
     998  }
     999 
     1000  /**
     1001    Set the default timestamp format to use when creating mappers.
     1002    If null, xxx is used.
     1003    @since 3.15
     1004  */
     1005  public void setDefaultTimestampFormat(Formatter<Date> timestampFormat)
     1006  {
     1007    this.timestampFormat = timestampFormat;
     1008  }
     1009 
     1010  /**
     1011    Get the default timestamp format.
     1012    @since 3.15
     1013  */
     1014  public Formatter<Date> getDefaultTimestampFormat()
     1015  {
     1016    return timestampFormat;
    9161017  }
    9171018 
  • trunk/www/common/plugin/parse_file.jsp

    r7654 r7655  
    169169      fileInputStream = wex.wrapInputStream(fileInputStream);
    170170    }
    171    
    172     if (excelMode)
    173     {
    174       // TODO - this should be inside the FlatFileParser
    175       String dateFormat = Values.getString(request.getParameter("dateFormat"), "yyyy-MM-dd");
    176       XlsxToCsvUtil util = new XlsxToCsvUtil();
    177       util.setDateFormat(new DateFormatter(dateFormat));
    178       util.setEvaluateFormulas(true);
    179       util.readWorkbook(fileInputStream);
    180       SheetInfo sheetInfo = util.getSheetAsCsv(sheet);
    181       fileInputStream = sheetInfo.parseToCsv();
    182     }
    183    
    184     parser.setInputStream(fileInputStream, charsetName);
     171
     172    String dateFormat = Values.getStringOrNull(request.getParameter("dateFormat"));
     173    if (dateFormat != null) parser.setDefaultDateFormat(new DateFormatter(dateFormat));
     174    String timestampFormat = Values.getStringOrNull(request.getParameter("timestampFormat"));
     175    if (timestampFormat != null) parser.setDefaultTimestampFormat(new DateFormatter(timestampFormat));
     176    String decimalSeparator = Values.getStringOrNull(request.getParameter("decimalSeparator"));
     177    if (decimalSeparator != null)
     178    {
     179      char ds = "dot".equals(decimalSeparator) ? '.' : ',';
     180      parser.setDefaultNumberFormat(NumberFormatUtil.getNumberFormat(ds, (char)0));
     181    }
     182   
     183    parser.setInputStream(fileInputStream, excelMode ? sheet : charsetName);
    185184    lastLine = parser.parseHeaders();
    186185    dataIsFound = lastLine == FlatFileParser.LineType.DATA || lastLine == FlatFileParser.LineType.DATA_HEADER;
  • trunk/www/common/plugin/test_with_file.js

    r7654 r7655  
    175175   
    176176    var parseWin = window.frames['parsed'];
    177     parseWin.ParseFile.pleaseWaitWhileParsing();
     177    if (parseWin && parseWin.ParseFile)
     178    {
     179      parseWin.ParseFile.pleaseWaitWhileParsing();
     180    }
    178181    frm.submit();
    179182  }
     
    192195    frm.maxDataColumns.value = configWin.Configure.getParameterValue('maxDataColumns');
    193196    frm.trimQuotes.checked = configWin.Configure.getParameterValue('trimQuotes') != 'false';
     197    frm.decimalSeparator.value = configWin.Configure.getParameterValue('decimalSeparator');
    194198    frm.dateFormat.value = configWin.Configure.getParameterValue('dateFormat');
     199    frm.timestampFormat.value = configWin.Configure.getParameterValue('timestampFormat');
    195200     
    196201    // Copy the list since ith is not certain that it is the same as Config.allCharsets()
  • trunk/www/common/plugin/test_with_file.jsp

    r7654 r7655  
    180180    <input type="hidden" name="file_id" value="">
    181181    <input type="hidden" name="filemode" value="csv">
     182    <input type="hidden" name="decimalSeparator" value="">
    182183    <input type="hidden" name="dateFormat" value="">
     184    <input type="hidden" name="timestampFormat" value="">
    183185
    184186    <div class="content">
Note: See TracChangeset for help on using the changeset viewer.