Changeset 7673


Ignore:
Timestamp:
Mar 26, 2019, 2:58:12 PM (3 years ago)
Author:
Nicklas Nordborg
Message:

References #2161: Import data from Excel files

Added functionality for parsing all worksheets in an Excel file as a single CSV-like stream. It is made possible by sections with the sheet name in brackets. For example:

[Sheet1]
Data from sheet
[Sheet2]
Data from sheet 2

The section functionality in the FlatFileParser has been updated to allow for this kind of file.

Location:
trunk/src/core/net/sf/basedb/util
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/core/net/sf/basedb/util/excel/XlsxToCsvUtil.java

    r7671 r7673  
    88import java.util.ArrayList;
    99import java.util.Date;
     10import java.util.HashMap;
    1011import java.util.Iterator;
    1112import java.util.List;
     13import java.util.Map;
    1214
    1315import org.apache.poi.poifs.filesystem.FileMagic;
     
    2527import net.sf.basedb.core.Type;
    2628import net.sf.basedb.util.FileUtil;
     29import net.sf.basedb.util.InputStreamCombiner;
    2730import net.sf.basedb.util.encode.EncoderDecoder;
    2831import net.sf.basedb.util.encode.ToSpaceEncoderDecoder;
     
    99102  private Workbook workbook;
    100103  private List<String> sheetNames;
     104  private Map<String, SheetInfo> sheets;
    101105 
    102106  /**
     
    111115    this.charset = Charset.forName("UTF-8");
    112116    this.encoder = new ToSpaceEncoderDecoder();
     117    this.sheets = new HashMap<>();
    113118  }
    114119 
     
    302307  public SheetInfo getSheetAsCsv(String name)
    303308  {
    304     Sheet sheet = workbook.getSheet(name);
    305     if (sheet == null) throw new IllegalArgumentException("No sheet with name: " + name);
    306     return new SheetInfo(sheet);
     309    SheetInfo si = sheets.get(name);
     310    if (si == null)
     311    {
     312      Sheet sheet = workbook.getSheet(name);
     313      if (sheet == null) throw new IllegalArgumentException("No sheet with name: " + name);
     314      si = new SheetInfo(sheet);
     315      sheets.put(name, si);
     316    }
     317    return si;
    307318  }
    308319
     
    313324  {
    314325    Sheet sheet = workbook.getSheetAt(index);
    315     return new SheetInfo(sheet);
    316   }
    317 
     326    SheetInfo si = sheets.get(sheet.getSheetName());
     327    if (si == null)
     328    {
     329      si = new SheetInfo(sheet);
     330      sheets.put(si.getName(), si);
     331    }
     332    return si;
     333  }
     334
     335  /**
     336    Convert all sheets in the workbook to a single CSV-like stream. Each sheet
     337    is converted as if {@link SheetInfo#parseToCsv(boolean)} has been called
     338    with a TRUE parameter.
     339   
     340    started with [Name-of-sheet] and then followed by the sheet data. Use
     341  */
     342  public InputStream getWorkbookAsCsv()
     343  {
     344    return new WorkbookStream();
     345  }
     346 
     347 
    318348  /**
    319349    Closes the current workbook. It is safe to do this before
     
    552582      Note that the CSV will only contain data for the range of cells between
    553583      firstRow, lastRow, firstColumn and lastColumn.
     584      @param addPrefix If TRUE, adds the sheet name within brackets to the top of the stream
    554585    */
    555586    public InputStream parseToCsv()
     
    604635  }
    605636
     637  /**
     638    Input stream implementation that combines streams from all sheets in the current workbook.
     639  */
     640  class WorkbookStream
     641    extends InputStream
     642  {
     643    private SheetInfo currentSheet;
     644    private InputStream currentStream;
     645    private int nextIndex;
     646   
     647    WorkbookStream()
     648    {
     649      this.nextIndex = 0;
     650      nextSheet();
     651    }
     652
     653    /**
     654      Switch to the next worksheet.
     655    */
     656    private void nextSheet()
     657    {
     658      FileUtil.close(currentStream);
     659      if (nextIndex < getNumSheets())
     660      {
     661        currentSheet = getSheetAsCsv(nextIndex);
     662        currentStream = getPrefixedStream(currentSheet);
     663        nextIndex++;
     664      }
     665      else
     666      {
     667        currentSheet = null;
     668        currentStream = null;
     669      }
     670    }
     671   
     672    /**
     673      Create a combined stream by adding a "prefix": [sheet-name]
     674      and then the regular CSV stream for the given worksheet.
     675    */
     676    private InputStream getPrefixedStream(SheetInfo sheet)
     677    {
     678      String prefix = "["+encoder.encode(sheet.getName())+"]"+rowSeparator;
     679      return new InputStreamCombiner(
     680        new ByteArrayInputStream(prefix.getBytes(charset)),
     681        sheet.parseToCsv());
     682    }
     683
     684   
     685    @Override
     686    public int available()
     687      throws IOException
     688    {
     689      return currentStream != null ? currentStream.available() : 0;
     690    }
     691
     692    @Override
     693    public void close()
     694      throws IOException
     695    {
     696      if (currentStream != null) currentStream.close();
     697      currentStream = null;
     698      currentSheet = null;
     699    }
     700
     701    @Override
     702    public int read()
     703      throws IOException
     704    {
     705      int result = -1;
     706      while (result == -1 && currentStream != null)
     707      {
     708        result = currentStream.read();
     709        if (result == -1) nextSheet();
     710      }
     711      return result;
     712    }
     713
     714    @Override
     715    public int read(byte[] b, int off, int len)
     716      throws IOException
     717    {
     718      int result = -1;
     719      while (result == -1 && currentStream != null)
     720      {
     721        result = currentStream.read(b, off, len);
     722        if (result == -1) nextSheet();
     723      }
     724      return result;
     725    }
     726   
     727  }
     728 
    606729}
  • trunk/src/core/net/sf/basedb/util/parser/FlatFileParser.java

    r7668 r7673  
    213213 
    214214  /**
     215    Flag to indicate if only a single (=false) or all (=true) Excel sheets should
     216    be parsed in one go.
     217  */
     218  private boolean parseAllExcelSheets;
     219 
     220  /**
     221    Excel workbook that has been loaded.
     222  */
     223  private XlsxToCsvUtil excelWorkbook;
     224 
     225  /**
    215226    Excel sheet that is currently being parsed.
    216227  */
     
    329340  private int parsedLines;
    330341
     342  /**
     343    The number of sections parsed so far.
     344  */
     345  private int parsedSections;
     346 
    331347  /**
    332348    The total number of parsed characters so far.
     
    590606 
    591607  /**
     608    If this flag is set and the source file is an Excel file,
     609    then all sheets will be parsed unless a named sheet
     610    specified. Each sheet is handled like a section with the sheet
     611    name inside brackets ([name]). The regular expression for
     612    detecting a section is automatically updated to match this
     613    pattern.
     614    @since 3.15
     615  */
     616  public void setParseAllExcelSheets(boolean parseAllExcelSheets)
     617  {
     618    this.parseAllExcelSheets = parseAllExcelSheets;
     619  }
     620 
     621  /**
     622    @see #setParseAllExcelSheets(boolean)
     623    @since 3.15
     624  */
     625  public boolean getParseAllExcelSheets()
     626  {
     627    return parseAllExcelSheets;
     628  }
     629 
     630  /**
    592631    Set the name of Excel worksheet to parse if the given file is an Excel
    593632    file, otherwise this is ignored.
     
    618657    * If no number format has been specified, 'dot' is used
    619658    * The data splitter regular expression is changed to \\t
     659    * The section regular expression is changed to [.*] (if the {@link #getParseAllExcelSheets()} flag is set
    620660   
    621661    @param in The <code>InputStream</code>
     
    624664      {@link Config#getCharset()}
    625665      If Excel, the name or index of the worksheet in the workbook, the default is
    626       to parse the first sheet (index=0)
     666      to parse the first sheet (index=0) or the whole workbook if the {@link #getParseAllExcelSheets()} flag is set
    627667    @since 2.1.1
    628668  */
     
    633673    if (XlsxToCsvUtil.isAnExcelFile(in))
    634674    {
    635       XlsxToCsvUtil xlsUtil = new XlsxToCsvUtil();
     675      excelWorkbook = new XlsxToCsvUtil();
    636676      if (dateFormat == null) dateFormat = new DateFormatter("yyyy-MM-dd");
    637677      if (timestampFormat == null) timestampFormat = new DateFormatter("yyyy-MM-dd hh:mm:ss");
    638678      if (numberFormat == null) numberFormat = NumberFormatUtil.getNumberFormat('.', (char)0);
    639       xlsUtil.setDateFormat(dateFormat);
    640       xlsUtil.setTimestampFormat(timestampFormat);
    641       xlsUtil.setNumberFormat(new NumberFormatFormatter(numberFormat));
    642       xlsUtil.setEvaluateFormulas(true);
     679      excelWorkbook.setDateFormat(dateFormat);
     680      excelWorkbook.setTimestampFormat(timestampFormat);
     681      excelWorkbook.setNumberFormat(new NumberFormatFormatter(numberFormat));
     682      excelWorkbook.setEvaluateFormulas(true);
    643683      setDataSplitterRegexp(Pattern.compile("\\t"));
    644684      charsetName = "UTF-8";
    645685      try
    646686      {
    647         xlsUtil.readWorkbook(in);
     687        excelWorkbook.readWorkbook(in);
    648688      }
    649689      catch (IOException ex)
     
    651691        throw new RuntimeException(ex);
    652692      }
    653       List<String> sheets = xlsUtil.getSheetNames();
    654       if (sheets.contains(charsetOrSheetName))
     693     
     694      if (parseAllExcelSheets)
    655695      {
    656         excelSheet = xlsUtil.getSheetAsCsv(charsetOrSheetName);
    657       }
    658       else if (sheets.contains(excelSheetName))
    659       {
    660         excelSheet = xlsUtil.getSheetAsCsv(excelSheetName);
     696        setSectionRegexp(Pattern.compile("\\[(.*)\\]"));
     697        in = excelWorkbook.getWorkbookAsCsv();
    661698      }
    662699      else
    663700      {
    664         // Or try to convert to index with 0 as default
    665         excelSheet = xlsUtil.getSheetAsCsv(Values.getInt(charsetOrSheetName));
     701        List<String> sheets = excelWorkbook.getSheetNames();
     702        if (sheets.contains(charsetOrSheetName))
     703        {
     704          excelSheet = excelWorkbook.getSheetAsCsv(charsetOrSheetName);
     705        }
     706        else if (sheets.contains(excelSheetName))
     707        {
     708          excelSheet = excelWorkbook.getSheetAsCsv(excelSheetName);
     709        }
     710        else
     711        {
     712          // Or try to convert to index with 0 as default
     713          excelSheet = excelWorkbook.getSheetAsCsv(Values.getInt(charsetOrSheetName));
     714        }
     715        excelSheetName = excelSheet.getName();
     716        in = excelSheet.parseToCsv();
    666717      }
    667       excelSheetName = excelSheet.getName();
    668       in = excelSheet.parseToCsv();
    669718    }
    670719    Charset cs = CharsetUtil.getCharset(charsetName);
     
    805854            nextSection = new Line(parsedLines, line, m.group(1), LineType.SECTION);
    806855            lines.add(nextSection);
     856            if (parseAllExcelSheets)
     857            {
     858              excelSheet = excelWorkbook.getSheetAsCsv(parsedSections);
     859              excelSheetName = excelSheet.getName();
     860            }
     861            parsedSections++;
    807862          }
    808863        }
     
    13521407              nextSection = new Line(parsedLines, line, m.group(1), LineType.SECTION);
    13531408              lines.add(nextSection);
     1409              if (parseAllExcelSheets)
     1410              {
     1411                excelSheet = excelWorkbook.getSheetAsCsv(parsedSections);
     1412                excelSheetName = excelSheet.getName();
     1413              }
     1414              parsedSections++;
    13541415            }
    13551416          }
     
    14341495 
    14351496  /**
     1497    Get the number of found sections so far.
     1498    @since 3.15
     1499  */
     1500  public int getParsedSections()
     1501  {
     1502    return parsedSections;
     1503  }
     1504 
     1505  /**
    14361506    Get the number of parsed data lines so far in the current section.
    14371507    This value is reset for each new section.
     
    15801650            nextSection = new Line(parsedLines, line, m.group(1), LineType.SECTION);
    15811651            parsedDataLines = 0;
     1652            if (parseAllExcelSheets)
     1653            {
     1654              excelSheet = excelWorkbook.getSheetAsCsv(parsedSections);
     1655              excelSheetName = excelSheet.getName();
     1656            }
     1657            parsedSections++;
    15821658          }
    15831659        }
Note: See TracChangeset for help on using the changeset viewer.