Changeset 7623


Ignore:
Timestamp:
Mar 7, 2019, 10:55:50 AM (5 years ago)
Author:
Nicklas Nordborg
Message:

References #2156: Check UTF-8 for text files that are uploaded without a selected character set

Implemented a utility class CharsetDetector that can be used for simple testing of encoding in text files. It works best with encodings that can be technically detected. For example, UTF-8 is very unlikely to be mixed up with other encodings while any of the ISO-8859-x encodings can typically be used for all files. The StringDetector is intended to be used for discriminating between ISO-8859-x encodings but it requires prior knowledge of text that is expected to be found in the file that is unique to an encoding.

The file upload functionality has been extended to check for UTF-8 text files. It is enabled automatically when the MIME type is set to something in the 'text/*' subset and no character set has been explicitly specified.

Location:
trunk/src/core/net/sf/basedb
Files:
4 added
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/core/net/sf/basedb/core/File.java

    r7573 r7623  
    3232import net.sf.basedb.core.data.MimeTypeData;
    3333import net.sf.basedb.core.hibernate.TypeWrapper;
     34import net.sf.basedb.util.charset.CharsetDetector;
    3435import net.sf.basedb.util.EqualsHelper;
    3536import net.sf.basedb.util.FileUtil;
     
    5152import java.io.InputStream;
    5253import java.io.OutputStream;
     54import java.io.PipedInputStream;
     55import java.io.PipedOutputStream;
    5356import java.net.URI;
    5457import java.net.URISyntaxException;
     
    14091412      newInternalFile = getNewFile(compress);
    14101413    }
     1414    String mimeType = getMimeType();
     1415    String charset = getCharacterSet();
     1416    // If the file is a text file and no charset has been detected we try UTF-8 detection
     1417    boolean utf8Detect = mimeType != null && mimeType.startsWith("text/") && charset == null;
    14111418    try
    14121419    {
    1413       uploadStream = new UploadStream(newInternalFile, true, checkMd5, compress);
     1420      uploadStream = new UploadStream(newInternalFile, true, checkMd5, compress, utf8Detect);
    14141421    }
    14151422    catch (IOException ex)
     
    16891696    private boolean checkMd5;
    16901697
     1698    private CharsetTesterThread utf8Tester;
     1699   
    16911700    /**
    16921701      Create a new <code>UploadStream</code> and connect it to the
     
    16971706        previously stored md5 sum, useful for making sure secondary storage is
    16981707        working properly
     1708      @param utf8Detect If set, we try to parse the file as UTF-8. If this works we set the
     1709        encoding to UTF-8
    16991710      @throws IOException If there is an error during the upload
    17001711    */
    1701     private UploadStream(java.io.File file, boolean calculateMd5, boolean checkMd5, boolean compress)
     1712    private UploadStream(java.io.File file, boolean calculateMd5, boolean checkMd5, boolean compress, boolean utf8Detect)
    17021713      throws IOException
    17031714    {
     
    17121723      }
    17131724      closed = false;
     1725      if (utf8Detect)
     1726      {
     1727        // Start a thread for parallell testing if the file can be parsed as UTF-8
     1728        utf8Tester = new CharsetTesterThread(Charset.forName("UTF-8"));
     1729        new Thread(utf8Tester).start();
     1730      }
    17141731    }
    17151732   
     
    17211738      if (md5 != null) md5.update(b, off, len);
    17221739      super.write(b, off, len);
    1723     }
    1724    
     1740      if (utf8Tester != null) utf8Tester.write(b, off, len);
     1741    }
     1742     
    17251743    @Override
    17261744    public void write(byte[] b)
     
    17441762      if (md5 != null) md5.update((byte)b);
    17451763      super.write(b);
     1764      if (utf8Tester != null) utf8Tester.write(b);
    17461765    }
    17471766
     
    17591778        return;
    17601779      }
     1780      String charset = null;
     1781      if (utf8Tester != null)
     1782      {
     1783        utf8Tester.close();
     1784        if (utf8Tester.couldParse())
     1785        {
     1786          charset = utf8Tester.getCharset().name();
     1787        }
     1788      }
    17611789      super.close();
    17621790      FileData data = getData();
     
    17671795      data.setRemovedBy(null);
    17681796      data.setUrl(null);
     1797      if (charset != null && data.getCharacterSet() == null)
     1798      {
     1799        data.setCharacterSet(charset);
     1800      }
    17691801      if (md5 != null)
    17701802      {
     
    17831815    }
    17841816  }
     1817 
     1818  class CharsetTesterThread
     1819    implements Runnable
     1820  {
     1821    private final PipedOutputStream pop;
     1822    private final PipedInputStream pip;
     1823    private final CharsetDetector detector;
     1824   
     1825    private Thread thread;
     1826    // These variables are used by both threads
     1827    private volatile boolean couldParse;
     1828    private volatile boolean isRunning;
     1829   
     1830    CharsetTesterThread(Charset charset)
     1831      throws IOException
     1832    {
     1833      this.detector = new CharsetDetector(charset);
     1834      this.pop = new PipedOutputStream();
     1835      this.pip = new PipedInputStream(pop, 4096);
     1836      this.isRunning = true;
     1837    }
     1838   
     1839    /**
     1840      Runs in a separate thread.
     1841    */
     1842    @Override
     1843    public void run()
     1844    {
     1845      thread = Thread.currentThread();
     1846      couldParse = detector.testIt(pip);
     1847      isRunning = false;
     1848    }
     1849
     1850    public Charset getCharset()
     1851    {
     1852      return detector.getCharset();
     1853    }
     1854   
     1855    /**
     1856      Returns TRUE if the file seems like valid (UTF-8). Note
     1857      that this may not be correct until the close() method has
     1858      been called.
     1859    */
     1860    public boolean couldParse()
     1861    {
     1862      return couldParse;
     1863    }
     1864   
     1865    /**
     1866      Runs in the main thread. Copy bytes to charset detector via
     1867      the piped streams.
     1868    */
     1869    public void write(byte[] b, int off, int len)
     1870    {
     1871      if (isRunning)
     1872      {
     1873        try
     1874        {
     1875          pop.write(b, off, len);
     1876        }
     1877        catch (IOException ex)
     1878        {
     1879          isRunning = false;
     1880        }
     1881      }
     1882    }
     1883   
     1884    /**
     1885      Runs in the main thread. Copy bytes to charset detector via
     1886      the piped streams.
     1887    */
     1888    public void write(int b)
     1889    {
     1890      if (isRunning)
     1891      {
     1892        try
     1893        {
     1894          pop.write(b);
     1895        }
     1896        catch (IOException ex)
     1897        {
     1898          isRunning = false;
     1899        }
     1900      }
     1901    }
     1902   
     1903    /**
     1904      The file has been uploaded. Wait for the charset detector
     1905      threas to finish and the close everything.
     1906     */
     1907    public void close()
     1908    {
     1909      try
     1910      {
     1911        // Flush and close the pipe and...
     1912        pop.flush();
     1913        FileUtil.close(pop);
     1914        if (isRunning && thread != null)
     1915        {
     1916          // ... wait for the reading thread to terminate
     1917          thread.join();
     1918        }
     1919      }
     1920      catch (IOException | InterruptedException ex)
     1921      {}
     1922      finally
     1923      {
     1924        thread = null;
     1925        FileUtil.close(pop);
     1926        FileUtil.close(pip);
     1927      }
     1928    }
     1929  }
     1930 
    17851931}
Note: See TracChangeset for help on using the changeset viewer.