Changeset 7629


Ignore:
Timestamp:
Mar 11, 2019, 8:31:29 AM (3 years ago)
Author:
Nicklas Nordborg
Message:

References #2157: Investigate if we can implement UTF-8 with ISO-8859-1 fallback when parsing text files

Added a test case for the UTF-8-with-fallback charsets.

Location:
trunk/src
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/core/net/sf/basedb/util/charset/CharsetUtil.java

    r7627 r7629  
    1313
    1414/**
    15   Helper class for working with charsets and enabling the UTF-8 with fallback charsets.
     15  Helper class for working with charsets and enabling the UTF-8 with fallback charsets:
     16 
     17  * X-UTF-8_with_ISO-8859-1_fallback
     18  * X-UTF-8_with_windows-1252_fallback
    1619 
    1720  @author nicklas
  • trunk/src/test/TestCharsetDetector.java

    r7624 r7629  
    2222import java.io.IOException;
    2323import java.io.InputStream;
    24 import java.nio.charset.Charset;
    2524
    2625import net.sf.basedb.util.FileUtil;
    2726import net.sf.basedb.util.charset.CharsetDetector;
     27import net.sf.basedb.util.charset.CharsetUtil;
    2828import net.sf.basedb.util.charset.SimpleStringDetector;
    2929import net.sf.basedb.util.charset.StringDetector;
     
    4949    test_charset("data/charset-tester-utf-8.txt", "ISO-8859-1", null, null, false); // No failure but we get incorrect text for Å and ö
    5050    test_charset("data/charset-tester-utf-8.txt", "ISO-8859-1", "Namn", "Ålder", true); // Fails since we can't find Ålder (Å is parsed as Ã…)
    51    
     51
    5252    test_charset("data/charset-tester-iso-8859-1.txt", "ISO-8859-1", null, null, false);
    5353    test_charset("data/charset-tester-iso-8859-1.txt", "ISO-8859-1", "Namn", "Ålder", false);
     
    5656    test_charset("data/charset-tester-iso-8859-1.txt", "ISO-8859-7", null, null, false); // No failure but we get incorrect text for Å and ö
    5757    test_charset("data/charset-tester-iso-8859-1.txt", "ISO-8859-7", "Namn", "Ålder", true); // Fails since Å is Ε (greek)
     58
     59    // Both files should work with the UTF-8 fallback charset
     60    test_charset("data/charset-tester-utf-8.txt", "X-UTF-8_with_ISO-8859-1_fallback", "Namn", "Ålder", false);
     61    test_charset("data/charset-tester-iso-8859-1.txt", "X-UTF-8_with_ISO-8859-1_fallback", "Namn", "Ålder", false);
    5862   
    5963    write("++Testing CharsetDetector "+(ok ? "OK" : "Failed")+"\n");
     
    7680        lineChecker = new SimpleStringDetector(lookFor, match);
    7781      }
    78       CharsetDetector detector = new CharsetDetector(Charset.forName(charset), lineChecker);
     82      CharsetDetector detector = new CharsetDetector(CharsetUtil.getCharset(charset), lineChecker);
    7983     
    8084      in = FileUtil.getInputStream(new java.io.File(file));
Note: See TracChangeset for help on using the changeset viewer.