Changeset 6459


Ignore:
Timestamp:
May 27, 2014, 10:46:05 AM (8 years ago)
Author:
Nicklas Nordborg
Message:

Fixes #1808: Support for comment lines in GTF parser

Location:
trunk
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • trunk/data/plugin_configfile.xml

    r5773 r6459  
    957957    </parameter>
    958958    <parameter>
     959      <name>ignoreRegexp</name>
     960      <label>Ignore</label>
     961      <description>A regular expression that matches any line that should be ignored. For example, ignore lines starting with #: ^#.*</description>
     962      <class>java.lang.String</class>
     963      <value>^#.*</value>
     964    </parameter>
     965    <parameter>
    959966      <name>minDataColumns</name>
    960967      <label>Min data columns</label>
     
    10391046    </parameter>
    10401047    <parameter>
     1048      <name>ignoreRegexp</name>
     1049      <label>Ignore</label>
     1050      <description>A regular expression that matches any line that should be ignored. For example, ignore lines starting with #: ^#.*</description>
     1051      <class>java.lang.String</class>
     1052      <value>^#.*</value>
     1053    </parameter>
     1054    <parameter>
    10411055      <name>minDataColumns</name>
    10421056      <label>Min data columns</label>
     
    11211135    </parameter>
    11221136    <parameter>
     1137      <name>ignoreRegexp</name>
     1138      <label>Ignore</label>
     1139      <description>A regular expression that matches any line that should be ignored. For example, ignore lines starting with #: ^#.*</description>
     1140      <class>java.lang.String</class>
     1141      <value>^#.*</value>
     1142    </parameter>
     1143    <parameter>
    11231144      <name>minDataColumns</name>
    11241145      <label>Min data columns</label>
     
    11821203    </parameter>
    11831204    <parameter>
     1205      <name>ignoreRegexp</name>
     1206      <label>Ignore</label>
     1207      <description>A regular expression that matches any line that should be ignored. For example, ignore lines starting with #: ^#.*</description>
     1208      <class>java.lang.String</class>
     1209      <value>^#.*</value>
     1210    </parameter>
     1211    <parameter>
    11841212      <name>trimQuotes</name>
    11851213      <label>Remove quotes</label>
  • trunk/src/core/net/sf/basedb/util/gtf/GtfInputStream.java

    r5770 r6459  
    5252  in most cases we are not interested in this when importing data to BASE.
    5353  This option also removes the feature, start, end, score, strand and frame
    54   columns from the output.
     54  columns from the output. Lines that can't be split into at least 9 columns
     55  (e.g. comment lines starting with #) are not parsed and are forwarded without modification.
    5556
    5657  @author Nicklas
     
    103104    this.reader = new BufferedReader(new InputStreamReader(master, this.charset));
    104105    this.ATTRIBUTE_PATTERN = Pattern.compile("([^ ]+) (([^\";]*)|(\"[^\"]*\"));");
    105     init();
     106    this.buffer = readMore();
    106107  }
    107108
     
    197198  {
    198199    return transcriptIds.size();
    199   }
    200  
    201   /**
    202     Initialize the converter by reading the first line from the GTF
    203     file. The attributes will be extracted and a header row is created
    204     with the 8 required columns + new colums for each attribute.
    205     @throws IOException
    206   */
    207   private void init()
    208     throws IOException
    209   {
    210     // Read first line
    211     String[] line = getNextLine();
    212     if (line == null) return;
    213    
    214     // Parse the attributes in 9th column
    215     parseAttributes(line[8]);
    216    
    217     // Generate header line
    218     StringBuffer sb = new StringBuffer();
    219     transcriptIds.add(attributes[transcriptIdIndex].value+ '@' + line[0]);
    220     if (skipRepeatedTranscriptIds)
    221     {
    222       sb.append("<seqname>\t<source>");
    223     }
    224     else
    225     {
    226       sb.append("<seqname>\t<source>\t<feature>\t<start>\t<end>\t<score>\t<strand>\t<frame>");
    227     }
    228     for (Attribute attribute : attributes)
    229     {
    230       sb.append("\t<").append(attribute.key).append(">");
    231     }
    232     sb.append("\n");
    233 
    234     // Append first line of data to the buffer
    235     appendLine(sb, line, attributes);
    236     buffer = sb.toString().getBytes(charset);
    237200  }
    238201 
     
    248211  {
    249212    String[] line;
     213    StringBuffer sb = new StringBuffer();
    250214    do
    251215    {
     
    258222     
    259223      // Parse attributes in 9th column
    260       parseAttributes(line[8]);
    261      
    262       String id = attributes[transcriptIdIndex].value + '@' + line[0];
    263       if (transcriptIds.add(id) && skipRepeatedTranscriptIds)
    264       {
     224      if (line.length >= 9)
     225      {
     226        // Generate headers if this is the first line with data
     227        boolean generateHeaders = attributes == null;
     228        parseAttributes(line[8]);
     229        if (generateHeaders)
     230        {
     231          if (skipRepeatedTranscriptIds)
     232          {
     233            sb.append("<seqname>\t<source>");
     234          }
     235          else
     236          {
     237            sb.append("<seqname>\t<source>\t<feature>\t<start>\t<end>\t<score>\t<strand>\t<frame>");
     238          }
     239          for (Attribute attribute : attributes)
     240          {
     241            sb.append("\t<").append(attribute.key).append(">");
     242          }
     243          sb.append("\n");
     244        }
     245       
     246        // Break out of the loop as soon as we see a new transcript id or if we are including repeated ids
     247        String id = attributes[transcriptIdIndex].value + '@' + line[0];
     248        if (transcriptIds.add(id) || !skipRepeatedTranscriptIds)
     249        {
     250          appendLine(sb, line, attributes);
     251          break;
     252        }
     253      }
     254      else
     255      {
     256        // Append line as it is
     257        appendLine(sb, line, null);
    265258        break;
    266259      }
     
    268261   
    269262    // Convert to byte[]
    270     return appendLine(new StringBuffer(), line, attributes).toString().getBytes(charset);
    271   }
    272  
    273   /**
    274     Read the next line from the GTF file and split on tab character
    275     into 9 or 10 columns. If the line doesn't contain at least 9 columns an
    276     exception is thrown. If the 10th column exists it must be a comment
    277     column that starts with a # character.
    278 
     263    return sb.toString().getBytes(charset);
     264  }
     265 
     266  /**
     267    Read the next line from the GTF file and split on tab character.
    279268  */
    280269  private String[] getNextLine()
     
    285274    lineNum++;
    286275    String[] columns = line.split("\\t", 10);
    287     if (columns.length < 9)
    288     {
    289       throw new IOException("A line must have at least 9 columns, at line: " + lineNum);
    290     }
    291276    return columns;
    292277  }
     
    357342 
    358343  /**
    359     Append the first 8 columns to the buffer and then add all values
    360     from the attributes.
     344    Append columns to the buffer and separate each with a tab.
     345    If attributes are given, the first 8 (or 2 if skipRepeatedTranscriptIds=true)
     346    columns are appended, then each of the attributes is appended.
     347    If no attributes are given, all columns are copied as they are.
     348   
    361349    @param sb The buffer to append to
    362350    @param columns The regular columns (when attributes are given there must be at least 8, or 2 if skipRepeatedTranscriptIds=true)
     
    367355    // Leading columns (all columns if no attributes are given) are copied with tab separator
    368356    sb.append(columns[0]);
    369     int end = skipRepeatedTranscriptIds ? 2 : 8;
     357    int end = attr == null ? columns.length : (skipRepeatedTranscriptIds ? 2 : 8);
    370358    for (int i = 1; i < end; ++i)
    371359    {
    372360      sb.append("\t").append(columns[i]);
    373361    }
    374     // Then follows the attributes
    375     for (Attribute attribute : attr)
    376     {
    377       sb.append("\t").append(attribute.value);
    378       attribute.value = null;
    379     }
    380     /*
    381     TODO - Handle Comments
    382     if (line.length == 10)
    383     {
    384       sb.append("\t").append(line[9]);
    385     }
    386     */
     362    if (attr != null)
     363    {
     364      // Then follows the attributes
     365      for (Attribute attribute : attr)
     366      {
     367        sb.append("\t").append(attribute.value);
     368        attribute.value = null;
     369      }
     370      if (columns.length == 10)
     371      {
     372        sb.append("\t").append(columns[9]);
     373      }
     374    }
    387375    sb.append("\n");
    388376    return sb;
  • trunk/src/plugins/core/net/sf/basedb/plugins/gtf/DefaultConfigurationValues.java

    r5770 r6459  
    9494        defaultValues.put("dataSplitterRegexp", "\\t");
    9595        defaultValues.put("dataHeaderRegexp", "<seqname>\\t.*<transcript_id>.*");
     96        defaultValues.put("ignoreRegexp", "^#.*");
    9697        defaultValues.put("minDataColumns", 4);
    9798        defaultValues.put("complexExpressions", "allow");
  • trunk/src/test/TestGtfInputStream.java

    r5770 r6459  
    6262      ffp.setDataHeaderRegexp(Pattern.compile("\\<seqname\\>.*"));
    6363      ffp.setDataSplitterRegexp(Pattern.compile("\\t"));
     64      ffp.setIgnoreRegexp(Pattern.compile("^#.*"));
    6465      ffp.setMinDataColumns(4);
    6566      ffp.setMaxDataColumns(-1);
  • trunk/src/test/data/test.gtf

    r5759 r6459  
     1# comment
    12chr1  hg19_refGene  start_codon 67000042  67000044  0 + . gene_id "NM_032291"; transcript_id "NM_032291"; gene_name2 "SGIP1";
    23chr1  hg19_refGene  CDS 67000042  67000051  0 + 0 gene_id "NM_032291"; transcript_id "NM_032291"; gene_name2 "SGIP1";
Note: See TracChangeset for help on using the changeset viewer.