Changeset 5770


Ignore:
Timestamp:
Sep 29, 2011, 1:27:33 PM (10 years ago)
Author:
Nicklas Nordborg
Message:

References #1623: Create reporter importer for GTF files

The default configuration now uses transcript_id@seqname as the reporter id. We discovered that this combination is needed to form a unique id for every entry that we want to track in array design features and raw data.

Location:
trunk
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • trunk/data/plugin_configfile.xml

    r5764 r5770  
    934934  <configuration pluginClassName="net.sf.basedb.plugins.gtf.GtfReporterImporter">
    935935    <configname>gene_id (no prefix)</configname>
    936     <description>A configuration that uses the gene_id (no prefix) instead of the transcript_id as reporter id.</description>
     936    <description>A configuration that uses the gene_id instead of the transcript_id as reporter id.</description>
    937937    <parameter>
    938938      <name>trimQuotes</name>
     
    10011001  </configuration>
    10021002  <configuration pluginClassName="net.sf.basedb.plugins.gtf.GtfReporterImporter">
    1003     <configname>transcript_id (no prefix)</configname>
    1004     <description>A configuration that uses the transcript_id (no prefix) as reporter id.</description>
     1003    <configname>transcript_id@chr (no prefix)</configname>
     1004    <description>A configuration that uses the &lt;transcript_id&gt;@&lt;seqname&gt; as reporter id. &lt;seqname&gt; is usually the chromosome ID (eg. chr1).</description>
     1005    <parameter>
     1006      <name>dataHeaderRegexp</name>
     1007      <label>Data header</label>
     1008      <description>A regular expression that matches the header line just before the data begins. For example: Block\tRow\tColumn.*</description>
     1009      <class>java.lang.String</class>
     1010      <value>&lt;seqname&gt;\t.*&lt;transcript_id&gt;.*</value>
     1011    </parameter>
    10051012    <parameter>
    10061013      <name>trimQuotes</name>
     
    10111018    </parameter>
    10121019    <parameter>
    1013       <name>dataHeaderRegexp</name>
    1014       <label>Data header</label>
    1015       <description>A regular expression that matches the header line just before the data begins. For example: Block\tRow\tColumn.*</description>
    1016       <class>java.lang.String</class>
    1017       <value>&lt;seqname&gt;\t.*&lt;transcript_id&gt;.*</value>
    1018     </parameter>
    1019     <parameter>
    10201020      <name>reporterIdColumnMapping</name>
    10211021      <label>External ID</label>
    10221022      <description>Mapping that picks the reporter's external ID from the data columns. For example: \ID\</description>
    10231023      <class>java.lang.String</class>
    1024       <value>\&lt;transcript_id&gt;\</value>
     1024      <value>\&lt;transcript_id&gt;\@\&lt;seqname&gt;\</value>
    10251025    </parameter>
    10261026    <parameter>
     
    10371037allow = Allow expression and complex mappings, for example, '\Row\, \Column\' or '=2*col('radius')'</description>
    10381038      <class>java.lang.String</class>
    1039       <value>disallow</value>
     1039      <value>allow</value>
    10401040    </parameter>
    10411041    <parameter>
     
    10511051      <description>Mapping that picks the reporter's name from the data columns. For example: \Name\</description>
    10521052      <class>java.lang.String</class>
    1053       <value>\&lt;transcript_id&gt;\</value>
     1053      <value>\&lt;transcript_id&gt;\@\&lt;seqname&gt;\</value>
    10541054    </parameter>
    10551055    <parameter>
     
    10611061    </parameter>
    10621062    <parameter>
     1063      <name>extendedColumnMapping.chromosome</name>
     1064      <label>Chromosome</label>
     1065      <description>The chromosome from which the reporter is derived</description>
     1066      <class>java.lang.String</class>
     1067      <value>\&lt;seqname&gt;\</value>
     1068    </parameter>
     1069    <parameter>
    10631070      <name>decimalSeparator</name>
    10641071      <label>Decimal separator</label>
     
    10661073      <class>java.lang.String</class>
    10671074      <value>dot</value>
     1075    </parameter>
     1076    <parameter>
     1077      <name>symbolColumnMapping</name>
     1078      <label>Gene symbol</label>
     1079      <description>Mapping that picks the reporter's gene symbol from the data columns. For example: \Gene symbol\</description>
     1080      <class>java.lang.String</class>
     1081      <value>\&lt;gene_id&gt;\</value>
    10681082    </parameter>
    10691083  </configuration>
  • trunk/src/core/net/sf/basedb/util/gtf/GtfInputStream.java

    r5764 r5770  
    4848  attributes are lined up with the first line. Note that any attributes
    4949  that are not present in the first line are skipped. The parser also has an
    50   option to skip lines with a <code>transcript_id</code> that is not unique.
     50  option to skip lines with a <code>transcript_id+seqname</code> that is not unique.
    5151  Normally, a GTF file will contain multiple entries with the same ids, but
    5252  in most cases we are not interested in this when importing data to BASE.
     
    9191    @param charset The character set used in the file
    9292    @param skipRepeatedTranscriptIds TRUE to skip lines with non-unique
    93       values for transcript_id
     93      values for transcript_id+seqname
    9494    @throws IOException
    9595   */
     
    142142    {
    143143      // read next data line
    144       readMore();
     144      buffer = readMore();
    145145      index = 0;
    146146      if (buffer == null) return -1;
     
    217217    // Generate header line
    218218    StringBuffer sb = new StringBuffer();
    219     transcriptIds.add(attributes[transcriptIdIndex].value);
     219    transcriptIds.add(attributes[transcriptIdIndex].value+ '@' + line[0]);
    220220    if (skipRepeatedTranscriptIds)
    221221    {
     
    244244    @throws IOException
    245245  */
    246   private void readMore()
     246  private byte[] readMore()
    247247    throws IOException
    248248  {
     
    254254      {
    255255        buffer = null;
    256         return;
     256        return null;
    257257      }
    258258     
     
    260260      parseAttributes(line[8]);
    261261     
    262       String id = attributes[transcriptIdIndex].value;
     262      String id = attributes[transcriptIdIndex].value + '@' + line[0];
    263263      if (transcriptIds.add(id) && skipRepeatedTranscriptIds)
    264264      {
     
    268268   
    269269    // Convert to byte[]
    270     buffer = appendLine(new StringBuffer(), line, attributes).toString().getBytes(charset);
     270    return appendLine(new StringBuffer(), line, attributes).toString().getBytes(charset);
    271271  }
    272272 
  • trunk/src/plugins/core/core-plugins.xml

    r5764 r5770  
    767767      <description>
    768768        Creates reporters and reporter lists from GTF (Gene transfer format)
    769         files. The default configuration is to use the transcript_id value
    770         as the reporter id and name. No other fields are used, but this can
     769        files. The default configuration uses the transcript_id+seqname value
     770        as the reporter id and name, and gene_id as "symbol". This can
    771771        be changed by user configurations. For example, to use the gene_id
    772772        instead or to add prefixes to the id values. The importer
  • trunk/src/plugins/core/net/sf/basedb/plugins/gtf/DefaultConfigurationValues.java

    r5764 r5770  
    2727
    2828import net.sf.basedb.core.BaseException;
     29import net.sf.basedb.core.ExtendedProperties;
    2930import net.sf.basedb.core.InvalidDataException;
    3031import net.sf.basedb.core.Job;
     
    9495        defaultValues.put("dataHeaderRegexp", "<seqname>\\t.*<transcript_id>.*");
    9596        defaultValues.put("minDataColumns", 4);
    96         defaultValues.put("featureIdColumnMapping", "\\<transcript_id>\\");
    97         defaultValues.put("reporterIdColumnMapping", "\\<transcript_id>\\");
    98         defaultValues.put("nameColumnMapping", "\\<transcript_id>\\");
     97        defaultValues.put("complexExpressions", "allow");
     98       
     99        // Reporter importer mappings
     100        defaultValues.put("reporterIdColumnMapping", "\\<transcript_id>\\@\\<seqname>\\");
     101        defaultValues.put("nameColumnMapping", "\\<transcript_id>\\@\\<seqname>\\");
     102        defaultValues.put("symbolColumnMapping", "\\<gene_id>\\");
     103        if (ExtendedProperties.getProperty("ReporterData", "chromosome") != null)
     104        {
     105          defaultValues.put("extendedColumnMapping.chromosome", "\\<seqname>\\");
     106        }
     107
     108        // Reporter map importer mappings (also use reporterIdColumnMapping)
     109        defaultValues.put("featureIdColumnMapping", "\\<transcript_id>\\@\\<seqname>\\");
    99110      }
    100111      value = defaultValues.get(name);
  • trunk/src/plugins/core/net/sf/basedb/plugins/gtf/GtfReporterImporter.java

    r5764 r5770  
    4141  as a wrapper to generate a pure column-based output which can be used
    4242  by the regular tools for file parsing. The importer will also skip
    43   lines with a non-unique transcript_id.
     43  lines with a non-unique transcript_id+seqname.
    4444  <p>
    4545 
    46   The default configuration is to use the transcript_id as the reporter id
    47   and name. No other information is extracted, but this can be changed by
    48   user configurations depending on what additional attributes that are
    49   present in the GTF file.
     46  The default configuration is to use the transcript_id+seqname as the reporter id
     47  and name. gene_id is stored as "gene symbol" and seqname as "chromosome".
     48  The default configuration can be changed by user configurations depending on
     49  which additional attributes are present in the GTF file.
    5050
    5151  @author Nicklas
  • trunk/src/test/TestGtfImporters.java

    r5764 r5770  
    5353  static boolean test_all()
    5454  {
    55     write("++Testing GTF imports using plugin");
     55    write("++Testing GTF importers using plugin");
    5656    // Upload GTF file
    5757    int fileId = TestFile.test_create("data/test.gtf", false, false);
     
    6565    TestReporter.test_list(35);
    6666   
    67 
     67/*
    6868    // Test reporter map importer
    6969    int arrayDesignId = TestArrayDesign.test_create(PlatformVariant.SEQUENCING_EXPRESSION, false);
     
    7575    TestArrayDesign.write_feature_header();
    7676    TestArrayDesign.test_list_features(arrayDesignId, 35);
    77 
     77*/
    7878    if (TestUtil.waitBeforeDelete()) TestUtil.waitForEnter();
    79     TestArrayDesign.test_delete(arrayDesignId);
     79//    TestArrayDesign.test_delete(arrayDesignId);
    8080    // Delete reporters
    8181    int deleteReporterJobId = test_create_reporter_job(gtfReporterImporterId, fileId, "delete");
     
    8484    TestJob.test_delete(reporterJobId);
    8585    TestJob.test_delete(deleteReporterJobId);
    86     TestJob.test_delete(featureJobId);
    87     TestPluginConfiguration.test_delete(featureConfigurationId);
     86//    TestJob.test_delete(featureJobId);
     87//    TestPluginConfiguration.test_delete(featureConfigurationId);
    8888    TestFile.test_delete(fileId);
    8989
  • trunk/src/test/TestGtfInputStream.java

    r5759 r5770  
    4444   
    4545    test_parse("data/test.gtf",
    46       "\\<seqname>\\", "\\<source>\\", "\\<gene_id>\\", "\\<transcript_id>\\", "\\<gene_name2>\\");
     46      "\\<transcript_id>\\@\\<seqname>\\", "\\<source>\\", "\\<gene_id>\\", "\\<gene_name2>\\");
    4747
    4848    write("++Testing GTFInputStream "+(ok ? "OK" : "Failed")+"\n");
Note: See TracChangeset for help on using the changeset viewer.