Changeset 5771


Ignore:
Timestamp:
Sep 29, 2011, 1:48:47 PM (10 years ago)
Author:
Nicklas Nordborg
Message:

References #1624: Create array design (feature) importer for GTF files

The default configuration now uses transcript_id@seqname as both the reporter id and the feature id. We discovered that this combination is needed to form a unique id for every entry that we want to track in array design features and raw data.

Location:
trunk
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • trunk/data/plugin_configfile.xml

    r5770 r5771  
    934934  <configuration pluginClassName="net.sf.basedb.plugins.gtf.GtfReporterImporter">
    935935    <configname>gene_id (no prefix)</configname>
    936     <description>A configuration that uses the gene_id instead of the transcript_id as reporter id.</description>
     936    <description>A configuration that uses the &lt;gene_id&gt; as reporter id.</description>
     937    <parameter>
     938      <name>reporterIdColumnMapping</name>
     939      <label>External ID</label>
     940      <description>Mapping that picks the reporter's external ID from the data columns. For example: \ID\</description>
     941      <class>java.lang.String</class>
     942      <value>\&lt;gene_id&gt;\</value>
     943    </parameter>
    937944    <parameter>
    938945      <name>trimQuotes</name>
     
    950957    </parameter>
    951958    <parameter>
    952       <name>reporterIdColumnMapping</name>
    953       <label>External ID</label>
    954       <description>Mapping that picks the reporter's external ID from the data columns. For example: \ID\</description>
    955       <class>java.lang.String</class>
    956       <value>\&lt;gene_id&gt;\</value>
    957     </parameter>
    958     <parameter>
    959959      <name>minDataColumns</name>
    960960      <label>Min data columns</label>
     
    986986    </parameter>
    987987    <parameter>
     988      <name>extendedColumnMapping.chromosome</name>
     989      <label>Chromosome</label>
     990      <description>The chromosome from which the reporter is derived</description>
     991      <class>java.lang.String</class>
     992      <value>\&lt;seqname&gt;\</value>
     993    </parameter>
     994    <parameter>
     995      <name>decimalSeparator</name>
     996      <label>Decimal separator</label>
     997      <description>The decimal separator used in numeric values, if not specified dot is assumed.</description>
     998      <class>java.lang.String</class>
     999      <value>dot</value>
     1000    </parameter>
     1001    <parameter>
    9881002      <name>dataSplitterRegexp</name>
    9891003      <label>Data splitter</label>
     
    9931007    </parameter>
    9941008    <parameter>
    995       <name>decimalSeparator</name>
    996       <label>Decimal separator</label>
    997       <description>The decimal separator used in numeric values, if not specified dot is assumed.</description>
    998       <class>java.lang.String</class>
    999       <value>dot</value>
     1009      <name>symbolColumnMapping</name>
     1010      <label>Gene symbol</label>
     1011      <description>Mapping that picks the reporter's gene symbol from the data columns. For example: \Gene symbol\</description>
     1012      <class>java.lang.String</class>
     1013      <value>\&lt;gene_id&gt;\</value>
    10001014    </parameter>
    10011015  </configuration>
    10021016  <configuration pluginClassName="net.sf.basedb.plugins.gtf.GtfReporterImporter">
    1003     <configname>transcript_id@chr (no prefix)</configname>
     1017    <configname>transcript_id@seqname (no prefix)</configname>
    10041018    <description>A configuration that uses the &lt;transcript_id&gt;@&lt;seqname&gt; as reporter id. &lt;seqname&gt; is usually the chromosome ID (eg. chr1).</description>
    10051019    <parameter>
    1006       <name>dataHeaderRegexp</name>
    1007       <label>Data header</label>
    1008       <description>A regular expression that matches the header line just before the data begins. For example: Block\tRow\tColumn.*</description>
    1009       <class>java.lang.String</class>
    1010       <value>&lt;seqname&gt;\t.*&lt;transcript_id&gt;.*</value>
     1020      <name>reporterIdColumnMapping</name>
     1021      <label>External ID</label>
     1022      <description>Mapping that picks the reporter's external ID from the data columns. For example: \ID\</description>
     1023      <class>java.lang.String</class>
     1024      <value>\&lt;transcript_id&gt;\@\&lt;seqname&gt;\</value>
    10111025    </parameter>
    10121026    <parameter>
     
    10181032    </parameter>
    10191033    <parameter>
    1020       <name>reporterIdColumnMapping</name>
    1021       <label>External ID</label>
    1022       <description>Mapping that picks the reporter's external ID from the data columns. For example: \ID\</description>
    1023       <class>java.lang.String</class>
    1024       <value>\&lt;transcript_id&gt;\@\&lt;seqname&gt;\</value>
     1034      <name>dataHeaderRegexp</name>
     1035      <label>Data header</label>
     1036      <description>A regular expression that matches the header line just before the data begins. For example: Block\tRow\tColumn.*</description>
     1037      <class>java.lang.String</class>
     1038      <value>&lt;seqname&gt;\t.*&lt;transcript_id&gt;.*</value>
    10251039    </parameter>
    10261040    <parameter>
     
    10541068    </parameter>
    10551069    <parameter>
     1070      <name>extendedColumnMapping.chromosome</name>
     1071      <label>Chromosome</label>
     1072      <description>The chromosome from which the reporter is derived</description>
     1073      <class>java.lang.String</class>
     1074      <value>\&lt;seqname&gt;\</value>
     1075    </parameter>
     1076    <parameter>
     1077      <name>decimalSeparator</name>
     1078      <label>Decimal separator</label>
     1079      <description>The decimal separator used in numeric values, if not specified dot is assumed.</description>
     1080      <class>java.lang.String</class>
     1081      <value>dot</value>
     1082    </parameter>
     1083    <parameter>
    10561084      <name>dataSplitterRegexp</name>
    10571085      <label>Data splitter</label>
     
    10611089    </parameter>
    10621090    <parameter>
    1063       <name>extendedColumnMapping.chromosome</name>
    1064       <label>Chromosome</label>
    1065       <description>The chromosome from which the reporter is derived</description>
    1066       <class>java.lang.String</class>
    1067       <value>\&lt;seqname&gt;\</value>
    1068     </parameter>
    1069     <parameter>
    1070       <name>decimalSeparator</name>
    1071       <label>Decimal separator</label>
    1072       <description>The decimal separator used in numeric values, if not specified dot is assumed.</description>
    1073       <class>java.lang.String</class>
    1074       <value>dot</value>
    1075     </parameter>
    1076     <parameter>
    10771091      <name>symbolColumnMapping</name>
    10781092      <label>Gene symbol</label>
     
    10831097  </configuration>
    10841098  <configuration pluginClassName="net.sf.basedb.plugins.gtf.GtfReporterMapImporter">
    1085     <configname>transcript_id (no prefix)</configname>
    1086     <description>A configuration that uses the transcript_id (no prefix) as reporter and feature id.</description>
     1099    <configname>transcript_id@seqname (no prefix)</configname>
     1100    <description>A configuration that uses &lt;transcript_id&gt;@&lt;seqname&gt; as reporter and feature id. &lt;seqname&gt; is usually the chromosome ID (eg. chr1).</description>
    10871101    <parameter>
    10881102      <name>reporterIdColumnMapping</name>
     
    11071121    </parameter>
    11081122    <parameter>
     1123      <name>minDataColumns</name>
     1124      <label>Min data columns</label>
     1125      <description>The minimum number of columns for a line to be counted as a data line.</description>
     1126      <class>java.lang.Integer</class>
     1127      <value>4</value>
     1128    </parameter>
     1129    <parameter>
    11091130      <name>featureIdentification</name>
    11101131      <label />
     
    11191140allow = Allow expression and complex mappings, for example, '\Row\, \Column\' or '=2*col('radius')'</description>
    11201141      <class>java.lang.String</class>
    1121       <value>disallow</value>
     1142      <value>allow</value>
    11221143    </parameter>
    11231144    <parameter>
     
    11311152      <name>featureIdColumnMapping</name>
    11321153      <label>Feature ID</label>
    1133       <description>Mapping that picks the feature's ID from the data columns. This column is only used when the array design uses the FEATURE_ID method for identifying features. In the other cases, the value is just stored as it is.For example: \Feature ID\</description>
    1134       <class>java.lang.String</class>
    1135       <value>\&lt;transcript_id&gt;\</value>
     1154      <description>Mapping that picks the feature's ID from the data columns. For example: \&lt;transcript_id&gt;\</description>
     1155      <class>java.lang.String</class>
     1156      <value>\&lt;transcript_id&gt;\@\&lt;seqname&gt;\</value>
    11361157    </parameter>
    11371158    <parameter>
     
    11451166  <configuration pluginClassName="net.sf.basedb.plugins.gtf.GtfReporterMapImporter">
    11461167    <configname>gene_id (no prefix)</configname>
    1147     <description>A configuration that uses the gene_id (no prefix) as reporter id and transcript_id as feature id. Note that gene_id may not be unique so it is not recommended to use that as feature id.</description>
     1168    <description>A configuration that uses the &lt;gene_id&gt; as reporter id and &lt;transcript_id&gt;@&lt;seqname&gt; as feature id. Note that &lt;gene_id&gt; may not be unique so it is not recommended to use that as feature id.</description>
    11481169    <parameter>
    11491170      <name>reporterIdColumnMapping</name>
     
    11521173      <class>java.lang.String</class>
    11531174      <value>\&lt;gene_id&gt;\</value>
     1175    </parameter>
     1176    <parameter>
     1177      <name>dataHeaderRegexp</name>
     1178      <label>Data header</label>
     1179      <description>A regular expression that matches the header line just before the data begins. For example: Block\tRow\tColumn.*</description>
     1180      <class>java.lang.String</class>
     1181      <value>&lt;seqname&gt;\t.*&lt;gene_id&gt;.*</value>
    11541182    </parameter>
    11551183    <parameter>
     
    11611189    </parameter>
    11621190    <parameter>
    1163       <name>dataHeaderRegexp</name>
    1164       <label>Data header</label>
    1165       <description>A regular expression that matches the header line just before the data begins. For example: Block\tRow\tColumn.*</description>
    1166       <class>java.lang.String</class>
    1167       <value>&lt;seqname&gt;\t.*&lt;gene_id&gt;.*</value>
     1191      <name>minDataColumns</name>
     1192      <label>Min data columns</label>
     1193      <description>The minimum number of columns for a line to be counted as a data line.</description>
     1194      <class>java.lang.Integer</class>
     1195      <value>4</value>
    11681196    </parameter>
    11691197    <parameter>
     
    11801208allow = Allow expression and complex mappings, for example, '\Row\, \Column\' or '=2*col('radius')'</description>
    11811209      <class>java.lang.String</class>
    1182       <value>disallow</value>
     1210      <value>allow</value>
    11831211    </parameter>
    11841212    <parameter>
     
    11921220      <name>featureIdColumnMapping</name>
    11931221      <label>Feature ID</label>
    1194       <description>Mapping that picks the feature's ID from the data columns. This column is only used when the array design uses the FEATURE_ID method for identifying features. In the other cases, the value is just stored as it is.For example: \Feature ID\</description>
    1195       <class>java.lang.String</class>
    1196       <value>\&lt;transcript_id&gt;\</value>
     1222      <description>Mapping that picks the feature's ID from the data columns. For example: \&lt;transcript_id&gt;\</description>
     1223      <class>java.lang.String</class>
     1224      <value>\&lt;transcript_id&gt;\@\&lt;seqname&gt;\</value>
    11971225    </parameter>
    11981226    <parameter>
  • trunk/src/plugins/core/net/sf/basedb/plugins/ReporterFlatFileImporter.java

    r5759 r5771  
    851851      // Column mappings
    852852      parameters.add(mappingSection);
    853       parameters.add(complexMappings);
     853      parameters.add(cloneParameterWithDefaultValue(complexMappings));
    854854      parameters.addAll(getAllColumnMappings(reporterListContext));
    855855     
     
    917917      // Column mappings
    918918      parameters.add(mappingSection);
    919       parameters.add(complexMappings);
     919      parameters.add(cloneParameterWithDefaultValue(complexMappings));
    920920      parameters.addAll(getAllColumnMappings(false));
    921921     
  • trunk/src/plugins/core/net/sf/basedb/plugins/gtf/GtfReporterMapImporter.java

    r5764 r5771  
    5050/**
    5151  Import features to an array design from a GTF file. The default
    52   settings use transcript_id as the feature id and reporter id. This
     52  settings use transcript_id@seqname as the feature id and reporter id. This
    5353  can be changed by user configuration, but it is recommended that
    54   transcript_id is used as feature id since other values may not be
    55   unique.
     54  transcript_id@seqname is used as feature id since other combinations of
     55  values may not be unique in raw data files.
    5656 
    5757  @author Nicklas
     
    7979      "Feature ID",
    8080      "Mapping that picks the feature's ID from the data columns. " +
    81       "For example: \\<transcript_id>\\",
     81      "For example: \\<transcript_id>\\@\\<seqname>\\",
    8282      new StringParameterType(255, null, true)
    8383      );
     
    254254      // Column mappings
    255255      parameters.add(mappingSection);
    256       parameters.add(complexMappings);
     256      parameters.add(cloneParameterWithDefaultValue(complexMappings));
    257257      parameters.addAll(getAllColumnMappings());
    258258
  • trunk/src/test/TestGtfImporters.java

    r5770 r5771  
    6565    TestReporter.test_list(35);
    6666   
    67 /*
     67
    6868    // Test reporter map importer
    6969    int arrayDesignId = TestArrayDesign.test_create(PlatformVariant.SEQUENCING_EXPRESSION, false);
     
    7575    TestArrayDesign.write_feature_header();
    7676    TestArrayDesign.test_list_features(arrayDesignId, 35);
    77 */
     77
    7878    if (TestUtil.waitBeforeDelete()) TestUtil.waitForEnter();
    79 //    TestArrayDesign.test_delete(arrayDesignId);
     79    TestArrayDesign.test_delete(arrayDesignId);
    8080    // Delete reporters
    8181    int deleteReporterJobId = test_create_reporter_job(gtfReporterImporterId, fileId, "delete");
     
    8484    TestJob.test_delete(reporterJobId);
    8585    TestJob.test_delete(deleteReporterJobId);
    86 //    TestJob.test_delete(featureJobId);
    87 //    TestPluginConfiguration.test_delete(featureConfigurationId);
     86    TestJob.test_delete(featureJobId);
     87    TestPluginConfiguration.test_delete(featureConfigurationId);
    8888    TestFile.test_delete(fileId);
    8989
     
    160160    {
    161161      ParameterType pt = pp.getParameterType();
    162       System.out.println("\tParameter:\t"+pp.getName() + "\t" + pp.getLabel() + "\t" + pt );
     162      System.out.println("\tParameter:\t"+pp.getName() + "\t" + pp.getLabel() + "\t" + pt + "\t" + pp.getDefaultValue() );
    163163    }
    164164  }
Note: See TracChangeset for help on using the changeset viewer.