Changeset 5773


Ignore:
Timestamp:
Sep 29, 2011, 3:14:56 PM (10 years ago)
Author:
Nicklas Nordborg
Message:

References #1152: Handling short read transcript sequence data

  • Adds a 'Cufflinks' raw data type with five columns: coverage, fpkm, fpkm_lo, fpkm_hi and status
  • Defines the FPKM_TRACKING file type and its MIME type.
  • Adds left(string, index) as a JEP function so that the chromosome can be parsed out of the 'locus' column in the tracking files.
  • Defines two new raw data importer configurations for parsing Cufflinks isoform files.
Location:
trunk
Files:
1 added
5 edited

Legend:

Unmodified
Added
Removed
  • trunk/config/dist/raw-data-types.xml

    r4888 r5773  
    35363536    </intensity-formula>
    35373537  </raw-data-type>
     3538  <raw-data-type
     3539    id="cufflinks"
     3540    channels="1"
     3541    name="Cufflinks"
     3542    table="RawDataCufflinks"
     3543    description="Cufflinks isoforms/gene-level expression values in FPKM tracking format">
     3544    <property
     3545      name="coverage"
     3546      title="Coverage"
     3547      description="Estimate for the absolute depth of read coverage across the object."
     3548      column="coverage"
     3549      type="float"
     3550      averagemethod="arithmetic_mean"
     3551    />
     3552    <property
     3553      name="fpkm"
     3554      title="FPKM"
     3555      description="Fragments Per Kilobase of exon per Million fragments mapped."
     3556      column="fpkm"
     3557      type="float"
     3558      averagemethod="geometric_mean"
     3559    />
     3560    <property
     3561      name="fpkm_lo"
     3562      title="FPKM lo"
     3563      description="The lower bound of the 95% confidence interval on the FPKM."
     3564      column="fpkm_lo"
     3565      type="float"
     3566      averagemethod="geometric_mean"
     3567    />
     3568    <property
     3569      name="fpkm_hi"
     3570      title="FPKM hi"
     3571      description="The upper bound of the 95% confidence interval on the FPKM."
     3572      column="fpkm_hi"
     3573      type="float"
     3574      averagemethod="geometric_mean"
     3575    />
     3576    <property
     3577      name="status"
     3578      title="Status"
     3579      description="Quantification status. Can be one of OK (deconvolution successful), LOWDATA (too complex or shallowly sequenced), HIDATA (too many fragments in locus), or FAIL, when an ill-conditioned covariance matrix or other numerical exception prevents deconvolution."
     3580      column="status"
     3581      type="string"
     3582      length="255"
     3583      averagemethod="none"
     3584    />
     3585    <intensity-formula
     3586      name="fpkm"
     3587      title="FPKM"
     3588      description="Fragments Per Kilobase of exon per Million fragments mapped."
     3589      >
     3590      <formula
     3591        channel="1"
     3592        expression="raw('fpkm')"
     3593      />
     3594    </intensity-formula>
     3595 
     3596  </raw-data-type>
    35383597</raw-data-types>
    35393598
  • trunk/data/plugin_configfile.xml

    r5772 r5773  
    12321232    </parameter>
    12331233  </configuration>
     1234  <configuration pluginClassName="net.sf.basedb.plugins.RawDataFlatFileImporter">
     1235    <configname>Cufflinks isoform FPKM (transcript_id@seqname; no prefix)</configname>
     1236    <description>A configuration that import isoforms.fpkm_tracking files and uses &lt;transcript_id&gt;@&lt;seqname&gt; as reporter and feature id.</description>
     1237    <parameter>
     1238      <name>dataHeaderRegexp</name>
     1239      <label>Data header</label>
     1240      <description>A regular expression that matches the header line just before the data begins. For example: Block\tRow\tColumn.*</description>
     1241      <class>java.lang.String</class>
     1242      <value>tracking_id\t.*FPKM.*</value>
     1243    </parameter>
     1244    <parameter>
     1245      <name>complexExpressions</name>
     1246      <label>Complex column mappings</label>
     1247      <description>disallow = Only allow simple mappings that are constant value or pick the value from one column only, for example, '1.6' or '\Row\'
     1248allow = Allow expression and complex mappings, for example, '\Row\, \Column\' or '=2*col('radius')'</description>
     1249      <class>java.lang.String</class>
     1250      <value>allow</value>
     1251    </parameter>
     1252    <parameter>
     1253      <name>propertyMapping.status</name>
     1254      <label>Status</label>
     1255      <description>Quantification status. Can be one of OK (deconvolution successful), LOWDATA (too complex or shallowly sequenced), HIDATA (too many fragments in locus), or FAIL, when an ill-conditioned covariance matrix or other numerical exception prevents deconvolution.</description>
     1256      <class>java.lang.String</class>
     1257      <value>\status\</value>
     1258    </parameter>
     1259    <parameter>
     1260      <name>charset</name>
     1261      <label>Character set</label>
     1262      <description>The character set to use when reading the file. This setting overrides the character set specified by the file. If neither this parameter nor the file specifies a character set, the system default is used (ISO-8859-1).</description>
     1263      <class>java.lang.String</class>
     1264      <value>ISO-8859-1</value>
     1265    </parameter>
     1266    <parameter>
     1267      <name>featureIdColumnMapping</name>
     1268      <label>Feature ID</label>
     1269      <description>Mapping that picks the spot's feature ID from the data columns. This column is only used when the raw data is connected to an array design which uses the FEATURE_ID method for identifying features. The value is not saved to the database.For example: \Feature ID\</description>
     1270      <class>java.lang.String</class>
     1271      <value>=col('tracking_id')+'@'+left(col('locus'), ':')</value>
     1272    </parameter>
     1273    <parameter>
     1274      <name>propertyMapping.fpkm_lo</name>
     1275      <label>FPKM lo</label>
     1276      <description>The lower bound of the 95% confidence interval on the FPKM.</description>
     1277      <class>java.lang.String</class>
     1278      <value>\FPKM_conf_lo\</value>
     1279    </parameter>
     1280    <parameter>
     1281      <name>dataSplitterRegexp</name>
     1282      <label>Data splitter</label>
     1283      <description>A regular expression that splits each data line into individual columns. For example, split on tabs: \t</description>
     1284      <class>java.lang.String</class>
     1285      <value>\t</value>
     1286    </parameter>
     1287    <parameter>
     1288      <name>decimalSeparator</name>
     1289      <label>Decimal separator</label>
     1290      <description>The decimal separator used in numeric values, if not specified dot is assumed.</description>
     1291      <class>java.lang.String</class>
     1292      <value>dot</value>
     1293    </parameter>
     1294    <parameter>
     1295      <name>rawDataType</name>
     1296      <label>Raw data type</label>
     1297      <description>The type of raw data that this importer will import.</description>
     1298      <class>java.lang.String</class>
     1299      <value>cufflinks</value>
     1300    </parameter>
     1301    <parameter>
     1302      <name>propertyMapping.coverage</name>
     1303      <label>Coverage</label>
     1304      <description>Estimate for the absolute depth of read coverage across the object.</description>
     1305      <class>java.lang.String</class>
     1306      <value>\coverage\</value>
     1307    </parameter>
     1308    <parameter>
     1309      <name>propertyMapping.fpkm_hi</name>
     1310      <label>FPKM hi</label>
     1311      <description>The upper bound of the 95% confidence interval on the FPKM.</description>
     1312      <class>java.lang.String</class>
     1313      <value>\FPKM_conf_hi\</value>
     1314    </parameter>
     1315    <parameter>
     1316      <name>reporterIdColumnMapping</name>
     1317      <label>Reporter ID</label>
     1318      <description>Mapping that picks the 'External ID' of the spot's reporter from the data columns. For example: \ID\</description>
     1319      <class>java.lang.String</class>
     1320      <value>=col('tracking_id')+'@'+left(col('locus'), ':')</value>
     1321    </parameter>
     1322    <parameter>
     1323      <name>trimQuotes</name>
     1324      <label>Remove quotes</label>
     1325      <description>If true quotes (" or ') around data value will be removed.</description>
     1326      <class>java.lang.Boolean</class>
     1327      <value>true</value>
     1328    </parameter>
     1329    <parameter>
     1330      <name>propertyMapping.fpkm</name>
     1331      <label>FPKM</label>
     1332      <description>Fragments Per Kilobase of exon per Million fragments mapped.</description>
     1333      <class>java.lang.String</class>
     1334      <value>\FPKM\</value>
     1335    </parameter>
     1336  </configuration>
     1337  <configuration pluginClassName="net.sf.basedb.plugins.RawDataFlatFileImporter">
     1338    <configname>Cufflinks isoform FPKM (gene_id; no prefix)</configname>
     1339    <description>A configuration that import isoforms.fpkm_tracking files and uses &lt;gene_id&gt; as reporter id and &lt;transcript_id&gt;@&lt;seqname&gt; as feature id.</description>
     1340    <parameter>
     1341      <name>dataHeaderRegexp</name>
     1342      <label>Data header</label>
     1343      <description>A regular expression that matches the header line just before the data begins. For example: Block\tRow\tColumn.*</description>
     1344      <class>java.lang.String</class>
     1345      <value>tracking_id\t.*FPKM.*</value>
     1346    </parameter>
     1347    <parameter>
     1348      <name>complexExpressions</name>
     1349      <label>Complex column mappings</label>
     1350      <description>disallow = Only allow simple mappings that are constant value or pick the value from one column only, for example, '1.6' or '\Row\'
     1351allow = Allow expression and complex mappings, for example, '\Row\, \Column\' or '=2*col('radius')'</description>
     1352      <class>java.lang.String</class>
     1353      <value>allow</value>
     1354    </parameter>
     1355    <parameter>
     1356      <name>propertyMapping.status</name>
     1357      <label>Status</label>
     1358      <description>Quantification status. Can be one of OK (deconvolution successful), LOWDATA (too complex or shallowly sequenced), HIDATA (too many fragments in locus), or FAIL, when an ill-conditioned covariance matrix or other numerical exception prevents deconvolution.</description>
     1359      <class>java.lang.String</class>
     1360      <value>\status\</value>
     1361    </parameter>
     1362    <parameter>
     1363      <name>charset</name>
     1364      <label>Character set</label>
     1365      <description>The character set to use when reading the file. This setting overrides the character set specified by the file. If neither this parameter nor the file specifies a character set, the system default is used (ISO-8859-1).</description>
     1366      <class>java.lang.String</class>
     1367      <value>ISO-8859-1</value>
     1368    </parameter>
     1369    <parameter>
     1370      <name>propertyMapping.fpkm_lo</name>
     1371      <label>FPKM lo</label>
     1372      <description>The lower bound of the 95% confidence interval on the FPKM.</description>
     1373      <class>java.lang.String</class>
     1374      <value>\FPKM_conf_lo\</value>
     1375    </parameter>
     1376    <parameter>
     1377      <name>featureIdColumnMapping</name>
     1378      <label>Feature ID</label>
     1379      <description>Mapping that picks the spot's feature ID from the data columns. This column is only used when the raw data is connected to an array design which uses the FEATURE_ID method for identifying features. The value is not saved to the database.For example: \Feature ID\</description>
     1380      <class>java.lang.String</class>
     1381      <value>=col('tracking_id')+'@'+left(col('locus'), ':')</value>
     1382    </parameter>
     1383    <parameter>
     1384      <name>decimalSeparator</name>
     1385      <label>Decimal separator</label>
     1386      <description>The decimal separator used in numeric values, if not specified dot is assumed.</description>
     1387      <class>java.lang.String</class>
     1388      <value>dot</value>
     1389    </parameter>
     1390    <parameter>
     1391      <name>propertyMapping.coverage</name>
     1392      <label>Coverage</label>
     1393      <description>Estimate for the absolute depth of read coverage across the object.</description>
     1394      <class>java.lang.String</class>
     1395      <value>\coverage\</value>
     1396    </parameter>
     1397    <parameter>
     1398      <name>dataSplitterRegexp</name>
     1399      <label>Data splitter</label>
     1400      <description>A regular expression that splits each data line into individual columns. For example, split on tabs: \t</description>
     1401      <class>java.lang.String</class>
     1402      <value>\t</value>
     1403    </parameter>
     1404    <parameter>
     1405      <name>rawDataType</name>
     1406      <label>Raw data type</label>
     1407      <description>The type of raw data that this importer will import.</description>
     1408      <class>java.lang.String</class>
     1409      <value>cufflinks</value>
     1410    </parameter>
     1411    <parameter>
     1412      <name>propertyMapping.fpkm_hi</name>
     1413      <label>FPKM hi</label>
     1414      <description>The upper bound of the 95% confidence interval on the FPKM.</description>
     1415      <class>java.lang.String</class>
     1416      <value>\FPKM_conf_hi\</value>
     1417    </parameter>
     1418    <parameter>
     1419      <name>reporterIdColumnMapping</name>
     1420      <label>Reporter ID</label>
     1421      <description>Mapping that picks the 'External ID' of the spot's reporter from the data columns. For example: \ID\</description>
     1422      <class>java.lang.String</class>
     1423      <value>\&lt;gene_id&gt;\</value>
     1424    </parameter>
     1425    <parameter>
     1426      <name>trimQuotes</name>
     1427      <label>Remove quotes</label>
     1428      <description>If true quotes (" or ') around data value will be removed.</description>
     1429      <class>java.lang.Boolean</class>
     1430      <value>true</value>
     1431    </parameter>
     1432    <parameter>
     1433      <name>propertyMapping.fpkm</name>
     1434      <label>FPKM</label>
     1435      <description>Fragments Per Kilobase of exon per Million fragments mapped.</description>
     1436      <class>java.lang.String</class>
     1437      <value>\FPKM\</value>
     1438    </parameter>
     1439  </configuration>
    12341440</configfile>
  • trunk/src/core/net/sf/basedb/core/DataFileType.java

    r5764 r5773  
    157157  */
    158158  public static final String REF_SEQ_GTF = "refseq.gtf";
     159 
     160  /**
     161    The external ID for the file type representing an FPKM
     162    tracking file.
     163    http://cufflinks.cbcb.umd.edu/manual.html#tracking_format
     164    @since 3.0
     165  */
     166  public static final String FPKM_TRACKING = "sequencing.fpkm_tracking";
     167
    159168 
    160169  /**
  • trunk/src/core/net/sf/basedb/core/Install.java

    r5764 r5773  
    617617      createMimeType("application/octet-stream", "Binary Sequence Alignment/Map", "bam", null, false);
    618618      createMimeType("text/plain", "Gene transfer format", "gtf", null, true);
     619      createMimeType("text/plain", "FPKM tracking file", "fpkm_tracking", rawDataType, true);
    619620 
    620621      // Plate geometries
     
    884885        "The default settings use <transcript_id> to identify genes.",
    885886        Item.ARRAYDESIGN, "gtf", reporterMapType);
     887      DataFileTypeData fpkmTrackingFile = createDataFileType(
     888        DataFileType.FPKM_TRACKING, "FPKM tracking file",
     889        "Gene- or isoform expression levels in Fragments Per Kilobase of exon model per Million mapped fragments.",
     890        Item.RAWBIOASSAY, "fpkm_tracking", rawDataType);
    886891     
    887892      // Platforms and variants
     
    907912        "Variant for sequence data which has been pre-processed enough to make it possible " +
    908913        "for importing into the database. ", false, null, 0,
    909         new PlatformFT(gtfDesignFile, true, false)
     914        new PlatformFT(gtfDesignFile, true, false),
     915        new PlatformFT(fpkmTrackingFile, true, false)
    910916      );
    911917     
  • trunk/src/core/net/sf/basedb/util/parser/JepMapper.java

    r4515 r5773  
    2727
    2828import net.sf.basedb.util.jep.Jep;
     29import net.sf.basedb.util.jep.LeftFunction;
    2930import net.sf.basedb.util.parser.FlatFileParser.Data;
    3031
     
    8990    // Replace: \ColumnName\ with: col('ColumnName')
    9091    expression = expression.replaceAll("\\\\([^\\\\]*)\\\\", "col('$1')");
    91     this.parser = Jep.newJep(expression, colFunction, lineNoFunction, dataNoFunction);
     92    this.parser = Jep.newJep(expression, colFunction, lineNoFunction, dataNoFunction, new LeftFunction());
    9293  }
    9394 
Note: See TracChangeset for help on using the changeset viewer.