source: trunk/doc/src/docbook/appendix/raw_data_types.xml

Last change on this file was 5782, checked in by Nicklas Nordborg, 10 years ago

References #1590: Documentation cleanup

Restructured documentation to generate shorter filenames.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Date Id
File size: 15.5 KB
Line 
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE appendix PUBLIC
3    "-//Dawid Weiss//DTD DocBook V3.1-Based Extension for XML and graphics inclusion//EN"
4    "../../../../lib/docbook/preprocess/dweiss-docbook-extensions.dtd">
5<!--
6  $Id: raw_data_types.xml 5782 2011-10-04 13:43:16Z nicklas $
7 
8  Copyright (C) 2007 Nicklas Nordborg
9 
10  This file is part of BASE - BioArray Software Environment.
11  Available at http://base.thep.lu.se/
12 
13  BASE is free software; you can redistribute it and/or
14  modify it under the terms of the GNU General Public License
15  as published by the Free Software Foundation; either version 3
16  of the License, or (at your option) any later version.
17 
18  BASE is distributed in the hope that it will be useful,
19  but WITHOUT ANY WARRANTY; without even the implied warranty of
20  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21  GNU General Public License for more details.
22 
23  You should have received a copy of the GNU General Public License
24  along with BASE. If not, see <http://www.gnu.org/licenses/>.
25-->
26
27<appendix id="appendix.rawdatatypes">
28  <?dbhtml filename="rawdatatypes.html" ?>
29  <title>Platforms and raw-data-types.xml reference</title>
30 
31  <para>
32    Raw data can be stored either as files attached to items and/or in
33    the database. The <classname docapi="net.sf.basedb.core">Platform</classname> 
34    item has information  about this.  For more information see
35    <xref linkend="core_api.data_in_files" />.
36  </para>
37 
38  <sect1 id="appendix.rawdatatypes.platforms">
39    <title>Default platforms and variants installed with BASE</title>
40   
41    <informaltable>
42      <tgroup cols="7">
43        <colspec colname="platform.name" />
44        <colspec colname="platform.id" />
45        <colspec colname="variant.name" />
46        <colspec colname="variant.id" />
47        <colspec colname="filetype.item" />
48        <colspec colname="filetype.name" />
49        <colspec colname="filetype.id" />
50        <thead>
51          <row>
52            <entry namest="platform.name" nameend="platform.id">Platform</entry>
53            <entry namest="variant.name" nameend="variant.id">Variants</entry>
54            <entry namest="filetype.item" nameend="filetype.id">Data file types</entry>
55          </row>
56          <row>
57            <entry>Name</entry>
58            <entry>ID</entry>
59            <entry>Name</entry>
60            <entry>ID</entry>
61            <entry>Item</entry>
62            <entry>Name</entry>
63            <entry>ID</entry>
64          </row>
65        </thead>
66        <tbody>
67          <row>
68            <entry morerows="2">Generic</entry>
69            <entry morerows="2">generic</entry>
70            <entry morerows="2">-</entry>
71            <entry morerows="2">-</entry>
72
73            <entry morerows="1">Array design</entry>
74            <entry>Reporter map</entry>
75            <entry>generic.reportermap</entry>
76          </row>
77          <row>
78            <entry>Print map</entry>
79            <entry>generic.printmap</entry>
80          </row>
81          <row>
82            <entry>Raw bioassay</entry>
83            <entry>Generic raw data</entry>
84            <entry>generic.rawdata</entry>
85          </row>
86          <row>
87            <entry morerows="1">Affymetrix</entry>
88            <entry morerows="1">affymetrix</entry>
89            <entry morerows="1">-</entry>
90            <entry morerows="1">-</entry>
91            <entry>Array design</entry>
92            <entry>CDF file</entry>
93            <entry>affymetrix.cdf</entry>
94          </row>
95          <row>
96            <entry>Raw bioassay</entry>
97            <entry>CEL file</entry>
98            <entry>affymetrix.cel</entry>
99          </row>
100          <row>
101            <entry morerows="1">Sequencing</entry>
102            <entry morerows="1">sequencing</entry>
103            <entry morerows="1">Expression-like</entry>
104            <entry morerows="1">sequencing.expression</entry>
105            <entry>Array design</entry>
106            <entry>GTF ref-seq file</entry>
107            <entry>refseq.gtf</entry>
108          </row>
109          <row>
110            <entry>Raw bioassay</entry>
111            <entry>FPKM tracking file</entry>
112            <entry>sequencing.fpkm_tracking</entry>
113          </row>
114        </tbody>
115      </tgroup>
116    </informaltable>
117   
118 
119  </sect1>
120 
121  <sect1 id="appendix.rawdatatypes.ref">
122    <title>raw-data-types.xml reference</title>
123   
124    <para>
125      A given platform either supports importing data to the database or it
126      doesn't. If it supports import, it may be locked to a specific raw data type
127      or it may use any raw data type. Among the default platforms installed with
128      BASE, the Affymetrix platform doesn't support importing data while the Generic
129      platform supports importing to any raw data type.
130    </para>
131   
132    <para>
133      Raw data types are defined in the <filename>raw-data-types.xml</filename>
134      file. This file is located in the <filename>&lt;basedir&gt;/www/WEB-INF/classes</filename>
135      directory and contains information about the database tables and columns to
136      use for storing raw data. BASE ships with default raw data types for many
137      different microarray platforms, including Genepix, Agilent and Illumina.
138    </para>
139   
140    <tip>
141      <para>
142        It is also possible to put additional raw data type definitions in the
143        <filename>&lt;basedir&gt;/www/WEB-INF/classes/raw-data-types</filename>
144        subdirectory. BASE will merge all <filename>*.xml</filename> files it finds with
145        the main <filename>raw-data-types.xml</filename> file. The extra
146        configuration files should have the same format as the main
147        <filename>raw-data-types.xml</filename> file. Duplicate raw data types
148        are not supported and it is not possible to add extra columns to
149        existing types using this approach.
150      </para>
151    </tip>
152   
153    <para>
154      If you want your BASE installation to be configured differently we recommend that
155      you do it before the first initialisation of the database.
156      It is possible to change the configuration of an existing BASE installation but it
157      requires manual updates to the database. The following procedure describes how to update:
158    </para>
159
160  <orderedlist>
161  <listitem>
162    <para>
163    Shut down the BASE web server. If you have installed job agents, you should shut
164    them down as well.
165    </para>
166  </listitem>
167 
168  <listitem>
169    <para>
170    Modify the <filename>raw-data-types.xml</filename> file or create a new file
171    in the <filename>raw-data-types</filename> subdirectory. If you have installed
172    job agents, make sure they all have the same version as the web server.
173    </para>
174  </listitem>
175 
176  <listitem>
177    <para>
178    Run the <filename>updatedb.sh</filename> script. Tables for new raw data types
179    and new columns for existing raw data types will automatically be created, but the script
180    can't delete tables or columns that have been removed, or modify columns that have
181    changed datatype. You will have to do these kinds of changes by manually executing
182    SQL against your database. Check your database documentation for information about SQL syntax.
183    </para>
184   
185    <tip>
186      <title>Create a parallel installation</title>
187      <para>
188      You can always create a new temporary parallel installation to check
189      what the table generated by the installation script looks like. Compare the
190      new table to the existing one and make sure they match.
191      </para>
192    </tip>
193  </listitem>
194 
195  <listitem>
196    <para>
197    Start up the BASE web server and job agents, if any, again.
198    </para>
199  </listitem>
200  </orderedlist>
201
202  <tip>
203    <title>Start with few columns</title>
204    <para>
205    It is better to start with too few columns, since it is easier to add
206    more columns than it is to remove columns that are not needed.
207    </para>
208  </tip>
209
210  <bridgehead>Format of the raw-data-types.xml file</bridgehead>
211  <para>
212    The following example will serve as a description of the format used in
213    <filename>raw-data-types.xml</filename>:
214  </para>
215
216
217  <programlisting language="xml">
218<![CDATA[
219<?xml version="1.0" ?>
220<?xml-stylesheet type="text/xsl" href="raw-data-types.xsl"?>
221<!DOCTYPE raw-data-types SYSTEM "raw-data-types.dtd" >
222<raw-data-types>
223   <raw-data-type
224      id="genepix"
225      name="GenePix"
226      channels="2"
227      table="RawDataGenePix"
228      >
229      <property
230         name="diameter"
231         title="Spot diameter"
232         description="The diameter of the spot in µm"
233         column="diameter"
234         type="float"
235      />
236      <property
237         name="ch1FgMedian"
238         title="Channel 1 foreground median"
239         description="The median of the foreground intensity in channel 1"
240         column="ch1_fg_median"
241         type="float"
242         channel="1"
243      />
244      <!-- skipped a lot of properties -->
245      <intensity-formula
246         name="mean"
247         title="Mean FG - Mean BG"
248         description="Subtract mean background from mean foreground"
249         >
250         <formula
251            channel="1"
252            expression="raw('ch1FgMean') - raw('ch1BgMean')"
253         />
254         <formula
255            channel="2"
256            expression="raw('ch2FgMean') - raw('ch2BgMean')"
257         />
258      </intensity-formula>
259      <!-- and a few more... -->
260   </raw-data-type>
261</raw-data-types>
262]]> 
263</programlisting>
264 
265  <para>
266    Each raw data type is represented by a <sgmltag class="starttag">raw-data-type</sgmltag> 
267    tag. The following attributes can be used:
268  </para>
269 
270    <table frame="all" id="appendix.rawdatatypes.tag">
271    <title>Attributes for the <sgmltag class="starttag">raw-data-type</sgmltag> tag</title>
272    <tgroup cols="3" align="left">
273      <colspec colname="attribute" align="left" />
274      <colspec colname="required" />
275      <colspec colname="comment" />
276      <thead>
277        <row>
278          <entry>Attribute</entry>
279          <entry>Required</entry>
280          <entry>Comment</entry>
281        </row>
282      </thead>
283      <tbody>
284        <row>
285          <entry>id</entry>
286          <entry>yes</entry>
287          <entry>
288            A unique ID of the raw data type. It should contain only letters,
289            numbers and underscores and the first character must be a letter.
290          </entry>
291        </row>
292        <row>
293          <entry>name</entry>
294          <entry>yes</entry>
295          <entry>
296            A unique name of the raw data type. The name is usually used by client
297            applications for display.
298          </entry>
299        </row>
300        <row>
301          <entry>table</entry>
302          <entry>yes</entry>
303          <entry>
304            The name of the database table to store data in. The table name
305            must be unique and can only contain letters,
306            numbers and underscores. The first character must be a letter.
307          </entry>
308        </row>
309        <row>
310          <entry>channels</entry>
311          <entry>yes</entry>
312          <entry>
313            The number of channels used by this raw data type. It must be
314            a number &gt; 0.
315          </entry>
316        </row>
317        <row>
318          <entry>description</entry>
319          <entry>no</entry>
320          <entry>
321            An optional (longer) description of the raw data type.
322          </entry>
323        </row>
324      </tbody>
325    </tgroup>
326    </table>
327   
328    <para>
329      Following the <sgmltag class="starttag">raw-data-type</sgmltag> tag
330      are one or more <sgmltag class="starttag">property</sgmltag> tags.
331      Each one defines a column in the database that is designed to hold
332      data values of a particular type. The following attributes can be used
333      on this tag:
334    </para>
335 
336    <table frame="all" id="appendix.rawdatatypes.property">
337    <title>Attributes for the <sgmltag class="starttag">property</sgmltag> tag</title>
338    <tgroup cols="3" align="left">
339      <colspec colname="attribute" align="left" />
340      <colspec colname="required" />
341      <colspec colname="comment" />
342      <thead>
343        <row>
344          <entry>Attribute</entry>
345          <entry>Required</entry>
346          <entry>Comment</entry>
347        </row>
348      </thead>
349      <tbody>
350        <row>
351          <entry>*</entry>
352          <entry></entry>
353          <entry>
354            All attributes defined by the
355            <sgmltag class="starttag">property</sgmltag> tag in
356            <filename>extended-properties.xml</filename>. See
357            <xref linkend="appendix.extendedproperties.property" />.
358          </entry>
359        </row>
360        <row>
361          <entry>channel</entry>
362          <entry>no</entry>
363          <entry>
364            The channel number the property belongs to. Allowed values are 0 to
365            the number of channels specified for the raw data type. If the property
366            doesn't belong to any channel, set the value to 0 or leave it
367            unspecified.
368          </entry>
369        </row>
370      </tbody>
371    </tgroup>
372    </table>
373   
374    <para>
375      Following the <sgmltag class="starttag">property</sgmltag> tags comes 0
376      or more <sgmltag class="starttag">intensity-formula</sgmltag> tags.
377      Each one defines mathematical formulas that can be used to
378      calculate the intensity values from the raw data. In the Genepix case,
379      there are several formulas which differ in the way background is
380      subtracted from foreground intensity values. For other raw data
381      types, the intensity formula may just copy one of the raw data values.
382    </para>
383   
384    <para>
385      The intensity formulas are installed as <classname 
386      docapi="net.sf.basedb.core">Formula</classname> items in the database. This
387      means that you can manually add, change or remove intensity formulas directly
388      from the web interface. The intensity formulas in the <filename>raw-data-types.xml</filename>
389      file are only used at installation time.
390    </para>
391   
392    <para>
393      The <sgmltag class="starttag">intensity-formula</sgmltag> tag has the following
394      attributes:
395    </para>
396   
397    <table frame="all" id="appendix.rawdatatypes.intensity-formula">
398    <title>Attributes for the <sgmltag class="starttag">intensity-formula</sgmltag> tag</title>
399    <tgroup cols="3" align="left">
400      <colspec colname="attribute" align="left" />
401      <colspec colname="required" />
402      <colspec colname="comment" />
403      <thead>
404        <row>
405          <entry>Attribute</entry>
406          <entry>Required</entry>
407          <entry>Comment</entry>
408        </row>
409      </thead>
410      <tbody>
411        <row>
412          <entry>name</entry>
413          <entry>yes</entry>
414          <entry>
415            A unique name for the formula. This is only used during installation.
416          </entry>
417        </row>
418        <row>
419          <entry>title</entry>
420          <entry>yes</entry>
421          <entry>
422            The title of the formula. This is used by client applications for
423            display.
424          </entry>
425        </row>
426        <row>
427          <entry>description</entry>
428          <entry>no</entry>
429          <entry>
430            An optional, longer, description of the formula.
431          </entry>
432        </row>
433      </tbody>
434    </tgroup>
435    </table>
436   
437    <para>
438      The <sgmltag class="starttag">intensity-formula</sgmltag> must contain
439      one <sgmltag class="starttag">formula</sgmltag> tag for each channel
440      of the raw data type. The attributes of this tag are:
441    </para>
442   
443    <table frame="all" id="appendix.rawdatatypes.formula">
444    <title>Attributes for the <sgmltag class="starttag">formula</sgmltag> tag</title>
445    <tgroup cols="3" align="left">
446      <colspec colname="attribute" align="left" />
447      <colspec colname="required" />
448      <colspec colname="comment" />
449      <thead>
450        <row>
451          <entry>Attribute</entry>
452          <entry>Required</entry>
453          <entry>Comment</entry>
454        </row>
455      </thead>
456      <tbody>
457        <row>
458          <entry>channel</entry>
459          <entry>yes</entry>
460          <entry>
461            The channel number. One tag for each channel must be specified. No
462            duplicates are allowed.
463          </entry>
464        </row>
465        <row>
466          <entry>expression</entry>
467          <entry>yes</entry>
468          <entry>
469            The mathematical expression used to calculate the intensities.
470            The expression is parsed with the <classname docapi="net.sf.basedb.util.jep">Jep</classname>
471            parser. It supports the common mathematical operations such as +, -, *, /,
472            and some mathematical functions like log2(), ln(), sqrt(), etc. See the API
473            documentation for Jep for more information. You can also use two special
474            functions developed specifically for this case:
475            <itemizedlist>
476            <listitem>
477              <para>
478              raw(name): Get the value from the raw data property with the given name,
479              for example: <code>raw('ch1FgMedian')</code>.
480              </para>
481            </listitem>
482            <listitem>
483              <para>
484              mean(name): Get the mean value of the raw data property with the given name,
485              for example: <code>mean('ch1BgMean')</code>. The mean is calculated from
486              all raw data spots in the raw bioassay.
487              </para>
488            </listitem>
489            </itemizedlist>
490          </entry>
491        </row>
492      </tbody>
493    </tgroup>
494    </table>
495   
496  </sect1>
497
498</appendix>
499
Note: See TracBrowser for help on using the repository browser.