Changeset 6459
- Timestamp: May 27, 2014, 10:46:05 AM (9 years ago)
- Location: trunk
- Files: 5 edited
Legend:
- Unmodified
- Added
- Removed
trunk/data/plugin_configfile.xml
r5773 r6459 957 957 </parameter> 958 958 <parameter> 959 <name>ignoreRegexp</name> 960 <label>Ignore</label> 961 <description>A regular expression that matches any line that should be ignored. For example, ignore lines starting with #: ^#.*</description> 962 <class>java.lang.String</class> 963 <value>^#.*</value> 964 </parameter> 965 <parameter> 959 966 <name>minDataColumns</name> 960 967 <label>Min data columns</label> … … 1039 1046 </parameter> 1040 1047 <parameter> 1048 <name>ignoreRegexp</name> 1049 <label>Ignore</label> 1050 <description>A regular expression that matches any line that should be ignored. For example, ignore lines starting with #: ^#.*</description> 1051 <class>java.lang.String</class> 1052 <value>^#.*</value> 1053 </parameter> 1054 <parameter> 1041 1055 <name>minDataColumns</name> 1042 1056 <label>Min data columns</label> … … 1121 1135 </parameter> 1122 1136 <parameter> 1137 <name>ignoreRegexp</name> 1138 <label>Ignore</label> 1139 <description>A regular expression that matches any line that should be ignored. For example, ignore lines starting with #: ^#.*</description> 1140 <class>java.lang.String</class> 1141 <value>^#.*</value> 1142 </parameter> 1143 <parameter> 1123 1144 <name>minDataColumns</name> 1124 1145 <label>Min data columns</label> … … 1182 1203 </parameter> 1183 1204 <parameter> 1205 <name>ignoreRegexp</name> 1206 <label>Ignore</label> 1207 <description>A regular expression that matches any line that should be ignored. For example, ignore lines starting with #: ^#.*</description> 1208 <class>java.lang.String</class> 1209 <value>^#.*</value> 1210 </parameter> 1211 <parameter> 1184 1212 <name>trimQuotes</name> 1185 1213 <label>Remove quotes</label> -
trunk/src/core/net/sf/basedb/util/gtf/GtfInputStream.java
r5770 r6459 52 52 in most cases we are not interested in this when importing data to BASE. 53 53 This option also remove the feature, start, end, score, strand and frame 54 columns from the output. 54 columns from the output. Lines that can't be split into at least 9 columns 55 (eg. comment lines starting with #) are ignored and forwarded without modification. 55 56 56 57 @author Nicklas … … 103 104 this.reader = new BufferedReader(new InputStreamReader(master, this.charset)); 104 105 this.ATTRIBUTE_PATTERN = Pattern.compile("([^ ]+) (([^\";]*)|(\"[^\"]*\"));"); 105 init();106 this.buffer = readMore(); 106 107 } 107 108 … … 197 198 { 198 199 return transcriptIds.size(); 199 }200 201 /**202 Initialize the converter by reading the first line from the GTF203 file. The attributes will be extracted and a header row is created204 with the 8 required columns + new colums for each attribute.205 @throws IOException206 */207 private void init()208 throws IOException209 {210 // Read first line211 String[] line = getNextLine();212 if (line == null) return;213 214 // Parse the attributes in 9th column215 parseAttributes(line[8]);216 217 // Generate header line218 StringBuffer sb = new StringBuffer();219 transcriptIds.add(attributes[transcriptIdIndex].value+ '@' + line[0]);220 if (skipRepeatedTranscriptIds)221 {222 sb.append("<seqname>\t<source>");223 }224 else225 {226 sb.append("<seqname>\t<source>\t<feature>\t<start>\t<end>\t<score>\t<strand>\t<frame>");227 }228 for (Attribute attribute : attributes)229 {230 sb.append("\t<").append(attribute.key).append(">");231 }232 sb.append("\n");233 234 // Append first line of data to the buffer235 appendLine(sb, line, attributes);236 buffer = sb.toString().getBytes(charset);237 200 } 238 201 … … 248 211 { 249 212 String[] line; 213 StringBuffer sb = new StringBuffer(); 250 214 do 251 215 { … … 258 222 259 223 // Parse attributes in 9th column 260 parseAttributes(line[8]); 261 262 String id = attributes[transcriptIdIndex].value + '@' + 
line[0]; 263 if (transcriptIds.add(id) && skipRepeatedTranscriptIds) 264 { 224 if (line.length >= 9) 225 { 226 // Generate headers if this is the first line with data 227 boolean generateHeaders = attributes == null; 228 parseAttributes(line[8]); 229 if (generateHeaders) 230 { 231 if (skipRepeatedTranscriptIds) 232 { 233 sb.append("<seqname>\t<source>"); 234 } 235 else 236 { 237 sb.append("<seqname>\t<source>\t<feature>\t<start>\t<end>\t<score>\t<strand>\t<frame>"); 238 } 239 for (Attribute attribute : attributes) 240 { 241 sb.append("\t<").append(attribute.key).append(">"); 242 } 243 sb.append("\n"); 244 } 245 246 // Break out of the loop as soon as we see a new transcript id or if we are including repeated ids 247 String id = attributes[transcriptIdIndex].value + '@' + line[0]; 248 if (transcriptIds.add(id) || !skipRepeatedTranscriptIds) 249 { 250 appendLine(sb, line, attributes); 251 break; 252 } 253 } 254 else 255 { 256 // Append line as it is 257 appendLine(sb, line, null); 265 258 break; 266 259 } … … 268 261 269 262 // Convert to byte[] 270 return appendLine(new StringBuffer(), line, attributes).toString().getBytes(charset); 271 } 272 273 /** 274 Read the next line from the GTF file and split on tab character 275 into 9 or 10 columns. If the line doesn't contain at least 9 columns an 276 exception is thrown. If the 10th column exists it must be a comment 277 column that starts with a # character. 278 263 return sb.toString().getBytes(charset); 264 } 265 266 /** 267 Read the next line from the GTF file and split on tab character. 279 268 */ 280 269 private String[] getNextLine() … … 285 274 lineNum++; 286 275 String[] columns = line.split("\\t", 10); 287 if (columns.length < 9)288 {289 throw new IOException("A line must have at least 9 columns, at line: " + lineNum);290 }291 276 return columns; 292 277 } … … 357 342 358 343 /** 359 Append the first 8 columns to the buffer and then add all values 360 from the attributes. 
344 Append columns to the buffer and separate each with a tab. 345 If attributes are given, the first 8 (or 2 if skipRepeatedTranscriptIds=true) 346 columns are appended, then each of the attributes are appended. 347 If no attributes are given, all columns are copied as they are. 348 361 349 @param sb The buffer to append to 362 350 @param columns The regular columns (must be at least 8) … … 367 355 // First 8 columns are copied with tab separator 368 356 sb.append(columns[0]); 369 int end = skipRepeatedTranscriptIds ? 2 : 8;357 int end = attr == null ? columns.length : (skipRepeatedTranscriptIds ? 2 : 8); 370 358 for (int i = 1; i < end; ++i) 371 359 { 372 360 sb.append("\t").append(columns[i]); 373 361 } 374 // Then follows the attributes375 for (Attribute attribute : attr)376 {377 sb.append("\t").append(attribute.value);378 attribute.value = null;379 }380 /*381 TODO - Handle Comments382 if (line.length == 10)383 {384 sb.append("\t").append(line[9]);385 }386 */362 if (attr != null) 363 { 364 // Then follows the attributes 365 for (Attribute attribute : attr) 366 { 367 sb.append("\t").append(attribute.value); 368 attribute.value = null; 369 } 370 if (columns.length == 10) 371 { 372 sb.append("\t").append(columns[9]); 373 } 374 } 387 375 sb.append("\n"); 388 376 return sb; -
trunk/src/plugins/core/net/sf/basedb/plugins/gtf/DefaultConfigurationValues.java
r5770 r6459 94 94 defaultValues.put("dataSplitterRegexp", "\\t"); 95 95 defaultValues.put("dataHeaderRegexp", "<seqname>\\t.*<transcript_id>.*"); 96 defaultValues.put("ignoreRegexp", "^#.*"); 96 97 defaultValues.put("minDataColumns", 4); 97 98 defaultValues.put("complexExpressions", "allow"); -
trunk/src/test/TestGtfInputStream.java
r5770 r6459 62 62 ffp.setDataHeaderRegexp(Pattern.compile("\\<seqname\\>.*")); 63 63 ffp.setDataSplitterRegexp(Pattern.compile("\\t")); 64 ffp.setIgnoreRegexp(Pattern.compile("^#.*")); 64 65 ffp.setMinDataColumns(4); 65 66 ffp.setMaxDataColumns(-1); -
trunk/src/test/data/test.gtf
r5759 r6459 1 # comment 1 2 chr1 hg19_refGene start_codon 67000042 67000044 0 + . gene_id "NM_032291"; transcript_id "NM_032291"; gene_name2 "SGIP1"; 2 3 chr1 hg19_refGene CDS 67000042 67000051 0 + 0 gene_id "NM_032291"; transcript_id "NM_032291"; gene_name2 "SGIP1";
Note: See TracChangeset for help on using the changeset viewer.