1 | #!/usr/bin/perl |
---|
2 | |
---|
3 | # $Id: cgh_dataDumper.pl 720 2008-06-11 14:05:18Z jari $ |
---|
4 | |
---|
5 | # Document name: cgh_dataDumper.pl |
---|
6 | # cgh_dataDumper.pl version 2.0 |
---|
7 | # |
---|
8 | # |
---|
9 | # Copyright (C) 2004 Johan Staaf |
---|
10 | |
---|
11 | # cgh_dataDumper.pl is free software; you can redistribute it and/or |
---|
12 | # modify it under the terms of the GNU General Public License |
---|
13 | # as published by the Free Software Foundation; either version 2 |
---|
14 | # of the License, or (at your option) any later version. |
---|
15 | |
---|
16 | # This program is distributed in the hope that it will be useful, |
---|
17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
19 | # GNU General Public License for more details. |
---|
20 | |
---|
21 | # You should have received a copy of the GNU General Public License |
---|
22 | # along with this program; if not, write to the Free Software |
---|
23 | # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
---|
24 | |
---|
25 | # johan.staaf@onk.lu.se |
---|
26 | # Johan Staaf, Dept Oncology, Lund University, S-221 85 Lund, Sweden |
---|
27 | |
---|
28 | use strict; |
---|
29 | use FindBin (); |
---|
30 | |
---|
31 | # VER 1.0 of cgh_dataDumper.pl |
---|
32 | |
---|
33 | |
---|
34 | ######## BASE file requirements ######### |
---|
35 | # The BASEfile format should be normal. |
---|
36 | # expect per row: |
---|
37 | # [reporterId] [geneSymbol] [chromosome] [cytoBand] [startPosition] [endPosition] [assayData = M] |
---|
38 | |
---|
39 | |
---|
40 | ############################################ |
---|
41 | ############# MAIN PROGRAM ############# |
---|
42 | ############################################ |
---|
43 | |
---|
44 | |
---|
45 | ### Main Variables ### |
---|
46 | my (@final); |
---|
47 | my (%param_hash,%base_hash,%bacToGene); |
---|
48 | my @annoStat; |
---|
49 | ################# |
---|
50 | |
---|
51 | ## Read the file and extract BASE column data ####### |
---|
52 | |
---|
53 | $/="%"; #Gives every section as one lump |
---|
54 | my $i=0; |
---|
55 | |
---|
56 | while(<>){ |
---|
57 | $i++; |
---|
58 | chomp; |
---|
59 | if($i==1){ #Parameters |
---|
60 | |
---|
61 | $_ =~ /filename\t(.+)\n/; |
---|
62 | $param_hash{'filename'}=$1; |
---|
63 | $_ =~ /sort\t(\w+)\n/; |
---|
64 | $param_hash{'sort'}=$1; |
---|
65 | $_ =~ /exportMode\t(\w+)\n/; |
---|
66 | $param_hash{'exportMode'}=$1; |
---|
67 | $_ =~ /annotationType\t(.+)\n/; |
---|
68 | $param_hash{'annotationType'}=$1; |
---|
69 | |
---|
70 | $_ =~ /annoForStat\t(.+)\n/; #in format |ER Status|p_brca_family_status|etc..| |
---|
71 | $param_hash{'annoForStat'}=$1; |
---|
72 | my @tmp=split(/\|/,$param_hash{'annoForStat'}); #holding the annotations to get stats for! |
---|
73 | foreach my $line (@tmp){ |
---|
74 | if($line ne ""){ |
---|
75 | push(@annoStat,$line); |
---|
76 | } |
---|
77 | } |
---|
78 | }elsif($i==2){ #section assays info |
---|
79 | $_=~ /count\t(\d*)/; |
---|
80 | $base_hash{'NumberOfAssays'}=$1; |
---|
81 | $_=~ /columns\t(.*)/; |
---|
82 | $base_hash{'AssayColumns'}=$1; |
---|
83 | $_=~ /annotationColumns\t(.*)/; |
---|
84 | $base_hash{'AnnotationColumns'}=$1; |
---|
85 | }elsif($i==3){ #assay info + section spots |
---|
86 | $_=~ /columns\t(.*)\tassayData/; |
---|
87 | $base_hash{'Columns'}=$1; |
---|
88 | $_=~ /channels\t(.*)/; |
---|
89 | $base_hash{'channels'}=$1; |
---|
90 | $_=~/assayFields\t(.*)/; |
---|
91 | $base_hash{'assayFields'}=$1; |
---|
92 | (my $text,my $dump)=split("\n\n",$_); |
---|
93 | $text=~ s/^\n//; |
---|
94 | $base_hash{'Assays'}=$text; |
---|
95 | last; #Dont read in the large bulk of data now! |
---|
96 | } |
---|
97 | } |
---|
98 | #### OBSERVE, all base headers now printed to STDOUT! |
---|
99 | |
---|
100 | |
---|
101 | |
---|
102 | ## Do export of data points, global boolean |
---|
103 | my $doExport="TRUE"; |
---|
104 | |
---|
105 | |
---|
106 | #### Locate the assay field columns for each annotation group to extract data from |
---|
107 | my @assays = split("\n",$base_hash{'Assays'}); |
---|
108 | my @assaynames = (); |
---|
109 | foreach my $line (@assays) { |
---|
110 | my @tmp=split("\t",$line); |
---|
111 | push(@assaynames,$tmp[1]); #keep only the assay name in the order sent to the plugin |
---|
112 | } |
---|
113 | |
---|
114 | if($param_hash{'exportMode'} eq "sampleName_annotation"){ |
---|
115 | my @sampleannotations; |
---|
116 | |
---|
117 | ### Assign an assay to an annotation type and save in @control vector |
---|
118 | my @base_annotations=split("\t",$base_hash{'AnnotationColumns'}); |
---|
119 | my $annotation_index= -1; #tracks which column index to use to get the annotation values |
---|
120 | for (my $i=0;$i<@base_annotations;$i++){ |
---|
121 | if($param_hash{'annotationType'} eq $base_annotations[$i]){ #found the right annotation! |
---|
122 | $annotation_index=$i; |
---|
123 | last; |
---|
124 | } |
---|
125 | } |
---|
126 | for (my $i=0;$i<@assays;$i++) { |
---|
127 | my @tmp=split("\t",$assays[$i]); |
---|
128 | if($annotation_index > -1){ # a real annotation index found |
---|
129 | if($tmp[$annotation_index+2] eq ""){ #[assaynbr] [assayname] then annotations, thats why +2 |
---|
130 | $sampleannotations[$i]="NONE"; #Na for annotation for this assay! |
---|
131 | }else{ |
---|
132 | $sampleannotations[$i]=$tmp[$annotation_index+2]; #a real annotation |
---|
133 | } |
---|
134 | }else{ |
---|
135 | #basically this means no annotation analysis from start, use Na instead in control file |
---|
136 | $sampleannotations[$i]="NONE"; #Na for annotation for this assay! |
---|
137 | |
---|
138 | } |
---|
139 | } |
---|
140 | |
---|
141 | ### print the binding of @assaynames, @sampleannotations |
---|
142 | open(myfile,">SampleNames_and_selectedAnnotation.txt"); |
---|
143 | for (my $i=0;$i<@assaynames;$i++) { |
---|
144 | print myfile "$assaynames[$i]\t$sampleannotations[$i]\n"; |
---|
145 | } |
---|
146 | close(myfile); |
---|
147 | $doExport="FALSE"; |
---|
148 | }elsif($param_hash{'exportMode'} eq "sampleName"){ |
---|
149 | |
---|
150 | ### print @assaynames |
---|
151 | open(myfile,">SampleNames.txt"); |
---|
152 | for (my $i=0;$i<@assaynames;$i++) { |
---|
153 | print myfile "$assaynames[$i]\n"; |
---|
154 | } |
---|
155 | close(myfile); |
---|
156 | $doExport="FALSE"; |
---|
157 | }elsif(($param_hash{'exportMode'} eq "annotationStatistics") && ($param_hash{'annoForStat'} ne "NULL") ){ |
---|
158 | my @base_annotations=split("\t",$base_hash{'AnnotationColumns'}); #all possible base annotations for this sample |
---|
159 | |
---|
160 | ### Fixing the issue with blanks ( :( ) |
---|
161 | my $annos=$base_hash{'Assays'}; |
---|
162 | $annos =~s/\t\t/\tNA\t/g; |
---|
163 | $annos =~s/\t\n/\tNA\n/g; |
---|
164 | $annos =~s/\t\t/\tNA\t/g; |
---|
165 | my @annos=split("\n",$annos); #a row is a sample with its annotations |
---|
166 | |
---|
167 | #print "$base_hash{'Assays'}"; |
---|
168 | #print "\n#####\n"; |
---|
169 | #print "nbr assay $base_hash{'AnnotationColumns'}\n"; |
---|
170 | #print "$annos"; |
---|
171 | |
---|
172 | ### Find the subset of annotations to get statistics for |
---|
173 | my @annotation_index; #tracks which column index to use to get the annotation values |
---|
174 | for (my $i=0;$i<@base_annotations;$i++){ |
---|
175 | foreach my $anno (@annoStat){ |
---|
176 | if(($anno eq $base_annotations[$i]) && ($anno ne "")){ #found the right annotation! |
---|
177 | push(@annotation_index,$i); |
---|
178 | last; |
---|
179 | } |
---|
180 | } |
---|
181 | } |
---|
182 | |
---|
183 | # @annoStat and @annotation_index are now matched! |
---|
184 | my %annoStat; #key = selected annotation type, value = the column of values for that type, |
---|
185 | # separated with tab. |
---|
186 | my %annoUnique; #key = selected annotation type, value = unique annotation values for that type |
---|
187 | for(my $i=0;$i<@annoStat;$i++){ |
---|
188 | #foreach selected annotation extract its column |
---|
189 | my @tempCol; |
---|
190 | foreach my $line (@annos) { |
---|
191 | my @tmp=split("\t",$line); |
---|
192 | shift(@tmp); #removing assay id |
---|
193 | shift(@tmp); #removing assay name |
---|
194 | # only the annotations are left! |
---|
195 | push(@tempCol,$tmp[$annotation_index[$i]]); |
---|
196 | } |
---|
197 | $annoStat{$annoStat[$i]}=join("\t",@tempCol); |
---|
198 | my %saw; |
---|
199 | @saw{@tempCol}=(); |
---|
200 | $annoUnique{$annoStat[$i]}=join("\t",keys(%saw)); |
---|
201 | } |
---|
202 | open(myfile,">AnnotationStatistics.txt"); |
---|
203 | foreach my $annotype (keys(%annoUnique)){ |
---|
204 | #count how many instances of each unique annotation value there is for this annotation type |
---|
205 | my @uniqueV=split("\t",$annoUnique{$annotype}); |
---|
206 | my @values=split("\t",$annoStat{$annotype}); |
---|
207 | my %saw; |
---|
208 | print myfile "$annotype\tNbrAssays\n"; |
---|
209 | foreach my $uniqueValue (@uniqueV){ |
---|
210 | my $r=scalar(grep(/$uniqueValue/,@values)); |
---|
211 | print myfile "$uniqueValue\t$r\n"; |
---|
212 | } |
---|
213 | print myfile "\n"; |
---|
214 | } |
---|
215 | close(myfile); |
---|
216 | $doExport="FALSE"; |
---|
217 | }elsif($param_hash{'exportMode'} eq "annotationAll"){ |
---|
218 | #create an annotation file for all annotations |
---|
219 | |
---|
220 | my @base_annotations=split("\t",$base_hash{'AnnotationColumns'}); #all possible base annotations for this sample |
---|
221 | |
---|
222 | ### Fixing the issue with blanks ( :( ) |
---|
223 | my $annos=$base_hash{'Assays'}; |
---|
224 | $annos =~s/\t\t/\tNA\t/g; |
---|
225 | $annos =~s/\t\n/\tNA\n/g; |
---|
226 | $annos =~s/\t\t/\tNA\t/g; |
---|
227 | my @annos=split("\n",$annos); #a row is a sample with its annotations |
---|
228 | |
---|
229 | open(myfile,">AnnotationAllFile.txt"); |
---|
230 | print myfile "Assay\t$base_hash{'AnnotationColumns'}\n"; |
---|
231 | foreach my $line (@annos){ |
---|
232 | my @tmp=split("\t",$line); |
---|
233 | shift(@tmp); #remove the BASE assay id nummer |
---|
234 | $line=join("\t",@tmp); |
---|
235 | print myfile "$line\n"; |
---|
236 | } |
---|
237 | close(myfile); |
---|
238 | $doExport="FALSE"; |
---|
239 | }elsif(($param_hash{'exportMode'} eq "annotationSelected") && ($param_hash{'annoForStat'} ne "NULL")){ |
---|
240 | #create an annotation file based on only selected annotations |
---|
241 | |
---|
242 | my @base_annotations=split("\t",$base_hash{'AnnotationColumns'}); #all possible base annotations for this sample |
---|
243 | |
---|
244 | ### Fixing the issue with blanks ( :( ) |
---|
245 | my $annos=$base_hash{'Assays'}; |
---|
246 | $annos =~s/\t\t/\tNA\t/g; |
---|
247 | $annos =~s/\t\n/\tNA\n/g; |
---|
248 | $annos =~s/\t\t/\tNA\t/g; |
---|
249 | my @annos=split("\n",$annos); #a row is a sample with its annotations |
---|
250 | |
---|
251 | |
---|
252 | ### Find the subset of annotations to get statistics for |
---|
253 | my @annotation_index; #tracks which column index to use to get the annotation values |
---|
254 | my @matched_annoStat; |
---|
255 | |
---|
256 | foreach my $anno (@annoStat){ |
---|
257 | if($anno =~/~/){ |
---|
258 | # pattern matching!! Get all annotations that match the specified pattern |
---|
259 | # e.g. ~Her130 = all Her130 annotations |
---|
260 | $anno=~ /~(.+)/; |
---|
261 | my $pattern=$1; |
---|
262 | for(my $i=0;$i<@base_annotations;$i++){ |
---|
263 | if($base_annotations[$i]=~/$pattern/){ |
---|
264 | push(@matched_annoStat,$base_annotations[$i]); |
---|
265 | push(@annotation_index,$i); |
---|
266 | } |
---|
267 | } |
---|
268 | }else{ |
---|
269 | for (my $i=0;$i<@base_annotations;$i++){ |
---|
270 | if(($anno eq $base_annotations[$i]) && ($anno ne "")){ #found the right annotation! |
---|
271 | push(@matched_annoStat,$anno); |
---|
272 | push(@annotation_index,$i); |
---|
273 | last; |
---|
274 | } |
---|
275 | } |
---|
276 | } |
---|
277 | } |
---|
278 | # @annoStat and @annotation_index are now matched! |
---|
279 | |
---|
280 | my $annoHeader=join("\t","Assay",@matched_annoStat); |
---|
281 | open(myfile,">AnnotationSelectedFile.txt"); |
---|
282 | print myfile "$annoHeader\n"; |
---|
283 | foreach my $line (@annos){ |
---|
284 | my @tmp=split("\t",$line); |
---|
285 | my $printline=$tmp[1]; #assay name |
---|
286 | for(my $i=0;$i<@matched_annoStat;$i++){ |
---|
287 | #print "$annotation_index[$i] $tmp[$annotation_index[$i]+2]\n"; |
---|
288 | $printline=join("\t",$printline,$tmp[$annotation_index[$i]+2]); |
---|
289 | } |
---|
290 | print myfile "$printline\n"; |
---|
291 | } |
---|
292 | close(myfile); |
---|
293 | $doExport="FALSE"; |
---|
294 | }elsif(($param_hash{'exportMode'} eq "annotationDisplaySelected") && ($param_hash{'annoForStat'} ne "NULL")){ |
---|
295 | #create an annotation display file based on only selected annotations |
---|
296 | |
---|
297 | my @base_annotations=split("\t",$base_hash{'AnnotationColumns'}); #all possible base annotations for this sample |
---|
298 | |
---|
299 | ### Fixing the issue with blanks ( :( ) |
---|
300 | my $annos=$base_hash{'Assays'}; |
---|
301 | $annos =~s/\t\t/\tNA\t/g; |
---|
302 | $annos =~s/\t\n/\tNA\n/g; |
---|
303 | $annos =~s/\t\t/\tNA\t/g; |
---|
304 | my @annos=split("\n",$annos); #a row is a sample with its annotations |
---|
305 | |
---|
306 | |
---|
307 | ### Find the subset of annotations to get statistics for |
---|
308 | my @annotation_index; #tracks which column index to use to get the annotation values |
---|
309 | my @matched_annoStat; |
---|
310 | |
---|
311 | foreach my $anno (@annoStat){ |
---|
312 | if($anno =~/~/){ |
---|
313 | # pattern matching!! Get all annotations that match the specified pattern |
---|
314 | # e.g. ~Her130 = all Her130 annotations |
---|
315 | $anno=~ /~(.+)/; |
---|
316 | my $pattern=$1; |
---|
317 | for(my $i=0;$i<@base_annotations;$i++){ |
---|
318 | if($base_annotations[$i]=~/$pattern/){ |
---|
319 | push(@matched_annoStat,$base_annotations[$i]); |
---|
320 | push(@annotation_index,$i); |
---|
321 | } |
---|
322 | } |
---|
323 | }else{ |
---|
324 | for (my $i=0;$i<@base_annotations;$i++){ |
---|
325 | if(($anno eq $base_annotations[$i]) && ($anno ne "")){ #found the right annotation! |
---|
326 | push(@matched_annoStat,$anno); |
---|
327 | push(@annotation_index,$i); |
---|
328 | last; |
---|
329 | } |
---|
330 | } |
---|
331 | } |
---|
332 | } |
---|
333 | # @annoStat and @annotation_index are now matched! |
---|
334 | # i.e matched_annoStat contains all annotation types to extract data from, |
---|
335 | # and annotation_index contains which columns to use respectively. |
---|
336 | |
---|
337 | # go through each matched_annoStat entry, find the unique annotation values (k) |
---|
338 | # and for each unique value, create a column with value 1 for match on the unique value |
---|
339 | #for a given sample, 0 for no match. Iterate this over all matched annotation types. |
---|
340 | # blank samples have the value NA in @annos. They retain NA in the export, however NA is not |
---|
341 | # allowed as a column type! |
---|
342 | |
---|
343 | my @annotationDisplayMatrix; #N rows = N samples |
---|
344 | my @annotationDisplayHeader; |
---|
345 | push(@annotationDisplayHeader,"AssayName"); |
---|
346 | |
---|
347 | #Initiate the Display matrix with sample names |
---|
348 | for(my $j=0;$j<@annos;$j++){ |
---|
349 | my @tmp=split("\t",$annos[$j]); |
---|
350 | push(@annotationDisplayMatrix,$tmp[1]); #initiate with sample name! |
---|
351 | } |
---|
352 | |
---|
353 | for(my $i=0;$i<@matched_annoStat;$i++){ |
---|
354 | my %uniqueValues; |
---|
355 | my @uniqueValueColumns; |
---|
356 | |
---|
357 | #find unique annotation values |
---|
358 | for(my $j=0;$j<@annos;$j++){ |
---|
359 | my @tmp=split("\t",$annos[$j]); |
---|
360 | if(($tmp[$annotation_index[$i]+2] eq "NA") || ($tmp[$annotation_index[$i]+2] eq "")){ |
---|
361 | #not allowed as a unique entry |
---|
362 | }else{ |
---|
363 | $uniqueValues{$tmp[$annotation_index[$i]+2]}=""; |
---|
364 | } |
---|
365 | } |
---|
366 | my @uniqueValueColumn=sort(keys(%uniqueValues)); |
---|
367 | |
---|
368 | |
---|
369 | #foreach of the unique annotation values, make a column, 1 for present, 0 for absent |
---|
370 | # for each sample |
---|
371 | foreach my $uniqueValue (@uniqueValueColumn){ |
---|
372 | #my $ttt<-$uniqueValue; |
---|
373 | #$ttt =~s/\s/\_/g; |
---|
374 | push(@annotationDisplayHeader,$uniqueValue); #push to the header |
---|
375 | for(my $j=0;$j<@annos;$j++){ |
---|
376 | #for each sample |
---|
377 | my @tmp=split("\t",$annos[$j]); |
---|
378 | |
---|
379 | if($tmp[$annotation_index[$i]+2] eq $uniqueValue){ |
---|
380 | # push in a 1 |
---|
381 | $annotationDisplayMatrix[$j]=$annotationDisplayMatrix[$j]."\t".1; |
---|
382 | }else{ |
---|
383 | #push in a 0 |
---|
384 | $annotationDisplayMatrix[$j]=$annotationDisplayMatrix[$j]."\t".0; |
---|
385 | } |
---|
386 | } |
---|
387 | } |
---|
388 | } |
---|
389 | |
---|
390 | my $annoHeader= join("\t",@annotationDisplayHeader); |
---|
391 | |
---|
392 | open(myfile,">AnnotationDisplaySelectedFile.txt"); |
---|
393 | print myfile "$annoHeader\n"; |
---|
394 | foreach my $row (@annotationDisplayMatrix){ |
---|
395 | print myfile "$row\n"; |
---|
396 | |
---|
397 | } |
---|
398 | close(myfile); |
---|
399 | $doExport="FALSE"; |
---|
400 | }#end annotationDisplay |
---|
401 | |
---|
402 | |
---|
403 | if($doExport eq "TRUE"){ |
---|
404 | #### Determine how many common columns to add to get to assaydata |
---|
405 | my @cc = split("\t",$base_hash{'Columns'}); |
---|
406 | |
---|
407 | #### Make header and calculate nbr of data columns in total |
---|
408 | my $header=join("\t",@cc,@assaynames); |
---|
409 | my $nbrCols=scalar(@cc)+scalar(@assaynames); |
---|
410 | |
---|
411 | |
---|
412 | ####### Parse the spot data one row at a time ####### |
---|
413 | # each row looks like: |
---|
414 | # [reporterId] [geneSymbol] [chromosome] [cytoBand] [startPosition] [endPosition] [assayData = M] |
---|
415 | my $chromosomeIndex=2; |
---|
416 | my $geneIndex=1; |
---|
417 | my $repIdIndex=0; |
---|
418 | my $endPositionIndex=5; |
---|
419 | my @highlightToWrite=(); |
---|
420 | my @toSort=(); #vector for holding data to sort in format as above |
---|
421 | |
---|
422 | $/="\n"; #reset |
---|
423 | my $found = "FALSE"; |
---|
424 | |
---|
425 | |
---|
426 | ## Create a file with the assay names for the stac_format.R script. Also modify the header for stac output |
---|
427 | ## so that the header is sample1 .. sampleN instead of real assay names (column conflict in R for weird assay names |
---|
428 | if($param_hash{'exportMode'} eq "stac"){ |
---|
429 | $header=join("\t",@cc); |
---|
430 | my $index=0; |
---|
431 | open(myoutfile,">assays.txt"); |
---|
432 | foreach my $line (@assaynames){ |
---|
433 | $index++; |
---|
434 | print myoutfile "$line\n"; |
---|
435 | $header=$header."\t"."sample".$index; |
---|
436 | } |
---|
437 | close(myoutfile); |
---|
438 | } |
---|
439 | |
---|
440 | if(($param_hash{'exportMode'} eq "standard_lite") || ($param_hash{'exportMode'} eq "singleLite")){ |
---|
441 | #lite format is reporterId,chromosome,startPosition,endPosition |
---|
442 | $header=join("\t","reporterId","chromosome","startPosition","endPosition",@assaynames); |
---|
443 | } |
---|
444 | |
---|
445 | my $file="basematrix.txt"; |
---|
446 | if($param_hash{'filename'} ne "NULL"){ |
---|
447 | $file=$param_hash{'filename'}; |
---|
448 | } |
---|
449 | |
---|
450 | ## open file for printing of data values |
---|
451 | if($param_hash{'exportMode'} ne "singleLite"){ |
---|
452 | open (myoutfile,">$file"); |
---|
453 | } |
---|
454 | |
---|
455 | if(($param_hash{'exportMode'} ne "agilent") || ($param_hash{'exportMode'} ne "bed") || ($param_hash{'exportMode'} ne "singleLite") ){ |
---|
456 | #print defined header if not Agilent mode, or BED or single lite |
---|
457 | print myoutfile "$header\n"; |
---|
458 | } |
---|
459 | |
---|
460 | while(<>){ |
---|
461 | chomp; |
---|
462 | if($_ =~ /section spots/){ #found the spot section |
---|
463 | $found="TRUE"; |
---|
464 | next; #skip to next line ? |
---|
465 | } |
---|
466 | if(($_ !~/columns/) || ($_ !~/channels 2/) || ($_ !~ /assayFields/) || ($_ !~/assays/) || ($_ !~/%/) && ($found eq "TRUE") ){ |
---|
467 | # now reading real data line |
---|
468 | if ($_ eq ""){ |
---|
469 | # do nada |
---|
470 | }else{ |
---|
471 | # [reporterId] [geneSymbol] [chromosome] [cytoBand] [startPosition] [endPosition] [assayData = M] |
---|
472 | my @temp = split("\t",$_); #split each line |
---|
473 | if( ($temp[$endPositionIndex]>0) && ($temp[$chromosomeIndex]=~ /\w/) && ($temp[$chromosomeIndex]>0) ){ #reporterId must have a bp end and a chromosome not = "" |
---|
474 | foreach my $row (@temp){ #replace empty string with NaN .. |
---|
475 | if($row eq ""){ |
---|
476 | $row = "NaN"; |
---|
477 | } |
---|
478 | } |
---|
479 | if(scalar(@temp)<$nbrCols){ # To cope with dropped tabs.. |
---|
480 | while(scalar(@temp)<$nbrCols){ |
---|
481 | push(@temp,"NaN"); |
---|
482 | } |
---|
483 | } |
---|
484 | my $line; |
---|
485 | if(($param_hash{'exportMode'} eq "standard_lite") || ($param_hash{'exportMode'} eq "singleLite")){ |
---|
486 | splice(@temp, 1, 1); |
---|
487 | #new format of temp |
---|
488 | # [reporterId] [chromosome] [cytoBand] [startPosition] [endPosition] [assayData = M] |
---|
489 | splice(@temp, 2, 1); |
---|
490 | #new format of temp |
---|
491 | # [reporterId] [chromosome] [startPosition] [endPosition] [assayData = M] |
---|
492 | $line = join("\t",@temp); |
---|
493 | }elsif($param_hash{'exportMode'} eq "bed"){ |
---|
494 | #format: chr start stop reporterId |
---|
495 | my $bed_chrom="chr".$temp[$chromosomeIndex]; |
---|
496 | if($temp[$chromosomeIndex] eq "23"){ |
---|
497 | $bed_chrom="chrX"; |
---|
498 | } |
---|
499 | if($temp[$chromosomeIndex] eq "24"){ |
---|
500 | $bed_chrom="chrY"; |
---|
501 | } |
---|
502 | $line = join("\t",$bed_chrom,$temp[4],$temp[$endPositionIndex],$temp[$repIdIndex]); |
---|
503 | }else{ |
---|
504 | $line = join("\t",@temp); |
---|
505 | } |
---|
506 | |
---|
507 | if(($param_hash{'sort'} eq "TRUE") || ($param_hash{'exportMode'} eq "singleLite") || ($param_hash{'exportMode'} eq "agilent") || ($param_hash{'exportMode'} eq "stac") ){ |
---|
508 | push(@toSort,$line); |
---|
509 | }else{ |
---|
510 | print myoutfile "$line\n"; |
---|
511 | } |
---|
512 | } |
---|
513 | } |
---|
514 | }# end of if ( !~/columns etc.. |
---|
515 | } #end of while(<>) |
---|
516 | |
---|
517 | if($param_hash{'exportMode'} eq "stac"){ |
---|
518 | #to format the data correctly and easiest, use a small R script, stac_format.R |
---|
519 | open(myfile,"<$FindBin::Bin/stac_format.R"); |
---|
520 | open (myoutfile, ">stac_formatCommands.R"); |
---|
521 | while(<myfile>){ |
---|
522 | chomp; |
---|
523 | print myoutfile "$_\n"; |
---|
524 | } |
---|
525 | close (myfile); |
---|
526 | close (myoutfile); |
---|
527 | |
---|
528 | my $success = system("R --no-save < stac_formatCommands.R > r-debug.txt"); |
---|
529 | |
---|
530 | #catch the output |
---|
531 | |
---|
532 | } |
---|
533 | |
---|
534 | ## Sort data and print it if selected |
---|
535 | if(($param_hash{'sort'} eq "TRUE") || ($param_hash{'exportMode'} eq "agilent") || ($param_hash{'exportMode'} eq "singleLite")){ |
---|
536 | #sort on chromosome and then on startPosition |
---|
537 | if($param_hash{'sort'} eq "TRUE"){ |
---|
538 | if(($param_hash{'exportMode'} eq "standard_lite") || ($param_hash{'exportMode'} eq "singleLite")){ |
---|
539 | # [reporterId] [chromosome] [startPosition] [endPosition] [assayData = M] |
---|
540 | @toSort = |
---|
541 | map $_->[0] => |
---|
542 | sort { $a->[1] <=> $b->[1] |
---|
543 | || |
---|
544 | $a->[2] <=> $b->[2] } |
---|
545 | map [ $_, (split /\t/)[1], (split /\t/)[2] ] |
---|
546 | #map [ $_, (split /\t/)[1,2] ] |
---|
547 | => @toSort; |
---|
548 | }elsif($param_hash{'exportMode'} eq "bed"){ |
---|
549 | #format: [chromosome] [startPosition] [stopPosition] [reporterId] |
---|
550 | @toSort = |
---|
551 | map $_->[0] => |
---|
552 | sort { $a->[1] <=> $b->[1] |
---|
553 | || |
---|
554 | $a->[2] <=> $b->[2] } |
---|
555 | map [ $_, (split /\t/)[0], (split /\t/)[1] ] |
---|
556 | => @toSort; |
---|
557 | }else{ |
---|
558 | #standard format |
---|
559 | # [reporterId] [geneSymbol] [chromosome] [cytoBand] [startPosition] [endPosition] [assayData = M] |
---|
560 | |
---|
561 | @toSort = |
---|
562 | map $_->[0] => |
---|
563 | sort { $a->[1] <=> $b->[1] || |
---|
564 | $a->[2] <=> $b->[2] } |
---|
565 | map [ $_, (split /\t/)[2], (split /\t/)[4] ] |
---|
566 | => @toSort; |
---|
567 | } |
---|
568 | }#end if to sort data |
---|
569 | if($param_hash{'exportMode'} eq "singleLite"){ |
---|
570 | # split the matrix into separate files... |
---|
571 | # [reporterId] [chromosome] [startPosition] [endPosition] [assayData = M] |
---|
572 | |
---|
573 | #split header |
---|
574 | my @tmpHeader=split("\t",$header); |
---|
575 | my $commonHeader=join("\t",shift(@tmpHeader),shift(@tmpHeader),shift(@tmpHeader),shift(@tmpHeader)); |
---|
576 | my $ccN=4; |
---|
577 | for(my $i=0;$i<@tmpHeader;$i++){ |
---|
578 | #foreach assay create individual files, through iteration.. VERY SLOW! |
---|
579 | my $fileN=$tmpHeader[$i]."_".$file; |
---|
580 | open(mySingleFileHandle,">$fileN"); |
---|
581 | print mySingleFileHandle "$commonHeader\t$tmpHeader[$i]\n"; |
---|
582 | for(my $j=0;$j<@toSort;$j++){ |
---|
583 | my @ttt= split("\t",$toSort[$j]); |
---|
584 | my $lineN=join("\t",$ttt[0],$ttt[1],$ttt[2],$ttt[3],$ttt[$i+$ccN]); |
---|
585 | print mySingleFileHandle "$lineN\n"; |
---|
586 | } |
---|
587 | close(mySingleFileHandle); |
---|
588 | } |
---|
589 | }elsif($param_hash{'exportMode'} eq "agilent"){ |
---|
590 | #create the agilent format and print |
---|
591 | my $headerA=join("\t","FeatureNum","Chr","Start","Stop","SystematicName","Gene Name","Description",@assaynames); |
---|
592 | my $featureNum=0; |
---|
593 | print myoutfile "AMADID 1234567\n"; #check if to use random number? |
---|
594 | print myoutfile "$headerA\n"; |
---|
595 | |
---|
596 | # Format of toSort: |
---|
597 | # [reporterId] [geneSymbol] [chromosome] [cytoBand] [startPosition] [endPosition] [assayData = M] |
---|
598 | foreach my $line (@toSort){ |
---|
599 | $featureNum++; |
---|
600 | my @tmp=split("\t",$line); |
---|
601 | if($tmp[2] eq "23"){ |
---|
602 | $tmp[2]="X"; |
---|
603 | }elsif($tmp[2] eq "24"){ |
---|
604 | $tmp[2]="Y"; |
---|
605 | } |
---|
606 | #my $agilentLine=join("\t",$featureNum,$tmp[2],$tmp[4],$tmp[5],$tmp[0],$tmp[1],""); #if it works with bundled gene symbols.. |
---|
607 | my $agilentLine=join("\t",$featureNum,$tmp[2],$tmp[4],$tmp[5],$tmp[0],"",""); |
---|
608 | |
---|
609 | splice(@tmp,0,6); #only log2data left |
---|
610 | $agilentLine=join("\t",$agilentLine,@tmp); |
---|
611 | print myoutfile "$agilentLine\n"; |
---|
612 | } |
---|
613 | }else{ |
---|
614 | #just print! |
---|
615 | foreach my $line (@toSort){ |
---|
616 | print myoutfile "$line\n"; |
---|
617 | } |
---|
618 | } |
---|
619 | } |
---|
620 | if($param_hash{'exportMode'} ne "singleLite"){ |
---|
621 | close(myoutfile); |
---|
622 | } |
---|
623 | }#end doExport |
---|
624 | |
---|