#!/bin/sh
# $Id $

# Nicklas Nordborg, 2020
#
# Finds information about a sequencing run given the barcode of a flow cell.
# It is expected that the sequencing is done with a NovaSeq sequencer.
#
# Usage:
#   ./novaseq_status.sh <barcode> <run-archive-root-1> [<run-archive-root-2> ...]
#
# The output is a number of key-value pairs. Not all values may be present.
#
# RunArchive: The path to the data folder for the flow cell
# Config: Date and time the 'Config' folder was last modified
# RunParameters: Date and time the 'RunParameters.xml' file was last modified
# Read1: Value from <Read1NumberOfCycles> tag in RunParameters.xml
# Read2: Value from <Read2NumberOfCycles> tag in RunParameters.xml
# Index1Read: Value from <IndexRead1NumberOfCycles> tag in RunParameters.xml
# Index2Read: Value from <IndexRead2NumberOfCycles> tag in RunParameters.xml
# NovaSeqSerial: Value from <InstrumentName> tag in RunParameters.xml
# CbclCount: Number of files ending with '.cbcl'
# LaneCount: Value from the LaneCount attribute in FlowcellLayout tag in RunInfo.xml
# SurfaceCount: Value from the SurfaceCount attribute in FlowcellLayout tag in RunInfo.xml
# SwathCount: Value from the SwathCount attribute in FlowcellLayout tag in RunInfo.xml
# TileCount: Value from the TileCount attribute in FlowcellLayout tag in RunInfo.xml
# RTAComplete: Date and time the 'RTAComplete.txt' was last modified
#
# Exit status:
#   0  information was printed; this includes the case where no data folder
#      exists yet, in which case only an empty 'RunArchive:' line is printed
#   1  usage error, or more than one folder matches the barcode
#
# The script only uses POSIX shell syntax so that it works when /bin/sh is
# not bash. It still relies on common find/grep/date extensions
# (-maxdepth, -iname, grep -o, date -r).

# Format string for file dates/times
DATE_FORMAT="%Y%m%d %H%M%S"

# Prints the last-modified time of the file or folder $1 using DATE_FORMAT.
file_date() {
  date -r "$1" +"${DATE_FORMAT}"
}

# Prints the text between <$2> and the following '<' for every line of the
# XML file $1 that contains the opening tag <$2>.
xml_tag_value() {
  grep "<$2>" "$1" | cut -d '>' -f 2 | cut -d '<' -f 1
}

# Prints the value of every $2="..." attribute found in the XML file $1.
xml_attr_value() {
  grep -o "$2=\"[^\"]*\"" "$1" | cut -d '"' -f 2
}

# Entry point.
#   $1   barcode of the flow cell
#   $2.. one or more run-archive root folders to search
# Writes the key-value pairs to stdout and diagnostics to stderr.
# Returns 0 on success and 1 on usage error or ambiguous barcode.
main() {
  # An empty barcode would turn the search pattern into '**' and match
  # every folder, so it is rejected together with a missing root.
  if [ $# -lt 2 ] || [ -z "$1" ]; then
    echo "Usage: novaseq_status.sh <barcode> <run-archive-root-1> [<run-archive-root-2> ...]" 1>&2
    return 1
  fi

  BARCODE=$1
  shift

  # Try to find a folder inside run-archive that has the barcode in the name
  # The folder may not yet exist so a missing folder is not an error
  # "$@" keeps every root as a single argument even if it contains spaces
  DATA_FOLDER=$(find "$@" -maxdepth 2 -iname "*${BARCODE}*" -type d -print 2> /dev/null || true)

  # Nothing found (yet): report an empty RunArchive and stop here so that
  # the checks below are not made against paths in the filesystem root
  if [ -z "${DATA_FOLDER}" ]; then
    echo "RunArchive:"
    return 0
  fi

  # Fail if more than one folder is found (find prints one folder per line)
  FOLDER_COUNT=$(printf '%s\n' "${DATA_FOLDER}" | grep -c .)
  if [ "${FOLDER_COUNT}" -ne 1 ]; then
    echo "Found ${FOLDER_COUNT} data folders for flow cell ${BARCODE}" 1>&2
    printf '%s\n' "${DATA_FOLDER}" 1>&2
    return 1
  fi

  printf 'RunArchive: %s\n' "${DATA_FOLDER}"

  # Config folder is created immediately when starting the NovaSeq
  # We use the date of this folder to set the start date of the job
  if [ -d "${DATA_FOLDER}/Config" ]; then
    echo "Config: $(file_date "${DATA_FOLDER}/Config")"
  fi

  # RunParameters.xml is created after clustering
  # We extract information about number of reads and lanes
  # and compare that to the number of *.cbcl files we can find
  # This gives an estimate of the current sequencing cycle and we can
  # use this for progress reporting
  RUN_PARAMETERS=${DATA_FOLDER}/RunParameters.xml
  if [ -f "${RUN_PARAMETERS}" ]; then
    echo "RunParameters: $(file_date "${RUN_PARAMETERS}")"
    echo "Read1: $(xml_tag_value "${RUN_PARAMETERS}" Read1NumberOfCycles)"
    echo "Read2: $(xml_tag_value "${RUN_PARAMETERS}" Read2NumberOfCycles)"
    echo "Index1Read: $(xml_tag_value "${RUN_PARAMETERS}" IndexRead1NumberOfCycles)"
    echo "Index2Read: $(xml_tag_value "${RUN_PARAMETERS}" IndexRead2NumberOfCycles)"
    echo "NovaSeqSerial: $(xml_tag_value "${RUN_PARAMETERS}" InstrumentName)"
  fi

  # Count number of BCL files which gives us information about
  # the progress of the sequencing
  # The pattern is quoted so the shell does not expand it against *.cbcl
  # files that happen to exist in the current directory
  CBCL_FOLDER=${DATA_FOLDER}/Data/Intensities/BaseCalls
  if [ -d "${CBCL_FOLDER}" ]; then
    echo "CbclCount: $(find "${CBCL_FOLDER}" -type f -name '*.cbcl' | wc -l)"
  fi

  # RunInfo.xml contains information about the layout of the flowcell
  # which we need to be able to compare the number of *.cbcl files
  RUN_INFO=${DATA_FOLDER}/RunInfo.xml
  if [ -f "${RUN_INFO}" ]; then
    echo "LaneCount: $(xml_attr_value "${RUN_INFO}" LaneCount)"
    echo "SurfaceCount: $(xml_attr_value "${RUN_INFO}" SurfaceCount)"
    echo "SwathCount: $(xml_attr_value "${RUN_INFO}" SwathCount)"
    echo "TileCount: $(xml_attr_value "${RUN_INFO}" TileCount)"
  fi

  # RTAComplete.txt is created when everything is complete
  # This becomes the end date of the job and should trigger
  # Reggie to start file checks and secondary analysis
  if [ -f "${DATA_FOLDER}/RTAComplete.txt" ]; then
    echo "RTAComplete: $(file_date "${DATA_FOLDER}/RTAComplete.txt")"
  fi

  return 0
}

# The return status of main becomes the exit status of the script
main "$@"