[galaxy-commits] galaxy-dist commit 1dc82a345db2: Updated indel analysis tools: fixed counting bugs in indel_analysis and improved help section; standardized CIGAR regex across tools; updated test files for several tools

commits-noreply at bitbucket.org commits-noreply at bitbucket.org
Fri Jul 23 13:42:44 EDT 2010


# HG changeset patch -- Bitbucket.org
# Project galaxy-dist
# URL http://bitbucket.org/galaxy/galaxy-dist/overview
# User Kelly Vincent <kpvincent at bx.psu.edu>
# Date 1279743946 14400
# Node ID 1dc82a345db24860b31b73624030fa2644c18d38
# Parent  95491a8e9e1bca9267ec5830ac5df2a6d30ed915
Updated indel analysis tools: fixed counting bugs in indel_analysis and improved help section; standardized CIGAR regex across tools; updated test files for several tools

--- a/test-data/indel_analysis_out1.interval
+++ b/test-data/indel_analysis_out1.interval
@@ -1,4 +1,7 @@
-ref	10	11	1	1	9.09
-ref	12	14	2	1	7.69
-ref	13	14	1	1	7.69
-ref	18	20	2	1	8.33
+chr1	11	13	1	100.00
+chr1	21	22	1	25.00
+chr1	21	23	1	25.00
+chrM	16	18	1	9.09
+chrM	19	20	1	8.33
+chrM	21	22	1	9.09
+chrM	22	23	1	9.09

--- a/tools/indels/indel_sam2interval.xml
+++ b/tools/indels/indel_sam2interval.xml
@@ -1,5 +1,5 @@
-<tool id="indel_sam2interval" name="Convert SAM to interval/BED" version="1.0.0">
-  <description>for indels</description>
+<tool id="indel_sam2interval" name="Extract indels" version="1.0.0">
+  <description>from SAM</description><command interpreter="python">
     indel_sam2interval.py
       --input=$input1
@@ -30,7 +30,7 @@
       <when value="false" /></conditional><conditional name="del_out">
-      <param name="include_del_out" type="select" label="Include insertions output bed file?">
+      <param name="include_del_out" type="select" label="Include deletions output bed file?"><option value="true">Yes</option><option value="false">No</option></param>
@@ -41,10 +41,10 @@
   <outputs><data format="interval" name="output1" /><data format="bed" name="output2">
-      <filter>ins_out[ "include_ins_out" ] = "true"</filter>
+      <filter>ins_out[ "include_ins_out" ] == "true"</filter></data><data format="bed" name="output3">
-      <filter>del_out[ "include_del_out" ] = "true"</filter>
+      <filter>del_out[ "include_del_out" ] == "true"</filter></data></outputs><tests>
@@ -69,48 +69,66 @@ Given a SAM file containing indels, conv
 
 **Example**
 
-Suppose you have the following::
+Suppose you have the following mapping results::
 
- r770    89  ref        116   37  17M1I5M          =   72131356   0   CACACTGTGACAGACAGCGCAGC   00/02!!0//1200210AA44/1  XT:A:U  CM:i:2  SM:i:37  AM:i:0       X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
- r780   181  ref       4567    0      24M          =   72131356   0  TTGGTGCGCGCGGTTGAGGGTTGG  $$(#%%#$%#%####$%%##$###  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
- r1231   69    *          0    0        *          *           0  0  AGACCGGGCGGGGTGGCGTTCGGT  %##+'#######%###$#$##$(#
- r1563  133    *          0    0        *          *           0  0  GTTCGTGGCCGGTGGGTGTTTGGG  ###$$#$#$&amp;#####$'$#$###$
- r1945  177  ref      71908    0      23M  190342418  181247988   0   AGAGAGAGAGAGAGAGAGAGAGA   SQQWZYURVYWX]]YXTSY]]ZM  XT:A:R  CM:i:0  SM:i:0   AM:i:0  X0:i:163148            XM:i:0  XO:i:0  XG:i:0  MD:Z:23
- r3671  117  ref   31903418    0      24M          =  190342418   0  CTGGCGTTCTCGGCGTGGATGGGT  #####$$##$#%#%%###%$#$##  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
- r3673  153  ref   48819768   37  16M1I6M          =  190342418   0   TCTAACTTAGCCTCATAATAGCT   /&lt;&lt;!"0///////00/!!0121/  XT:A:U  CM:i:2  SM:i:37  AM:i:0       X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
- r3824  117  ref   80729921    0      24M          =   80324999   0  TCCAGTCGCGTTGTTAGGTTCGGA  #$#$$$#####%##%%###**#+/  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
- r3911  153  ref   87824718   37  8M1I14M          =   80324999   0   TTTAGCCCGAAATGCCTAGAGCA   4;6//11!"11100110////00  XT:A:U  CM:i:2  SM:i:37  AM:i:0       X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
- r4795   81  ref  126739130    0      23M   57401793   57401793   0   TGGCATTCCTGTAGGCAGAGAGG   AZWWZS]!"QNXZ]VQ]]]/2]]  XT:A:R  CM:i:2  SM:i:0   AM:i:0       X0:i:3    X1:i:0  XM:i:2  XO:i:0  XG:i:0  MD:Z:23
- r4797  161  ref   57401793   37      23M   26739130   26739130   0   GATCACCCAGGTGATGTAACTCC   ]WV]]]]WW]]]]]]]]]]PU]]  XT:A:U  CM:i:0  SM:i:37  AM:i:0       X0:i:1    X1:i:0  XM:i:0  XO:i:0  XG:i:0  MD:Z:23
- r4800   16  ref        241  255  15M2D8M          =       1550   0   CGTGGCCGGCGGGCCGAAGGCAT   IIIIIIIIIICCCCIII?IIIII  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
- r5377  170  ref   52090793   37      23M   26739130   26739130   0   TATCAATAAGGTGATGTAACTCG   ]WV]ABAWW]]]]]P]P//GU]]  XT:A:U  CM:i:0  SM:i:37  AM:i:0       X0:i:1    X1:i:0  XM:i:0  XO:i:0  XG:i:0  MD:Z:23
- r5612  151  ref  190341152   37  19M1D4M          =  190342418   0  TCTAACTTAGCCTCATAATGCTAA   /&lt;&lt;!"0/4/*/7//00/BC0121/  XT:A:U  CM:i:2  SM:i:37  AM:i:0       X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
- r5623  151  ref  188841418   37  19M1I3M          =  190342418   0   TCTAACTTAGCCTCATAATAGCT   /&lt;&lt;!"0/4//7//00/BC0121/  XT:A:U  CM:i:2  SM:i:37  AM:i:0       X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
- r7899   69     *          0    0        *          *          0   0  CTGCGTGTTGGTGTCTACTGGGGT  #%#'##$#$##&amp;%#%$$$%#%#'#
- r9192  133     *          0    0        *          *          0   0  GTGCGTCGGGGAGGGTGCTGTCGG  ######%#$%#$$###($###&amp;&amp;%
+ r327     16   chrM   11   37      8M1D10M   *   0   0             CTTACCAGATAGTCATCA   -+&lt;2;?@BA@?-,.+4=4             XT:A:U  NM:i:1  X0:i:1  X1:i:0  XM:i:0  XO:i:1  XG:i:1  MD:Z:41^C35
+ r457      0   chr1   14   37          14M   *   0   0                 ACCTGACAGATATC   =/DF;?@1A@?-,.                 XT:A:U  NM:i:0  X0:i:1  X1:i:0  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r501     16   chrM    6   23      7M1I13M   *   0   0          TCTGTGCCTACCAGACATTCA   +=$2;?@BA@?-,.+4=4=4A          XT:A:U  NM:i:3  X0:i:1  X1:i:1  XM:i:2  XO:i:1  XG:i:1  MD:Z:28C36G9        XA:Z:chrM,+134263658,14M1I61M,4;
+ r1288    16   chrM    8   37      11M1I7M   *   0   0            TCACTTACCTGTACACACA   /*F2;?@%A@?-,.+4=4=            XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:2T0T1A69
+ r1902     0   chr1    4   37      7M2D18M   *   0   0        AGTCTCTTACCTGACGGTTATGA   &lt;2;?@BA@?-,.+4=4=4AA663        XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
+ r2204    16   chrM    9    0          19M   *   0   0            CTGGTACCTGACAGGTATC   2;?@BA@?-,.+4=4=4AA            XT:A:R  NM:i:1  X0:i:2  X1:i:0  XM:i:1  XO:i:0  XG:i:0  MD:Z:0T75           XA:Z:chrM,-564927,76M,1;
+ r2314    16   chrM    6   37      10M2D8M   *   0   0               TCACTCTTACGTCTGA   &lt;2;?@BA@?-,.+4=4               XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:25A5^CA45
+ r3001     0   chrM   13   37   3M1D5M2I7M   *   0   0              TACAGTCACCCTCATCA   &lt;2;?@BA/(@?-,$&amp;                XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
+ r3218     0   chr1   13   37       8M1D7M   *   0   0                TACAGTCACTCATCA   &lt;2;?@BA/(@?-,$&amp;                XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
+ r4767    16   chr2    3   37      15M2I7M   *   0   0       CAGACTCTCTTACCAAAGACAGAC   &lt;2;?@BA/(@?-,.+4=4=4AA66       XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:2T1A4T65
+ r5333     0   chrM    5   37      17M1D8M   *   0   0       GTCTCTCATACCAGACAACGGCAT   FB3$@BA/(@?-,.+4=4=4AA66       XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:45C10^C0C5C13
+ r6690    16   chrM    7   23          20M   *   0   0           CTCTCTTACCAGACAGACAT   2;?@BA/(@?-,.+4=4=4A           XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76             XA:Z:chrM,-568532,76M,1;
+ r7211     0   chrM    7   37          24M   *   0   0       CGACAGAGACAAAATAACATTTAA   //&lt;2;?@BA@?-,.+4=442;;6:       XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:2  XO:i:1  XG:i:1  MD:Z:73G0G0
+ r7899    69      *    0    0            *   *   0   0       CTGCGTGTTGGTGTCTACTGGGGT   #%#'##$#$##&amp;%#%$$$%#%#'#
+ r9192   133      *    0    0            *   *   0   0       GTGCGTCGGGGAGGGTGCTGTCGG   ######%#$%#$$###($###&amp;&amp;%
+ r9922    16   chrM    4    0       7M3I9M   *   0   0            CCAGACATTTGAAATCAGG   F/D4=44^D++26632;;6            XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r9987    16   chrM    4    0      9M1I18M   *   0   0   AGGTTCTCATTACCTGACACTCATCTTG   G/AD6"/+4=4426632;;6:&lt;2;?@BA   XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r10145   16   chr1   16    0       5M2D7M   *   0   0                   CACATTGTTGTA   G//+4=44=4AA                   XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r10324   16   chrM   15    0       6M1D5M   *   0   0                   CCGTTCTACTTG   A@??8.G//+4=                   XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r12331   16   chrM   17    0       4M2I6M   *   0   0                  AGTCGAATACGTG   632;;6:&lt;2;?@B                  XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r12914   16   chr2   24    0       4M3I3M   *   0   0                     ACTACCCCAA   G//+4=42,.                     XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r13452   16   chrM   13    0      3M1D11M   *   0   0                 TACGTCACTCATCA   IIIABCCCICCCCI                 XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
 
 
 The following three files will be produced (Interval, Insertions BED and Deletions BED)::
 
- ref         133         134   I   C   1
- ref         256         258   D   -   1
- ref    48819784    48819785   I   A   1
- ref    87824726    87824727   I   G   1
- ref   188841437   188841419   I   T   1
- ref   190341171   190341172   D   -   1
+ chr1   11   13   D     -   1
+ chr1   21   22   D     -   1
+ chr1   21   23   D     -   1
+ chr2   18   19   I    AA   1
+ chr2   28   29   I   CCC   1
+ chrM   11   12   I   TTT   1
+ chrM   13   14   I     C   1
+ chrM   13   14   I     T   1
+ chrM   16   17   D     -   1
+ chrM   16   18   D     -   1
+ chrM   19   20   D     -   1
+ chrM   19   20   I     T   1
+ chrM   21   22   D     -   1
+ chrM   21   22   I    GA   1
+ chrM   22   23   D     -   1
 
- ref         133         134
- ref    48819784    48819785
- ref    87824726    87824727
- ref   188841437   188841419
+ chr2   18   19
+ chr2   28   29
+ chrM   11   12
+ chrM   13   14
+ chrM   13   14
+ chrM   19   20
+ chrM   21   22
 
- ref         256         258
- ref   190341171   190341172
-
-
-
-
-
+ chr1   11   13
+ chr1   21   22
+ chr1   21   23
+ chrM   16   17
+ chrM   16   18
+ chrM   19   20
+ chrM   21   22
+ chrM   22   23
 
 For more information on SAM, please consult the `SAM format description`__.
 

--- a/tools/indels/indel_sam2interval.py
+++ b/tools/indels/indel_sam2interval.py
@@ -72,11 +72,11 @@ def __main__():
         output_bed_del = None
 
     # the pattern to match, assuming just one indel per cigar string
-    pat_indel = re.compile( '(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M' )
+    pat_indel = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' )
     pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' )
 
     # go through all lines in input file
-    out_data = []
+    out_data = {}
     multi_indel_lines = 0
     for line in open( options.input, 'rb' ):
         if line and not line.startswith( '#' ) and not line.startswith( '@' ) :
@@ -84,14 +84,14 @@ def __main__():
             if split_line < 12:
                 continue
             # grab relevant pieces
-            cigar = split_line[5]
+            cigar = split_line[5].strip()
             pos = int( split_line[3] )
             chr = split_line[2]
             base_string = split_line[9]
             # parse cigar string
-            m = pat_indel.search( cigar )
+            m = pat_indel.match( cigar )
             if not m:
-                m = pat_multi.search( cigar )
+                m = pat_multi.match( cigar )
                 # skip this line if no match
                 if not m:
                     continue
@@ -109,37 +109,43 @@ def __main__():
             start = left + pos
             if middle_type == 'D':
                 end = start + middle
-                d = [ chr, start, end, 'D' ]
+                data = [ chr, start, end, 'D' ]
                 if options.include_base == "true":
-                    d.append( '-' )
-                out_data.append( tuple( d ) )
-                if output_bed_del:
-                    output_bed_del.write( '%s\t%s\t%s\n' % ( chr, start, end ) )
+                    data.append( '-' )
             else:
-                end = start + 1#+ middle
-                d = [ chr, start, end, 'I' ]
+                end = start + 1
+                data = [ chr, start, end, 'I' ]
                 if options.include_base == "true":
-                    d.append( bases )
-                out_data.append( tuple( d ) )
-                if output_bed_ins:
-                    output_bed_ins.write( '%s\t%s\t%s\n' % ( chr, start, end ) )
+                    data.append( bases )
+            location = '\t'.join( [ '%s' % d for d in data ] )
+            try:
+                out_data[ location ] += 1
+            except KeyError:
+                out_data[ location ] = 1
     # output to interval file
-    if options.collapse == 'true':
-        out_dict = {}
-        # first collapse and get counts
-        for data in out_data:
-            location = ' '.join( [ '%s' % d for d in data ] )
-            try:
-                out_dict[ location ].append( data )
-            except KeyError:
-                out_dict[ location ] = [ data ]
-        locations = out_dict.keys()
-        locations.sort( numeric_sort )
-        for loc in locations:
-            output.write( '%s\t%s\n' % ( '\t'.join( [ '%s' % d for d in out_dict[ loc ][0] ] ), len( out_dict[ loc ] ) ) )
-    else:
-        for data in out_data:
-            output.write( '%s\n' % '\t'.join( [ '%s' % d for d in data ] ) )
+    # get all locations and sort
+    locations = out_data.keys()
+    locations.sort( numeric_sort )
+    last_line = ''
+    # output each location, either with counts or each occurrence
+    for loc in locations:
+        sp_loc = loc.split( '\t' )
+        cur_line = '\t'.join( sp_loc[:3] )
+        if options.collapse == 'true':
+            output.write( '%s\t%s\n' % ( loc, out_data[ loc ] ) )
+            if output_bed_del and sp_loc[3] == 'D':
+                output_bed_del.write( '%s\n' % cur_line )
+            if output_bed_ins and sp_loc[3] == 'I' and last_line != cur_line:
+                output_bed_ins.write( '%s\n' % cur_line )
+                last_line = cur_line
+        else:
+            for i in range( out_data[ loc ] ):
+                output.write( '%s\n' % loc )
+                if output_bed_del or output_bed_ins:
+                    if output_bed_del and sp_loc[3] == 'D':
+                        output_bed_del.write( '%s\n' % cur_line )
+                    if output_bed_ins and sp_loc[3] == 'I':
+                        output_bed_ins.write( '%s\n' % cur_line )
 
     # cleanup, close files
     if output_bed_ins:
@@ -150,6 +156,6 @@ def __main__():
 
     # if skipped lines because of more than one indel, output message
     if multi_indel_lines > 0:
-        sys.stdout.write( '%s alignments were skipped because they contained more than one indel or had unhandled operations (N/S/H/P).' % multi_indel_lines )
+        sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines )
 
 if __name__=="__main__": __main__()

--- a/test-data/indel_sam2interval_in1.sam
+++ b/test-data/indel_sam2interval_in1.sam
@@ -1,17 +1,24 @@
-r770	89	ref	116	37	17M1I5M	=	72131356	0	CACACTGTGACAGACAGCGCAGC	00/02!!0//1200210AA44/1	XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r780	181	ref	4567	0	24M	=	72131356	0	TTGGTGCGCGCGGTTGAGGGTTGG	$$(#%%#$%#%####$%%##$###	XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
+r327	16	chrM	11	37	8M1D10M	*	0	0	CTTACCAGATAGTCATCA	-+&lt;2;?@BA@?-,.+4=4	XT:A:U  NM:i:1  X0:i:1  X1:i:0  XM:i:0  XO:i:1  XG:i:1  MD:Z:41^C35
+r457	0	chr1	14	37	14M	*	0	0	ACCTGACAGATATC	=/DF;?@1A@?-,.	XT:A:U  NM:i:0  X0:i:1  X1:i:0  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+r501	16	chrM	6	23	7M1I13M	*	0	0	TCTGTGCCTACCAGACATTCA	+=$2;?@BA@?-,.+4=4=4A	XT:A:U  NM:i:3  X0:i:1  X1:i:1  XM:i:2  XO:i:1  XG:i:1  MD:Z:28C36G9        XA:Z:chrM,+134263658,14M1I61M,4;
 r1231	69	*	0	0	*	*	0	0	AGACCGGGCGGGGTGGCGTTCGGT	%##+'#######%###$#$##$(#
+r1288	16	chrM	8	37	11M1I7M	*	0	0	TCACTTACCTGTACACACA	/*F2;?@%A@?-,.+4=4=	XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:2T0T1A69
 r1563	133	*	0	0	*	*	0	0	GTTCGTGGCCGGTGGGTGTTTGGG	###$$#$#$&#####$'$#$###$
-r1945	177	ref	71908	0	23M	190342418	181247988	0	AGAGAGAGAGAGAGAGAGAGAGA	SQQWZYURVYWX]]YXTSY]]ZM	XT:A:R  CM:i:0  SM:i:0  AM:i:0  X0:i:163148  XM:i:0  XO:i:0  XG:i:0  MD:Z:23
-r3671	117	ref	31903418	0	24M	=	190342418	0  CTGGCGTTCTCGGCGTGGATGGGT	#####$$##$#%#%%###%$#$##
-r3673	153	ref	48819768	37	16M1I6M	=	190342418	0	TCTAACTTAGCCTCATAATAGCT	/&lt;&lt;!"0///////00/!!0121/	XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r3824	117	ref	80729921	0	24M	=	80324999	0  TCCAGTCGCGTTGTTAGGTTCGGA	#$#$$$#####%##%%###**#+/
-r3911	153	ref	87824718	37	8M1I14M	=	80324999	0	TTTAGCCCGAAATGCCTAGAGCA	4;6//11!"11100110////00	XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r4795	81	ref	126739130	0	23M	57401793	57401793	0	TGGCATTCCTGTAGGCAGAGAGG	AZWWZS]!"QNXZ]VQ]]]/2]]	XT:A:R  CM:i:2  SM:i:0  AM:i:0  X0:i:3  X1:i:0  XM:i:2  XO:i:0  XG:i:0  MD:Z:23
-r4797	161	ref	57401793	37	23M	26739130	26739130	0	GATCACCCAGGTGATGTAACTCC	]WV]]]]WW]]]]]]]]]]PU]]	XT:A:U  CM:i:0  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:0  XO:i:0  XG:i:0  MD:Z:23
-r4800	16	ref	241	255	15M2D8M	=	1550	0	CGTGGCCGGCGGGCCGAAGGCAT	IIIIIIIIIICCCCIII?IIIII
-r5377	170	ref	52090793	37	23M	26739130	26739130	0	TATCAATAAGGTGATGTAACTCG	]WV]ABAWW]]]]]P]P//GU]]	XT:A:U  CM:i:0  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:0  XO:i:0  XG:i:0  MD:Z:23
-r5612	151	ref	190341152	37	19M1D4M	=	190342418	0  TCTAACTTAGCCTCATAATGCTAA	/&lt;&lt;!"0/4/*/7//00/BC0121/	XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r5623	151	ref	188841418	37	19M1I3M	=	190342418	0	TCTAACTTAGCCTCATAATAGCT	/&lt;&lt;!"0/4//7//00/BC0121/	XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
+r1902	0	chr1	4	37	7M2D18M	*	0	0	AGTCTCTTACCTGACGGTTATGA	&lt;2;?@BA@?-,.+4=4=4AA663	XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
+r2204	16	chrM	9	0	19M	*	0	0	CTGGTACCTGACAGGTATC	2;?@BA@?-,.+4=4=4AA	XT:A:R  NM:i:1  X0:i:2  X1:i:0  XM:i:1  XO:i:0  XG:i:0  MD:Z:0T75           XA:Z:chrM,-564927,76M,1;
+r2314	16	chrM	6	37	10M2D8M	*	0	0	TCACTCTTACGTCTGA	&lt;2;?@BA@?-,.+4=4	XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:25A5^CA45
+r3001	0	chrM	13	37	3M1D5M2I7M	*	0	0	TACAGTCACCCTCATCA	&lt;2;?@BA/(@?-,$&	XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
+r3218	0	chr1	13	37	8M1D7M	*	0	0	TACAGTCACTCATCA	&lt;2;?@BA/(@?-,$&	XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
+r4767	16	chr2	3	37	15M2I7M	*	0	0	CAGACTCTCTTACCAAAGACAGAC	&lt;2;?@BA/(@?-,.+4=4=4AA66	XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:2T1A4T65
+r5333	0	chrM	5	37	17M1D8M	*	0	0	GTCTCTCATACCAGACAACGGCAT	FB3$@BA/(@?-,.+4=4=4AA66	XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:45C10^C0C5C13
+r6690	16	chrM	7	23	20M	*	0	0	CTCTCTTACCAGACAGACAT	2;?@BA/(@?-,.+4=4=4A	XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76             XA:Z:chrM,-568532,76M,1;
+r7211	0	chrM	7	37	24M	*	0	0	CGACAGAGACAAAATAACATTTAA	//&lt;2;?@BA@?-,.+4=442;;6:	XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:2  XO:i:1  XG:i:1  MD:Z:73G0G0
 r7899	69	*	0	0	*	*	0	0	CTGCGTGTTGGTGTCTACTGGGGT	#%#'##$#$##&%#%$$$%#%#'#
 r9192	133	*	0	0	*	*	0	0	GTGCGTCGGGGAGGGTGCTGTCGG	######%#$%#$$###($###&&%
+r9922	16	chrM	4	0	7M3I9M	*	0	0	CCAGACATTTGAAATCAGG	F/D4=44^D++26632;;6	XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+r9987	16	chrM	4	0	9M1I18M	*	0	0	AGGTTCTCATTACCTGACACTCATCTTG	G/AD6"/+4=4426632;;6:&lt;2;?@BA	XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+r10145	16	chr1	16	0	5M2D7M	*	0	0	CACATTGTTGTA	G//+4=44=4AA	XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+r10324	16	chrM	15	0	6M1D5M	*	0	0	CCGTTCTACTTG	A@??8.G//+4=	XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+r12331	16	chrM	17	0	4M2I6M	*	0	0	AGTCGAATACGTG	632;;6:&lt;2;?@B	XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+r12914	16	chr2	24	0	4M3I3M	*	0	0	ACTACCCCAA	G//+4=42,.	XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+r13452	16	chrM	13	0	3M2D11M	*	0	0	TACTCACTCATCAG	IIIABCCCICCCCI	XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76

--- a/tools/indels/indel_analysis.xml
+++ b/tools/indels/indel_analysis.xml
@@ -24,7 +24,7 @@
     </test><test><param name="input1" value="indel_analysis_in2.sam" ftype="sam"/>
-      <param name="threshold" value="0.08"/>
+      <param name="threshold" value="0.09"/><output name="out_del" file="indel_analysis_out3.interval" ftype="interval"/><output name="out_ins" file="indel_analysis_out4.interval" ftype="interval"/></test>
@@ -35,75 +35,87 @@
 
 Given an input sam file, this tool provides analysis of the indels. It filters out matches that do not meet the frequency threshold. The way this frequency of occurence is calculated is different for deletions and insertions. The CIGAR string's "M" can indicate an exact match or a mismatch. For SAM containing the following bits of information (assuming the reference "ACTGCTCGAT")::
 
- CHROM  POS  CIGAR       SEQ
-   ref    3  2M1I3M   TATCTC
-   ref    2  3M1D2M    ATGTC
-   ref    4  2M2I3M  GTTGAAG
-   ref    1  2M2D2M     ACCT
-   ref    2  3M1I2M   TCCATC
-   ref    7      4M     CTAT
-   ref    5      5M    CGTGA
+ CHROM  POS   CIGAR  SEQ
+   ref    3  2M1I3M  TACTTC
+   ref    1  2M1D3M  ACGCT
+   ref    4  4M2I3M  GTTCAAGAT
+   ref    2  2M2D3M  CTCCG
+   ref    1  3M1D4M  AACCTGG
+   ref    6  3M1I2M  TTCAAT
+   ref    5  3M1I3M  CTCTGTT
+   ref    7      4M  CTAT
+   ref    5      5M  CGCTA
+   ref    3  2M1D2M  TGCC
 
-The following totals would be calculated::
+The following totals would be calculated (this is an intermediate step and not output)::
 
- Counts for chromosome "ref", where "-" indicates a deletion and
- "+" an insertion
- ----------------------------------------------------------------
-  POS  BASE  NUMREADS  DELPROPCALC  DELPROP  INSPROPCALC  INSPROP
-    1     A         1          1/1     1.00           --       --
-    2     A         1          1/2     0.50           --       --
-          C         1          1/2     0.50           --       --
-    3     T         3          3/4     0.75           --       --
-         --         1          1/4     0.25           --       --
-    4     A         1          1/5     0.20           --       --
-          G         2          2/5     0.40           --       --
-          C         1          1/5     0.20           --       --
-          -         1          1/5     0.20           --       --
-    5     C         4          4/5     0.80           --       --
-          T         1          1/5     0.20           --       --
-         +A         1          ---      ---          1/6     0.17
-         +T         1          ---      ---          1/6     0.17
-    6     A         1          1/6     0.17           --       --
-          C         1          1/6     0.17           --       --
-          T         4          4/6     0.67           --       --
-        +TG         1          ---      ---          1/7     0.14
-    7     A         1          1/6     0.17           --       --
-          C         3          3/6     0.50           --       --
-          G         1          1/6     0.17           --       --
-          T         1          1/6     0.17           --       --
-    8     G         2          2/3     0.67           --       --
-          T         1          1/3     0.33           --       --
-    9     A         2          2/2     1.00           --       --
-   10     T         1          1/1     1.00           --       --
+ -------------------------------------------------------------------------------------------------------
+  POS  BASE  NUMREADS  DELPROPCALC  DELPROP  INSPROPSTARTCALC  INSSTARTPROP  INSPROPENDCALC  INSENDPROP
+ -------------------------------------------------------------------------------------------------------
+    1     A         2          2/2     1.00               ---           ---             ---         ---
+    2     A         1          1/3     0.33               ---           ---             ---         ---
+          C         2          2/3     0.67               ---           ---             ---         ---
+    3     C         1          1/5     0.20               ---           ---             ---         ---
+          T         3          3/5     0.60               ---           ---             ---         ---
+          -         1          1/5     0.20               ---           ---             ---         ---
+    4     A         1          1/6     0.17               ---           ---             ---         ---
+          G         3          3/6     0.50               ---           ---             ---         ---
+          -         1          1/6     0.17               ---           ---             ---         ---
+         --         1          1/6     0.17               ---           ---             ---         ---
+    5     C         4          4/7     0.57               ---           ---             ---         ---
+          T         2          2/7     0.29               ---           ---             ---         ---
+          -         1          1/7     0.14               ---           ---             ---         ---
+         +C         1          ---      ---               1/7          0.14             1/9        0.11
+    6     C         2          2/9     0.22               ---           ---             ---         ---
+          G         1          1/9     0.11               ---           ---             ---         ---
+          T         6          6/9     0.67               ---           ---             ---         ---
+    7     C         7          7/9     0.78               ---           ---             ---         ---
+          G         1          1/9     0.11               ---           ---             ---         ---
+          T         1          1/9     0.11               ---           ---             ---         ---
+    8     C         1          1/7     0.14               ---           ---             ---         ---
+          G         4          4/7     0.57               ---           ---             ---         ---
+          T         2          2/7     0.29               ---           ---             ---         ---
+         +T         1          ---      ---               1/8          0.13             1/6        0.17
+        +AA         1          ---      ---               1/8          0.13             1/6        0.17
+    9     A         4          4/5     0.80               ---           ---             ---         ---
+          T         1          1/5     0.20               ---           ---             ---         ---
+         +A         1          ---      ---               1/6          0.17             1/5        0.20
+   10     T         4          4/4     1.00               ---           ---             ---         ---
 
-Note that the way these are calculated may not be immediately clear. First, the basic total number of reads at a given position is the number of reads with a particular base plus the number of reads with that a deletion at that given position. Note that deletions of two bases and one base would be counted separately. Insertions are not counted in this total, which is used to calculate the proportion of occurrences of each individual base and deletion. For position 4 above, the reference base is G, and there are 2 occurrences of it along with one each of mismatching bases A and C. Also, there is on 1-base deletion. So there are a total of 5 matches/mismatches/deletions, and the proportions for each base are either 1/5 = 0.20 or 2/5 = 0.40, and for the deletion it is 1/5 = 0.20. Insertions are slightly more complicated. Each insertion is regarded individually, and the total number of occurrences of that insertion is divided by the sum of the number of its occurrences and the b
 asic total. So for position 5, there are a total of 5 matches/mismatches/deletions, and two insertions that each occur once, so each has a insertion has a proportion of 1/6 = 0.17.
+The general idea for calculating these is that we want to find out the proportion of times a particular event occurred at a position among all reads that touch that base in some way. First, the basic total number of reads at a given position is the number of reads with each particular base plus the number of reads with that a deletion at that given position (including the bases that are "mismatches"). Note that deletions of two bases and one base would be counted completely separately. Insertions are not counted in this total. For position 4 above, the reference base is G, and there are 3 occurrences of it along with one mismatching base, A. Also, there is a 1-base deletion and another 2-base deletion. So there are a total of 5 matches/mismatches/deletions, and the proportions for each base are 1/6 = 0.17 (A) and 3/6 = 0.50 (G), and for each deletion it is 1/6 = 0.17.
 
-The DELPROP or INSPROP needs to be greater than the threshold frequency specified by the user.
+Insertions are slightly more complicated. We actually want to get the frequency of occurrence for both the associated start and end positions, since an insertion appears between those two bases. Each insertion is regarded individually, and the total number of occurrences of that insertion is divided by the sum of the number of its occurrences and the basic total for either the start or end. So for the insertions at position 8, there are a total of 7 matches/mismatches/deletions at position 8, and two insertions that each occur once, so each has an INSSTARTPROP of 1/8 = 0.13. For the end position there are 5 matches/mismatches/deletions, so the INSENDPROP is 1/6 = 0.17 for both insertions (T and AA).
 
-The output varies for deletions and insertions, though for both, the first three columns are chromosome, start position, and end position.
+These proportions (DELPROP and either INSSTARTPROP or INSENDPROP) need to be greater than the threshold frequency specified by the user in order for that base, deletion or insertion to be included in the output.
+
+
+** Output format **
+
+The output varies for deletions and insertions, although for both, the first three columns are chromosome, start position, and end position.
 
 Columns in the deletions file::
 
-                       Column  Description
- ----------------------------  ------------------------------------------------------------------------------------
-  1                     Chrom  Chromosome
-  2                     Start  Starting position
-  3                       End  Ending position
-  4 Number of Deleted Base(s)  The number of bases deleted at Start position
-  5      Frequency Percentage  Frequency of this exact deletion (2 and 1 are mutually exclusive), as percentage (%)
+                        Column  Description
+ -----------------------------  ---------------------------------------------------------------------------------------------------
+  1                      Chrom  Chromosome
+  2                      Start  Starting position
+  3                        End  Ending position
+  4                   Coverage  Number of reads containing this exact deletion
+  5       Frequency Percentage  Frequency of this exact deletion (2 and 1 are mutually exclusive, for instance), as percentage (%)
 
 Columns in the insertions file::
 
-                  Column  Description
- --------------------------  -------------------------------------------------------------------------------------------------------------
-  1                Chrom  Chromosome
-  2                Start  Starting position
-  3                  End  Ending position (always Start + 1 for insertions)
-  4     Inserted Base(s)  The exact base(s) inserted at Start position
-  5 Freq. Perc. at Start  Frequency of this exact insertion given Start position ("GG" and "G" are considered distinct), as percentage (%)
-  6   Freq. Perc. at End  Frequency of this exact insertion given End position ("GG" and "G" are considered distinct), as percentage (%)
+                   Column  Description
+ ------------------------  -----------------------------------------------------------------------------------------------------------------
+  1                 Chrom  Chromosome
+  2                 Start  Starting position
+  3                   End  Ending position (always Start + 1 for insertions)
+  4      Inserted Base(s)  The exact base(s) inserted at Start position
+  5              Coverage  Number of reads containing this exact insertion
+  6  Freq. Perc. at Start  Frequency of this exact insertion given Start position ("GG" and "G" are considered distinct), as percentage (%)
+  7    Freq. Perc. at End  Frequency of this exact insertion given End position ("GG" and "G" are considered distinct), as percentage (%)
 
-Before using this tool, you probably will want to use the Filter SAM for indels tool to filter out indels on bases with insufficient quality scores, but this is not required.
+Before using this tool, you may will want to use the Filter SAM for indels tool to filter out indels on bases with insufficient quality scores, but this is not required.
 
 
 -----
@@ -112,38 +124,43 @@ Before using this tool, you probably wil
 
 If you set the threshold to 0.0 and have the following SAM file::
 
- r770     89  ref   6   37    7M1I5M  =  0   0              TGGATCTTCATAG              !0//110AA44/1  XT:A:U  CM:i:2  SM:i:37  AM:i:0       X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
- r1124   113  ref   4    0       23M  =  0   0    CATCGTTCTGTTAGATCTACGTA    PQRVUMNXYRPUXYXWXSOSZ]M  XT:A:R  CM:i:0   SM:i:0  AM:i:0  X0:i:163148            XM:i:0  XO:i:0  XG:i:0  MD:Z:23
- r1789   177  ref   6    0       17M  =  0   0          TCGATCGCTTAGTTCTC          SQQWZY]]YXTSY]]ZM  XT:A:R  CM:i:0   SM:i:0  AM:i:0  X0:i:163148            XM:i:0  XO:i:0  XG:i:0  MD:Z:23
- r3671   153  ref  10   37    6M1I6M  =  0   0              TCTCTTTAGGTCT              /&lt;&lt;!"0///////  XT:A:U  CM:i:2  SM:i:37  AM:i:0       X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
- r3824   153  ref   5   37    8M1I7M  =  0   0           ATTGATGTTCTTAGAT           4;6//11!"100110/  XT:A:U  CM:i:2  SM:i:37  AM:i:0       X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
- r4800    16  ref   7  255    5M2D6M  =  0   0                CGATCTTTGAT                IIIIIIIIIIC  XT:A:U  CM:i:2  SM:i:37  AM:i:0       X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
- r5612   151  ref   5   37    8M1D9M  =  0   0          ATCTATCTTTTGATCTC          /&lt;&lt;!"0/4/*/7//B0/  XT:A:U  CM:i:2  SM:i:37  AM:i:0       X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
- r5612   151  ref  11   37   3M1I10M  =  0   0             CTCCTTAGCTCTCC             /&lt;&lt;!"0/4//7//0  XT:A:U  CM:i:2  SM:i:37  AM:i:0       X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
- r9145   115  ref  11    0       19M  =  0  -1       CTCTTAGCTCTCCGAATTAG        7753:&lt;5#"4!&amp;=9518A&gt;  XT:A:R  CM:i:2   SM:i:0  AM:i:0       X0:i:4  X1:i:137  XM:i:2  XO:i:0  XG:i:0  MD:Z:23
- r11770   89  ref  10   37   10M2I8M  =  0   0       TCTCTTAGATGGCTCCGTAT       00/02!!0/120210AA4/1  XT:A:U  CM:i:2  SM:i:37  AM:i:0       X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
- r13671  153  ref   1   37  12M1I12M  =  0   0  TCGCATCGATCTCCGTAGATCTCCG  /&lt;""&lt;!"0///////00/!!0121/  XT:A:U  CM:i:2  SM:i:37  AM:i:0       X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
- r13824  153  ref  13   37    9M1I7M  =  0   0          CATAGATCTACCGGATT          4;6//11!"11100110  XT:A:U  CM:i:2  SM:i:37  AM:i:0       X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
- r24800   16  ref   3  255   15M2D9M  =  0   0   GCATCTATCTGATAGCTCCGAATT   IIIIIIIII45"CCCIII?IIIII  XT:A:U  CM:i:2  SM:i:37  AM:i:0       X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
- r25612  151  ref   1   37    9M1D5M  =  0   0             TCGCATCGACTCTT             0/4/*/7//00/1C  XT:A:U  CM:i:2  SM:i:37  AM:i:0       X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
- r25612  151  ref  21   37    4M1I7M  =  0   0               TGCGTTATTGGG               &lt;!"0/70/BC01  XT:A:U  CM:i:2  SM:i:37  AM:i:0       X0:i:1    X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
- r29962   16  ref  20   37    4M1I7M  =  0   0              CTCCGGTATGAGG              &lt;!"0/70/7BC01  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
+ r327     16   chrM   11   37      8M1D10M   *   0   0             CTTACCAGATAGTCATCA   -+&lt;2;?@BA@?-,.+4=4             XT:A:U  NM:i:1  X0:i:1  X1:i:0  XM:i:0  XO:i:1  XG:i:1  MD:Z:41^C35
+ r457      0   chr1   14   37          14M   *   0   0                 ACCTGACAGATATC   =/DF;?@1A@?-,.                 XT:A:U  NM:i:0  X0:i:1  X1:i:0  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r501     16   chrM    6   23      7M1I13M   *   0   0          TCTGTGCCTACCAGACATTCA   +=$2;?@BA@?-,.+4=4=4A          XT:A:U  NM:i:3  X0:i:1  X1:i:1  XM:i:2  XO:i:1  XG:i:1  MD:Z:28C36G9        XA:Z:chrM,+134263658,14M1I61M,4;
+ r1288    16   chrM    8   37      11M1I7M   *   0   0            TCACTTACCTGTACACACA   /*F2;?@%A@?-,.+4=4=            XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:2T0T1A69
+ r1902     0   chr1    4   37      7M2D18M   *   0   0        AGTCTCTTACCTGACGGTTATGA   &lt;2;?@BA@?-,.+4=4=4AA663        XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
+ r2204    16   chrM    9    0          19M   *   0   0            CTGGTACCTGACAGGTATC   2;?@BA@?-,.+4=4=4AA            XT:A:R  NM:i:1  X0:i:2  X1:i:0  XM:i:1  XO:i:0  XG:i:0  MD:Z:0T75           XA:Z:chrM,-564927,76M,1;
+ r2314    16   chrM    6   37      10M2D8M   *   0   0               TCACTCTTACGTCTGA   &lt;2;?@BA@?-,.+4=4               XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:25A5^CA45
+ r3001     0   chrM   13   37   3M1D5M2I7M   *   0   0              TACAGTCACCCTCATCA   &lt;2;?@BA/(@?-,$&amp;                XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
+ r3218     0   chr1   13   37       8M1D7M   *   0   0                TACAGTCACTCATCA   &lt;2;?@BA/(@?-,$&amp;                XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
+ r4767    16   chr2    3   37      15M2I7M   *   0   0       CAGACTCTCTTACCAAAGACAGAC   &lt;2;?@BA/(@?-,.+4=4=4AA66       XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:2T1A4T65
+ r5333     0   chrM    5   37      17M1D8M   *   0   0       GTCTCTCATACCAGACAACGGCAT   FB3$@BA/(@?-,.+4=4=4AA66       XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:45C10^C0C5C13
+ r6690    16   chrM    7   23          20M   *   0   0           CTCTCTTACCAGACAGACAT   2;?@BA/(@?-,.+4=4=4A           XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76             XA:Z:chrM,-568532,76M,1;
+ r7211     0   chrM    7   37          24M   *   0   0       CGACAGAGACAAAATAACATTTAA   //&lt;2;?@BA@?-,.+4=442;;6:       XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:2  XO:i:1  XG:i:1  MD:Z:73G0G0
+ r9922    16   chrM    4    0       7M3I9M   *   0   0            CCAGACATTTGAAATCAGG   F/D4=44^D++26632;;6            XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r9987    16   chrM    4    0      9M1I18M   *   0   0   AGGTTCTCATTACCTGACACTCATCTTG   G/AD6"/+4=4426632;;6:&lt;2;?@BA   XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r10145   16   chr1   16    0       5M2D7M   *   0   0                   CACATTGTTGTA   G//+4=44=4AA                   XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r10324   16   chrM   15    0       6M1D5M   *   0   0                   CCGTTCTACTTG   A@??8.G//+4=                   XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r12331   16   chrM   17    0       4M2I6M   *   0   0                  AGTCGAATACGTG   632;;6:&lt;2;?@B                  XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+ r12914   16   chr2   24    0       4M3I3M   *   0   0                     ACTACCCCAA   G//+4=42,.                     XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
 
 The following will be produced (deletions file followed by insertions file)::
 
- ref   10   11   1   1   9.09
- ref   12   14   2   1   7.69
- ref   13   14   1   1   7.69
- ref   18   20   2   1   8.33
+ chr1   11   13   1   100.00
+ chr1   21   22   1    25.00
+ chr1   21   23   1    25.00
+ chrM   16   18   1     9.09
+ chrM   19   20   1     8.33
+ chrM   21   22   1     9.09
+ chrM   22   23   1     9.09
 
- ref   13   14    C   1    6.25    6.67
- ref   13   14    T   2   12.50   13.33
- ref   14   15    C   1    6.67    7.14
- ref   16   17    T   1    7.14    7.69
- ref   20   21   GG   1    8.33    8.33
- ref   22   23    A   1    8.33   11.11
- ref   24   25    G   1   11.11   12.50
- ref   25   26    T   1   12.50   14.29
+ chr2   18   19    AA   1   50.00   50.00
+ chr2   28   29   CCC   1   50.00   50.00
+ chrM   11   12   TTT   1    9.09    9.09
+ chrM   13   14     C   1    9.09    9.09
+ chrM   13   14     T   1    9.09    9.09
+ chrM   19   20     T   1    7.69    8.33
+ chrM   21   22    GA   1    8.33    8.33
 
 
   </help>

--- a/test-data/indel_analysis_out3.interval
+++ b/test-data/indel_analysis_out3.interval
@@ -1,2 +1,6 @@
-ref	10	11	1	1	9.09
-ref	18	20	2	1	8.33
+chr1	11	13	1	100.00
+chr1	21	22	1	25.00
+chr1	21	23	1	25.00
+chrM	16	18	1	9.09
+chrM	21	22	1	9.09
+chrM	22	23	1	9.09

--- a/test-data/indel_analysis_out4.interval
+++ b/test-data/indel_analysis_out4.interval
@@ -1,5 +1,5 @@
-ref	13	14	T	2	13.33	14.29
-ref	20	21	GG	1	8.33	8.33
-ref	22	23	A	1	8.33	11.11
-ref	24	25	G	1	11.11	14.29
-ref	25	26	T	1	12.50	14.29
+chr2	18	19	AA	1	50.00	50.00
+chr2	28	29	CCC	1	50.00	50.00
+chrM	11	12	TTT	1	9.09	9.09
+chrM	13	14	C	1	9.09	9.09
+chrM	13	14	T	1	9.09	9.09

--- a/tools/indels/indel_analysis.py
+++ b/tools/indels/indel_analysis.py
@@ -10,7 +10,7 @@ usage: %prog [options] [input3 sum3[ inp
    -D, --out_del=D: The interval output file showing deletions
 """
 
-import re, sys
+import re, sets, sys
 from galaxy import eggs
 import pkg_resources; pkg_resources.require( "bx-python" )
 from bx.cookbook import doc_optparse
@@ -20,18 +20,18 @@ def stop_err( msg ):
     sys.stderr.write( '%s\n' % msg )
     sys.exit()
 
-def add_to_ref_pos( ref_pos, pos, bases ):
+def add_to_mis_matches( mis_matches, pos, bases ):
     """
-    Adds the bases and counts to the ref_pos dict
+    Adds the bases and counts to the mis_matches dict
     """
     for j, base in enumerate( bases ):
         try:
-            ref_pos[ pos + j ][ base ] += 1
+            mis_matches[ pos + j ][ base ] += 1
         except KeyError:
             try:
-                ref_pos[ pos + j ][ base ] = 1
+                mis_matches[ pos + j ][ base ] = 1
             except KeyError:
-                ref_pos[ pos + j ] = { base: 1 }
+                mis_matches[ pos + j ] = { base: 1 }
 
 def __main__():
     #Parse Command Line
@@ -39,17 +39,16 @@ def __main__():
     # prep output files
     out_ins = open( options.out_ins, 'wb' )
     out_del = open( options.out_del, 'wb' )
-    # pattern
-    pat = re.compile( '(^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$)|(^(?P<match_width>\d+)M$)' )
+    # patterns
+    pat = re.compile( '^((?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M)$|((?P<match_width>\d+)M)$' )
     pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' )
     # for tracking occurences at each pos of ref
-    ref_pos = {}
+    mis_matches = {}
     indels = {}
-    num_reads = {}
     multi_indel_lines = 0
     # go through all lines in input file
     for i,line in enumerate( open( options.input, 'rb' ) ):
-        if line and not line.startswith( '#' ) and not line.startswith( '@' ) :
+        if line.strip() and not line.startswith( '#' ) and not line.startswith( '@' ) :
             split_line = line.split( '\t' )
             chrom = split_line[2].strip()
             pos = int( split_line[3].strip() )
@@ -59,11 +58,11 @@ def __main__():
             if chrom == '*':
                 continue
             # find matches like 3M2D7M or 7M3I10M
-            matches = ''
-            m = pat.search( cigar )
+            match = {}
+            m = pat.match( cigar )
             # unprocessable CIGAR
             if not m:
-                m = pat_multi.search( cigar )
+                m = pat_multi.match( cigar )
                 # skip this line if no match
                 if not m:
                     continue
@@ -72,11 +71,9 @@ def __main__():
                     multi_indel_lines += 1
             # get matching parts for the indel or full match if matching
             else:
-                if not ref_pos.has_key( chrom ):
-                    ref_pos[ chrom ] = {}
+                if not mis_matches.has_key( chrom ):
+                    mis_matches[ chrom ] = {}
                     indels[ chrom ] = { 'D': {}, 'I': {} }
-                    if not num_reads.has_key( chrom ):
-                        num_reads[ chrom ] = {}
                 parts = m.groupdict()
                 if parts[ 'match_width' ] or ( parts[ 'lmatch' ] and parts[ 'ins_del_width' ] and parts[ 'rmatch' ] ):
                     match = parts
@@ -84,12 +81,7 @@ def __main__():
             if match:
                 # match/mismatch
                 if parts[ 'match_width' ]:
-                    add_to_ref_pos( ref_pos[ chrom ], pos, bases )
-                    for i, base in enumerate( bases ):
-                        try:
-                            num_reads[ chrom ][ i + pos ] += 1
-                        except KeyError:
-                            num_reads[ chrom ][ i + pos ] = 1
+                    add_to_mis_matches( mis_matches[ chrom ], pos, bases )
                 # indel
                 else:
                     # pieces of CIGAR string
@@ -104,86 +96,81 @@ def __main__():
                     right_bases = bases[ -right : ]
                     start = pos + left
                     # add data to ref_pos dict for match/mismatch bases on left and on right
-                    add_to_ref_pos( ref_pos[ chrom ], pos, left_bases )
-                    for i, base in enumerate( left_bases ):
-                        try:
-                            num_reads[ chrom ][ i + pos ] += 1
-                        except KeyError:
-                            num_reads[ chrom ][ i + pos ] = 1
+                    add_to_mis_matches( mis_matches[ chrom ], pos, left_bases )
                     if match[ 'ins_del' ] == 'I':
-                        add_to_ref_pos( ref_pos[ chrom ], start, right_bases )
-                        indel_pos = start
+                        add_to_mis_matches( mis_matches[ chrom ], start, right_bases )
                     else:
-                        add_to_ref_pos( ref_pos[ chrom ], start + middle, right_bases )
-                        indel_pos = start + middle
-                    for i, base in enumerate( right_bases ):
-                        try:
-                            num_reads[ chrom ][ i + indel_pos ] += 1
-                        except KeyError:
-                            num_reads[ chrom ][ i + indel_pos ] = 1
+                        add_to_mis_matches( mis_matches[ chrom ], start + middle, right_bases )
                     # for insertions, count instances of particular inserted bases
                     if match[ 'ins_del' ] == 'I':
                         if indels[ chrom ][ 'I' ].has_key( start ):
-                            if indels[ chrom ][ 'I' ][ start ].has_key( middle_bases ):
+                            try:
                                 indels[ chrom ][ 'I' ][ start ][ middle_bases ] += 1
-                            else:
+                            except KeyError:
                                 indels[ chrom ][ 'I' ][ start ][ middle_bases ] = 1
                         else:
                             indels[ chrom ][ 'I' ][ start ] = { middle_bases: 1 }
                     # for deletions, count number of deletions bases
                     else:
                         if indels[ chrom ][ 'D' ].has_key( start ):
-                            if indels[ chrom ][ 'D' ][ start ].has_key( middle ):
+                            try:
                                 indels[ chrom ][ 'D' ][ start ][ middle ] += 1
-                            else:
+                            except KeyError:
                                 indels[ chrom ][ 'D' ][ start ][ middle ] = 1
                         else:
                             indels[ chrom ][ 'D' ][ start ] = { middle: 1 }
     # compute deletion frequencies and insertion frequencies for checking against threshold
     freqs = {}
     ins_freqs = {}
-    chroms = ref_pos.keys()
+    chroms = mis_matches.keys()
     chroms.sort()
     for chrom in chroms:
         freqs[ chrom ] = {}
         ins_freqs[ chrom ] = {}
-        poses = num_reads[ chrom ].keys()
-        poses.sort()
+        poses = mis_matches[ chrom ].keys()
+        poses.extend( indels[ chrom ][ 'D' ].keys() )
+        poses.extend( indels[ chrom ][ 'I' ].keys() )
+        poses = list( sets.Set( poses ) )
         for pos in poses:
             # all reads touching this particular position
             freqs[ chrom ][ pos ] = {}
             sum_counts = 0.0
             sum_counts_end = 0.0
             # get basic counts (match/mismatch)
-            if num_reads[ chrom ].has_key( pos ):
-                sum_counts += float( num_reads[ chrom ][ pos ] )
-                try:
-                    sum_counts_end += float( num_reads[ chrom ][ pos + 1 ] )
-                except KeyError:
-                    pass
+            try:
+                sum_counts += float( sum( mis_matches[ chrom ][ pos ].values() ) )
+            except KeyError:
+                pass
+            try:
+                sum_counts_end += float( sum( mis_matches[ chrom ][ pos + 1 ].values() ) )
+            except KeyError:
+                pass
             # add deletions also touching this position
             try:
                 sum_counts += float( sum( indels[ chrom ][ 'D' ][ pos ].values() ) )
-                try:
-                    sum_counts_end += float( sum( indels[ chrom ][ 'D' ][ pos + 1 ].values() ) )
-                except KeyError:
-                    pass
-                for d in indels[ chrom ][ 'D' ][ pos ].keys():
-                    freqs[ chrom ][ pos ][ '-' * d ] = indels[ chrom ][ 'D' ][ pos ][ d ] / sum_counts
             except KeyError:
                 pass
+            try:
+                sum_counts_end += float( sum( indels[ chrom ][ 'D' ][ pos + 1 ].values() ) )
+            except KeyError:
+                pass
+            freqs[ chrom ][ pos ][ 'total' ] = sum_counts
             # calculate actual frequencies
             # deletions
-            freqs[ chrom ][ pos ][ 'total' ] = sum_counts
-            for base in ref_pos[ chrom ][ pos ].keys():
-                try:
-                    prop = float( ref_pos[ chrom ][ pos ][ base ] ) / sum_counts
-                    freqs[ chrom ][ pos ][ base ] = prop
-                except ZeroDivisionError:
-                    freqs[ chrom ][ pos ][ base ] = 0.0
+            # frequencies for deletions
             try:
                 for d in indels[ chrom ][ 'D' ][ pos ].keys():
-                    freqs[ chrom ][ pos ][ '-' * d ] = indels[ chrom ][ 'D' ][ pos ][ d ] / sum_counts
+                    freqs[ chrom ][ pos ][ d ] = indels[ chrom ][ 'D' ][ pos ][ d ] / sum_counts
+            except KeyError:
+                pass
+            # frequencies for matches/mismatches
+            try:
+                for base in mis_matches[ chrom ][ pos ].keys():
+                    try:
+                        prop = float( mis_matches[ chrom ][ pos ][ base ] ) / sum_counts
+                        freqs[ chrom ][ pos ][ base ] = prop
+                    except ZeroDivisionError:
+                        freqs[ chrom ][ pos ][ base ] = 0.0
             except KeyError:
                 pass
             # insertions
@@ -191,7 +178,7 @@ def __main__():
                 for bases in indels[ chrom ][ 'I' ][ pos ].keys():
                     prop_start = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts )
                     try:
-                        prop_end = indels[ chrom ][ 'I' ][ pos ][ bases ] / sum_counts_end
+                        prop_end = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts_end )
                     except ZeroDivisionError:
                         prop_end = 0.0
                     try:
@@ -204,7 +191,7 @@ def __main__():
     threshold = float( options.threshold )
     #out_del.write( '#Chrom\tStart\tEnd\t#Del\t#Reads\t%TotReads\n' )
     #out_ins.write( '#Chrom\tStart\tEnd\tInsBases\t#Reads\t%TotReadsAtStart\t%ReadsAtEnd\n' )
-    for chrom in ref_pos.keys():
+    for chrom in chroms:
         # deletions file
         poses = indels[ chrom ][ 'D' ].keys()
         poses.sort()
@@ -214,9 +201,9 @@ def __main__():
             dels.sort()
             for d in dels:
                 end = start + d
-                prop = freqs[ chrom ][ start ][ '-' * d ]
+                prop = freqs[ chrom ][ start ][ d ]
                 if prop > threshold :
-                    out_del.write( '%s\t%s\t%s\t%s\t%s\t%.2f\n' % ( chrom, start, end, d, indels[ chrom ][ 'D' ][ pos ][ d ], 100.0 * prop ) )
+                    out_del.write( '%s\t%s\t%s\t%s\t%.2f\n' % ( chrom, start, end, indels[ chrom ][ 'D' ][ pos ][ d ], 100.0 * prop ) )
         # insertions file
         poses = indels[ chrom ][ 'I' ].keys()
         poses.sort()
@@ -235,6 +222,6 @@ def __main__():
     out_ins.close()
     # if skipped lines because of more than one indel, output message
     if multi_indel_lines > 0:
-        sys.stdout.write( '%s alignments were skipped because they contained more than one indel or had unhandled operations (N/S/H/P).' % multi_indel_lines )
+        sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines )
 
 if __name__=="__main__": __main__()

--- a/test-data/indel_sam2interval_out1.interval
+++ b/test-data/indel_sam2interval_out1.interval
@@ -1,6 +1,14 @@
-ref	133	134	I	C	1
-ref	256	258	D	-	1
-ref	48819784	48819785	I	A	1
-ref	87824726	87824727	I	G	1
-ref	188841437	188841438	I	A	1
-ref	190341171	190341172	D	-	1
+chr1	11	13	D	-	1
+chr1	21	22	D	-	1
+chr1	21	23	D	-	1
+chr2	18	19	I	AA	1
+chr2	28	29	I	CCC	1
+chrM	11	12	I	TTT	1
+chrM	13	14	I	C	1
+chrM	13	14	I	T	1
+chrM	16	18	D	-	2
+chrM	19	20	D	-	1
+chrM	19	20	I	T	1
+chrM	21	22	D	-	1
+chrM	21	22	I	GA	1
+chrM	22	23	D	-	1

--- a/test-data/indel_analysis_in1.sam
+++ b/test-data/indel_analysis_in1.sam
@@ -1,22 +1,19 @@
-r770	89	ref	6	37	7M1I5M	=	0	0	TCGATCTTCATAG	!0//110AA44/1  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r1124	113	ref	4	0	23M	=	0	0	CATCGTTCTGTTAGATCTACGTA	PQRVUMNXYRPUXYXWXSOSZ]M  XT:A:R  CM:i:0  SM:i:0  AM:i:0  X0:i:163148  XM:i:0  XO:i:0  XG:i:0  MD:Z:23
-r1231	69	*	0	0	*	*	0	0	AGACCGGGCGGGGTGGCGTTCGGT	%##+'#######%###$#$##$(#
-r1563	133	*	0	0	*	*	0	0	GTTCGTGGCCGGTGGGTGTTTGGG	###$$#$#$&#####$'$#$###$
-r1789	177	ref	6	0	17M	=	0	0	TCGATCGCTTAGTTCTC	SQQWZY]]YXTSY]]ZM  XT:A:R  CM:i:0  SM:i:0  AM:i:0  X0:i:163148  XM:i:0  XO:i:0  XG:i:0  MD:Z:23
-r3671	153	ref	10	37	6M1I6M	=	0	0	TCTCTTTAGGTCT	/<<!"0///////  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r3824	153	ref	5	37	8M1I7M	=	0	0	ATCGATGTTCTTAGAT	4;6//11!"100110/  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r4800	16	ref	7	255	5M2D6M	=	0	0	CGATCTTTGAT	IIIIIIIIIIC  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r5612	151	ref	5	37	8M1D9M	=	0	0	ATCTATCTTTTGATCTC	/<<!"0/4/*/7//B0/  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r5929	151	ref	11	37	3M1I10M	=	0	0	CTCCTTAGCTCTCC	/<<!"0/4//7//0  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r6743	69	*	0	0	*	*	0	0	TGCCGTGTCTTGCTAACGCCGATT	#'#$$#$###%%##$$$$######
-r9145	115	ref	11	0	19M	=	0	-1	CTCTTAGCTCTCCGAATTAG	7753:<5#"4!&=9518A>  XT:A:R  CM:i:2  SM:i:0  AM:i:0  X0:i:4  X1:i:137  XM:i:2  XO:i:0  XG:i:0  MD:Z:23
-r11770	89	ref	10	37	10M2I8M	=	0	0	TCTCTTAGATGGCTCCGTAT	00/02!!0/120210AA4/1  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r13671	153	ref	1	37	12M1I12M	=	0	0	TCGCATCGATCTCCTTAGATCTCCG	/<""<!"0///////00/!!0121/  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r13762	133	*	0	0	*	*	0	0	TGGGTGGATGTGTTGTCGTTCATG	#$#$###$#$#######$#$####
-r13824	153	ref	13	37	9M1I7M	=	0	0	CATAGATCTACCGGATT	4;6//11!"11100110  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r24800	16	ref	3	255	15M2D9M	=	0	0	GCATCTATCTGATAGCTCCGAATT	IIIIIIIII45"CCCIII?IIIII  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r25612	151	ref	1	37	9M1D5M	=	0	0	TCGCATCGACTCTT	0/4/*/7//00/1C  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r25786	151	ref	21	37	4M1I7M	=	0	0	TGCGTTATTGGG	<!"0/70/BC01  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r27899	69	*	0	0	*	*	0	0	CTGCGTGTTGGTGTCTACTGGGGT	#%#'##$#$##&%#%$$$%#%#'#
-r29192	133	*	0	0	*	*	0	0	GTGCGTCGGGGAGGGTGCTGTCGG	######%#$%#$$###($###&&%
-r29962	16	ref	20	37	4M1I7M	=	0	0	CTCCGGTATGAGG	<!"0/70/7BC01  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
+r327	16	chrM	11	37	8M1D10M	*	0	0	CTTACCAGATAGTCATCA	-+<2;?@BA@?-,.+4=4	XT:A:U  NM:i:1  X0:i:1  X1:i:0  XM:i:0  XO:i:1  XG:i:1  MD:Z:41^C35
+r457	0	chr1	14	37	14M	*	0	0	ACCTGACAGATATC	=/DF;?@1A@?-,.	XT:A:U  NM:i:0  X0:i:1  X1:i:0  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+r501	16	chrM	6	23	7M1I13M	*	0	0	TCTGTGCCTACCAGACATTCA	+=$2;?@BA@?-,.+4=4=4A	XT:A:U  NM:i:3  X0:i:1  X1:i:1  XM:i:2  XO:i:1  XG:i:1  MD:Z:28C36G9  XA:Z:chr5,+134263658,14M1I61M,4;
+r1288	16	chrM	8	37	11M1I7M	*	0	0	TCACTTACCTGTACACACA	/*F2;?@%A@?-,.+4=4=	XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:2T0T1A69
+r1902	0	chr1	4	37	7M2D18M	*	0	0	AGTCTCTTACCTGACGGTTATGA	<2;?@BA@?-,.+4=4=4AA663	XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
+r2204	16	chrM	9	0	19M	*	0	0	CTGGTACCTGACAGGTATC	2;?@BA@?-,.+4=4=4AA	XT:A:R  NM:i:1  X0:i:2  X1:i:0  XM:i:1  XO:i:0  XG:i:0  MD:Z:0T75  XA:Z:chr1,-564927,76M,1;
+r2314	16	chrM	6	37	10M2D8M	*	0	0	TCACTCTTACGTCTGA	<2;?@BA@?-,.+4=4	XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:25A5^CA45
+r3001	0	chrM	13	37	3M1D5M2I7M	*	0	0	TACAGTCACCCTCATCA	<2;?@BA/(@?-,$&	XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
+r3218	0	chr1	13	37	8M1D7M	*	0	0	TACAGTCACTCATCA	<2;?@BA/(@?-,$&	XT:A:U	NM:i:3	X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
+r4767	16	chr2	3	37	15M2I7M	*	0	0	CAGACTCTCTTACCAAAGACAGAC	<2;?@BA/(@?-,.+4=4=4AA66	XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:2T1A4T65
+r5333	0	chrM	5	37	17M1D8M	*	0	0	GTCTCTCATACCAGACAACGGCAT	FB3$@BA/(@?-,.+4=4=4AA66	XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:45C10^C0C5C13
+r6690	16	chrM	7	23	20M	*	0	0	CTCTCTTACCAGACAGACAT	2;?@BA/(@?-,.+4=4=4A	XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76  XA:Z:chr1,-568532,76M,1;
+r7211	0	chrM	7	37	24M	*	0	0	CGACAGAGACAAAATAACATTTAA	//<2;?@BA@?-,.+4=442;;6:	XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:2  XO:i:1  XG:i:1  MD:Z:73G0G0
+r9922	16	chrM	4	0	7M3I9M	*	0	0	CCAGACATTTGAAATCAGG	F/D4=44^D++26632;;6	XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+r9987	16	chrM	4	0	9M1I18M	*	0	0	AGGTTCTCATTACCTGACACTCATCTTG	G/AD6"/+4=4426632;;6:<2;?@BA  XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+r10145	16	chr1	16	0	5M2D7M	*	0	0	CACATTGTTGTA	G//+4=44=4AA	XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+r10324	16	chrM	15	0	6M1D5M	*	0	0	CCGTTCTACTTG	A@??8.G//+4=	XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+r12331	16	chrM	17	0	4M2I6M	*	0	0	AGTCGAATACGTG	632;;6:<2;?@B	XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+r12914	16	chr2	24	0	4M3I3M	*	0	0	ACTACCCCAA	G//+4=42,.	XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76

--- a/tools/indels/sam_indel_filter.py
+++ b/tools/indels/sam_indel_filter.py
@@ -26,11 +26,11 @@ def __main__():
     # prep output file
     output = open( options.output, 'wb' )
     # patterns
-    pat_indel = re.compile( '(?P<before_match>(\d+[MNSHP])*)(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M(?P<after_match>(\d+[MNSHP])*)' )
-    pat_matches = re.compile( '(\d+[MIDNSHP])+' )
+    pat = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' )
+    pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' )
     try:
-        qual_thresh = int( options.quality_threshold ) + 33
-        if qual_thresh < 33 or qual_thresh > 126:
+        qual_thresh = int( options.quality_threshold )
+        if qual_thresh < 0 or qual_thresh > 93:
             raise ValueError
     except ValueError:
         stop_err( 'Your quality threshold should be an integer between 0 and 93, inclusive.' )
@@ -46,54 +46,39 @@ def __main__():
     for i,line in enumerate(open( options.input, 'rb' )):
         if line and not line.startswith( '#' ) and not line.startswith( '@' ) :
             split_line = line.split( '\t' )
-            cigar = split_line[5]
-            # find all possible matches, like 3M2D7M and 7M3I10M in 3M2D7M3I10M
-            cigar_copy = cigar[:]
-            matches = []
-            while len( cigar_copy ) >= 6:  # nMnInM or nMnDnM
-                m = pat_indel.search( cigar_copy )
+            cigar = split_line[5].strip()
+            # find matches like 3M2D7M or 7M3I10M
+            match = {}
+            m = pat.match( cigar )
+            # if unprocessable CIGAR
+            if not m:
+                m = pat_multi.match( cigar )
+                # skip this line if no match
                 if not m:
-                    break
+                    continue
+                # account for multiple indels or operations we don't process
                 else:
-                    parts = m.groupdict()
-                    if parts[ 'lmatch' ] and parts[ 'ins_del_width' ] and parts[ 'rmatch' ]:
-                        pre_left = 0
-                        if m.start() > 0:
-                            pre_left_groups = pat_matches.search( cigar_copy[ : m.start() ] )
-                            if pre_left_groups:
-                                for pl in pre_left_groups.groups():
-                                    if pl.endswith( 'M' ) or pl.endswith( 'S' ) or pl.endswith( 'P' ):
-                                        pre_left += pl[:-1]
-                        parts[ 'pre_left' ] = pre_left
-                        matches.append( parts )
-                    cigar_copy = cigar_copy[ len( parts[ 'lmatch' ] ) + 1 : ]
-            # see if matches meet filter requirements
-            if len( matches ) > 1:
-                multi_indel_lines += 1
-            elif len( matches ) == 1:
-                pre_left = int( matches[0][ 'pre_left' ] )
-                left = int( matches[0][ 'lmatch' ] )
-                right = int( matches[0][ 'rmatch' ] )
-                if matches[0][ 'ins_del' ] == 'D':
-                    middle = int( matches[0][ 'ins_del_width' ] )
+                    multi_indel_lines += 1
+            # otherwise get matching parts
+            else:
+                match = m.groupdict()
+            # process for indels
+            if match:
+                left = int( match[ 'lmatch' ] )
+                right = int( match[ 'rmatch' ] )
+                if match[ 'ins_del' ] == 'I':
+                    middle = int( match[ 'ins_del_width' ] )
                 else:
                     middle = 0
                 # if there are enough adjacent bases to check, then do so
                 if left >= adj_bases and right >= adj_bases:
                     quals = split_line[10]
-                    left_quals = quals[ pre_left : pre_left + left ][ -adj_bases : ]
-                    middle_quals = quals[ pre_left + left : pre_left + left + middle ]
-                    right_quals = quals[ pre_left + left + middle : pre_left + left + middle + right ][ : adj_bases ]
+                    eligible_quals = quals[ left - adj_bases : left + middle + adj_bases ]
                     qual_thresh_met = True
-                    for l in left_quals:
-                        if ord( l ) < qual_thresh:
+                    for q in eligible_quals:
+                        if ord( q ) - 33 < qual_thresh:
                             qual_thresh_met = False
                             break
-                    if qual_thresh_met:
-                        for r in right_quals:
-                            if ord( r ) < qual_thresh:
-                                qual_thresh_met = False
-                                break
                     # if filter reqs met, output line
                     if qual_thresh_met:
                         output.write( line )

--- a/tools/indels/sam_indel_filter.xml
+++ b/tools/indels/sam_indel_filter.xml
@@ -1,5 +1,5 @@
-<tool id="sam_indel_filter" name="Filter SAM" version="1.0.0">
-  <description>for indels</description>
+<tool id="sam_indel_filter" name="Filter Indels" version="1.0.0">
+  <description>for SAM</description><command interpreter="python">
     sam_indel_filter.py
       --input=$input1

--- a/test-data/indel_sam2interval_out3.bed
+++ b/test-data/indel_sam2interval_out3.bed
@@ -1,2 +1,7 @@
-ref	256	258
-ref	190341171	190341172
+chr1	11	13
+chr1	21	22
+chr1	21	23
+chrM	16	18
+chrM	19	20
+chrM	21	22
+chrM	22	23

--- a/test-data/indel_sam2interval_out2.bed
+++ b/test-data/indel_sam2interval_out2.bed
@@ -1,4 +1,6 @@
-ref	133	134
-ref	48819784	48819785
-ref	87824726	87824727
-ref	188841437	188841438
+chr2	18	19
+chr2	28	29
+chrM	11	12
+chrM	13	14
+chrM	19	20
+chrM	21	22

--- a/test-data/indel_analysis_out2.interval
+++ b/test-data/indel_analysis_out2.interval
@@ -1,8 +1,7 @@
-ref	13	14	C	1	7.14	7.14
-ref	13	14	T	2	13.33	14.29
-ref	14	15	C	1	6.67	7.14
-ref	16	17	T	1	7.14	7.69
-ref	20	21	GG	1	8.33	8.33
-ref	22	23	A	1	8.33	11.11
-ref	24	25	G	1	11.11	14.29
-ref	25	26	T	1	12.50	14.29
+chr2	18	19	AA	1	50.00	50.00
+chr2	28	29	CCC	1	50.00	50.00
+chrM	11	12	TTT	1	9.09	9.09
+chrM	13	14	C	1	9.09	9.09
+chrM	13	14	T	1	9.09	9.09
+chrM	19	20	T	1	7.69	8.33
+chrM	21	22	GA	1	8.33	8.33

--- a/test-data/indel_analysis_in2.sam
+++ b/test-data/indel_analysis_in2.sam
@@ -1,16 +1,24 @@
-r770	89	ref	6	37	7M1I5M	=	0	0	TCGATCTTCATAG	!0//110AA44/1  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r1124	113	ref	4	0	23M	=	0	0	CATCGTTCTGTTAGATCTACGTA	PQRVUMNXYRPUXYXWXSOSZ]M  XT:A:R  CM:i:0  SM:i:0  AM:i:0  X0:i:163148  XM:i:0  XO:i:0  XG:i:0  MD:Z:23
-r1789	177	ref	6	0	17M	=	0	0	TCGATCGCTTAGTTCTC	SQQWZY]]YXTSY]]ZM  XT:A:R  CM:i:0  SM:i:0  AM:i:0  X0:i:163148  XM:i:0  XO:i:0  XG:i:0  MD:Z:23
-r3671	153	ref	10	37	6M1I6M	=	0	0	TCTCTTTAGGTCT	/<<!"0///////  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r3824	153	ref	5	37	8M1I7M	=	0	0	ATCGATGTTCTTAGAT	4;6//11!"100110/  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r4800	16	ref	7	255	5M2D6M	=	0	0	CGATCTTTGAT	IIIIIIIIIIC  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r5612	151	ref	5	37	8M1D9M	=	0	0	ATCTATCTTTTGATCTC	/<<!"0/4/*/7//B0/  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r5929	151	ref	11	37	3M1I10M	=	0	0	CTCCTTAGCTCTCC	/<<!"0/4//7//0  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r9145	115	ref	11	0	19M	=	0	-1	CTCTTAGCTCTCCGAATTAG	7753:<5#"4!&=9518A>  XT:A:R  CM:i:2  SM:i:0  AM:i:0  X0:i:4  X1:i:137  XM:i:2  XO:i:0  XG:i:0  MD:Z:23
-r11770	89	ref	10	37	10M2I8M	=	0	0	TCTCTTAGATGGCTCCGTAT	00/02!!0/120210AA4/1  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r13671	153	ref	1	37	12M1I12M	=	0	0	TCGCATCGATCTCCTTAGATCTCCG	/<""<!"0///////00/!!0121/  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r13824	153	ref	13	37	9M1I7M	=	0	0	CATAGATCTACCGGATT	4;6//11!"11100110  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r24800	16	ref	3	255	15M2D9M	=	0	0	GCATCTATCTGATAGCTCCGAATT	IIIIIIIII45"CCCIII?IIIII  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r25612	151	ref	1	37	9M1D5M	=	0	0	TCGCATCGACTCTT	0/4/*/7//00/1C  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r25786	151	ref	21	37	4M1I7M	=	0	0	TGCGTTATTGGG	<!"0/70/BC01  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
-r29962	16	ref	20	37	4M1I7M	=	0	0	CTCCGGTATGAGG	<!"0/70/7BC01  XT:A:U  CM:i:2  SM:i:37  AM:i:0  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:1  MD:Z:22
+r327	16	chrM	11	37	8M1D10M	*	0	0	CTTACCAGATAGTCATCA	-+<2;?@BA@?-,.+4=4	XT:A:U  NM:i:1  X0:i:1  X1:i:0  XM:i:0  XO:i:1  XG:i:1	MD:Z:41^C35
+r457	0	chr1	14	37	14M	*	0	0	ACCTGACAGATATC	=/DF;?@1A@?-,.	XT:A:U  NM:i:0  X0:i:1  X1:i:0  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+r501	16	chrM	6	23	7M1I13M	*	0	0	TCTGTGCCTACCAGACATTCA	+=$2;?@BA@?-,.+4=4=4A	XT:A:U  NM:i:3  X0:i:1  X1:i:1  XM:i:2  XO:i:1  XG:i:1  MD:Z:28C36G9  XA:Z:chr5,+134263658,14M1I61M,4;
+r764	4	*	0	0	*	*	0	0	TTAAGGGGAACGTGTGGGCTATTTAGGCTTTATG	BBB=?BBA?>;?=B=AA=;A at B>=;;>:A=?:?9
+r1288	16	chrM	8	37	11M1I7M	*	0	0	TCACTTACCTGTACACACA	/*F2;?@%A@?-,.+4=4=	XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:2T0T1A69
+r1902	0	chr1	4	37	7M2D18M	*	0	0	AGTCTCTTACCTGACGGTTATGA	<2;?@BA@?-,.+4=4=4AA663	XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
+r2204	16	chrM	9	0	19M	*	0	0	CTGGTACCTGACAGGTATC	2;?@BA@?-,.+4=4=4AA	XT:A:R  NM:i:1  X0:i:2  X1:i:0  XM:i:1  XO:i:0  XG:i:0  MD:Z:0T75  XA:Z:chr1,-564927,76M,1;
+r2314	16	chrM	6	37	10M2D6M	*	0	0	TCACTCTTACGTCTGA	<2;?@BA@?-,.+4=4	XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:25A5^CA45
+r3001	0	chrM	13	37	3M1D5M2I7M	*	0	0	TACAGTCACCCTCATCA	<2;?@BA/(@?-,$&	XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
+r3218	0	chr1	13	37	8M1D7M	*	0	0	TACAGTCACTCATCA	<2;?@BA/(@?-,$&	XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:1  XO:i:1  XG:i:2  MD:Z:17^CA58A0
+r3760	4	*	0	0	*	*	0	0	CCTGTGATCCATCGTGATGTCTTATTTAAGG	BABCBCBBBABBACBCABCAACBBCBBB=?B
+r4767	16	chr2	3	37	15M2I7M	*	0	0	CAGACTCTCTTACCAAAGACAGAC	<2;?@BA/(@?-,.+4=4=4AA66	XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:2T1A4T65
+r5333	0	chrM	5	37	17M1D8M	*	0	0	GTCTCTCATACCAGACAACGGCAT	FB3$@BA/(@?-,.+4=4=4AA66	XT:A:U  NM:i:4  X0:i:1  X1:i:0  XM:i:3  XO:i:1  XG:i:1  MD:Z:45C10^C0C5C13
+r6121	4	*	0	0	*	*	0	0	GTTAATAGGGTGATAGA	AB/BC==CC%ACBC/CB
+r6690	16	chrM	7	23	20M	*	0	0	CTCTCTTACCAGACAGACAT	2;?@BA/(@?-,.+4=4=4A	XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76  XA:Z:chr1,-568532,76M,1;
+r7211	0	chrM	7	37	24M	*	0	0	CGACAGAGACAAAATAACATTTAA	//<2;?@BA@?-,.+4=442;;6:	XT:A:U  NM:i:3  X0:i:1  X1:i:0  XM:i:2  XO:i:1  XG:i:1  MD:Z:73G0G0
+r8122	4	*	0	0	*	*	0	0	GACCTGTGATCCATCGTGATGT	CBBAB/$3BB<AB/,CBCABCA
+r9922	16	chrM	4	0	7M3I9M	*	0	0	CCAGACATTTGAAATCAGG	F/D4=44^D++26632;;6	XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+r9987	16	chrM	4	0	9M1I18M	*	0	0	AGGTTCTCATTACCTGACACTCATCTTG	G/AD6"/+4=4426632;;6:<2;?@BA	XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+r10145	16	chr1	16	0	5M2D7M	*	0	0	CACATTGTTGTA	G//+4=44=4AA	XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+r10324	16	chrM	15	0	6M1D5M	*	0	0	CCGTTCTACTTG	A@??8.G//+4=	XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+r11324	4	*	0	0	*	*	0	0	AATAGGGTGATAGACCTG	//CCCC#ACB;;C3BA5C
+r12331	16	chrM	17	0	4M2I6M	*	0	0	AGTCGAATACGTG	632;;6:<2;?@B	XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76
+r12914	16	chr2	24	0	4M3I3M	*	0	0	ACTACCCCAA	G//+4=42,.	XT:A:U  NM:i:0  X0:i:1  X1:i:1  XM:i:0  XO:i:0  XG:i:0  MD:Z:76


More information about the galaxy-commits mailing list