[galaxy-commits] galaxy-dist commit 2d046444998e: Enable 'extract genomic DNA' tool to accept and produce GFF files and added functional tests for this feature.
commits-noreply at bitbucket.org
commits-noreply at bitbucket.org
Fri Jul 16 10:06:30 EDT 2010
# HG changeset patch -- Bitbucket.org
# Project galaxy-dist
# URL http://bitbucket.org/galaxy/galaxy-dist/overview
# User jeremy goecks <jeremy.goecks at emory.edu>
# Date 1278618202 14400
# Node ID 2d046444998edb1c4a5126897990bddd25de69f5
# Parent 41089d12cdd9eaaeb8e75e09157d412ccf935eae
Enable 'extract genomic DNA' tool to accept and produce GFF files and added functional tests for this feature.
--- a/tools/extract/extract_genomic_dna.xml
+++ b/tools/extract/extract_genomic_dna.xml
@@ -1,20 +1,27 @@
<tool id="Extract genomic DNA 1" name="Extract Genomic DNA" version="2.2.1"><description>using coordinates from assembled/unassembled genomes</description>
- <command interpreter="python">extract_genomic_dna.py $input $out_file1 -1 ${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol} -d $dbkey -o $out_format -g ${GALAXY_DATA_INDEX_DIR}</command>
+ <command interpreter="python">
+ extract_genomic_dna.py $input $out_file1 -d $dbkey -o $out_format -g ${GALAXY_DATA_INDEX_DIR}
+ #if isinstance( $input.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__):
+ -1 1,4,5,7 --gff
+ #else:
+ -1 ${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol}
+ #end if
+ </command><inputs>
- <param format="interval" name="input" type="data" label="Fetch sequences corresponding to Query">
- <validator type="unspecified_build" />
- <validator type="dataset_metadata_in_file" filename="alignseq.loc" metadata_name="dbkey" metadata_column="1" message="Sequences are not currently available for the specified build." line_startswith="seq" />
+ <param format="interval,gff" name="input" type="data" label="Fetch sequences corresponding to Query">
+ <validator type="unspecified_build" />
+ <validator type="dataset_metadata_in_file" filename="alignseq.loc" metadata_name="dbkey" metadata_column="1" message="Sequences are not currently available for the specified build." line_startswith="seq" /></param><param name="out_format" type="select" label="Output data type">
- <option value="fasta">FASTA</option>
- <option value="interval">Interval</option>
+ <option value="fasta">FASTA</option>
+ <option value="interval">Interval</option></param></inputs><outputs>
- <data format="fasta" name="out_file1" metadata_source="input">
+ <data format="input" name="out_file1" metadata_source="input"><change_format>
- <when input="out_format" value="interval" format="interval" />
+ <when input="out_format" value="fasta" format="fasta" /></change_format></data></outputs>
@@ -34,6 +41,17 @@
<param name="out_format" value="interval"/><output name="out_file1" file="extract_genomic_dna_out3.interval" /></test>
+ <!-- Test GFF file support. -->
+ <test>
+ <param name="input" value="gff_filtering_out1.gff" dbkey="mm9" ftype="gff" />
+ <param name="out_format" value="interval"/>
+ <output name="out_file1" file="extract_genomic_dna_out4.gff" />
+ </test>
+ <test>
+ <param name="input" value="gff_filtering_out1.gff" dbkey="mm9" ftype="gff" />
+ <param name="out_format" value="fasta"/>
+ <output name="out_file1" file="extract_genomic_dna_out5.fasta" />
+ </test></tests><help>
@@ -90,7 +108,7 @@ Extracting sequences with **FASTA** outp
CACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCAC
ACACG
-Extrracting sequences with **Interval** output data type returns::
+Extracting sequences with **Interval** output data type returns::
chr7 127475281 127475310 NM_000230 0 + GTAGGAATCGCAGCGCCAGCGGTTGCAAG
chr7 127485994 127486166 NM_000230 0 + GCCCAAGAAGCCCATCCTGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCACACACG
--- /dev/null
+++ b/test-data/extract_genomic_dna_out5.fasta
@@ -0,0 +1,258 @@
+>mm9_chr10_62044836_62045189_+
+AATTACAAGATCGACACACCAAGATAGGCAGATCCATGGTTGGTTTTACT
+TTGTAAATCTAAAAGTATGTTGGAAAACGATGCAATGAATTCTTATCCTT
+TTTCAAAATGAAGAATTTGTGATGGTTAGTGGACAGTTCAGAAGCCTCTC
+TGCAAGAAAGGGGGCGCTGAGAAGTGGTAAAAAAAGGAAGGAAGCACTCG
+GGCTTTGTCAGCAGGGTGGACCCTGGGGTCCACAGTGGGAACAGTCCCTT
+CTGGCCTCTACTCACTGACCAAACGCTTTACTAAAACTCCGCTTCTGGCC
+TCTGTTGCCACCTCCTGGTCGCTGTCCTCGGAAGTTTCTACTTCCTCCTC
+GCT
+>mm9_chr10_75372918_75373002_+
+GCGTCTCGCAGCTTCTGCCCGTCGATCTCCATGTCGAGCCGGATGGGCAC
+CAGCACCTCAGGCTGTGACGCATTCTCATGGATC
+>mm9_chr10_80362427_80363292_-
+ATGACGGACAAGTGTTTCCGGAAGTGCATCGGGAAGCCCGGGGGCTCCTT
+GGATAACTCGGAGCAGGTGAGACATCTCGGGAACCCGGGGTGGTGAGGGG
+CGCGGGGTCAGGAGCGTCTAGGAGGTTGAGAGATGTGCGCGTGCGCGGCC
+TCTAGCCTTAGCTACTGAGGAAGTTGTGCGCGTGCGCGGGGTGAGGACCC
+GGCTTCTGTGCCTAGATCGGTGCAGCCTTCATGGGTGATCCTCGGGTCGT
+GTGACCGTCAGTCAGGGATCCCCCTCCACGCTTTGCAGAAATGCATCGCC
+ATGTGCATGGACCGCTACATGGACGCCTGGAATACCGTGTCCCGCGCCTA
+CAACTCTCGACTGCAGCGGGAACGAGCCAACATGTGACCGGGACCTGTGC
+CTCGGGACACCGTGCTTATGGTCTGAACTGTTTTCCCTGCCAGTTAGGGT
+GTCTCCTCCTAGCCGCCCTGAAGTCTGGCAGCATGGAGGGCTTGGGGATC
+GAGGCCTCTCCCCTGGGTTGCTGCGTCCAGCTCAATCTCAGAAGAGAGTG
+AGGACCCGACAGAGCACAGGGATCTGGCTGGCCCCACTGACCTGTGACCT
+CAGGAGAGCAGGCCAATAAATCGCTGCTGGGGCAGTAAAGCAGGCGTGTC
+ACCTCACTGCTTCAGGTCCCTTCCCCTGAGTAGGCCCAGACCTCCCAGGG
+TATCTTTCCCCTTGGGGTCAGTGGGCTGCTGGCTCTCAGGGAATTCGGAG
+CATGATCTCAGGTGTTTGGTCATCCCGGGGAGACCAGCCGAGGTTAAGAA
+GCAAGGCTTCATGTagccttcacctatcatgcatgaggcccagggtgctg
+accttaactctgaat
+>mm9_chr11_7904564_7904642_+
+CATCTTCTATTTGAGCCTCCATCCAGGCACCTCTGAAACAAAGGTGCACT
+CACTGCATGTCCACTTGTCACAGGAGCC
+>mm9_chr11_78140155_78140259_+
+CTGCTTGCTAATTTTCTCTCTTGGGATCAGGGGGACGTGAACTCCAGCCC
+TGACTCGTGCTCCTTATGCTCTGAGTACATAGCAAATAAATGAGAGCAAA
+ACAC
+>mm9_chr11_105616461_105616737_+
+TAGGTGTAATAGTGGAAAACAATAGTTTTTAAACTTCAGAGTCCAGGGCT
+GTAACTCAGTAGTAACAGTGTTCTCTAAGTATGTTATTCTTCCTCTACAT
+GCTGAAATTTTTCATATTTGGAGCATTCACTGTTCCATGTATCAGTAAAT
+TATATTGTGAGCTGTCATCATATCTAAGCACCATATTGAATATTTTTCAT
+GATTAAAATTTGTTGAAACAACAATTCTATGACCGAAAAAAGCAAGGCTT
+TGTAAATAACATGTTTGTTACTAGTA
+>mm9_chr12_30701761_30702509_+
+TGTGGAGTGTACTTATATGATCCCTATGCTGATAGGATTACCTTCCTAGA
+CATAGCTAGACGCAAAGCCACATGTGTAAGGCTGCTGAGCAAAGACAGCA
+TCCCAGCATGGGTGTGTTCACGGTGGATTCACCACGTTGCATATGTAAAG
+TGGTCCCCTTGGCTTACCCTTCACTTTGCTCATGAGATTCAGAAGCTGGT
+GGTCCAGCAGGGGTGAGCATTTGTGAAATAGTAAGCTGAACTTAGTGGTG
+AGATTTCAGAACAGACTTCTGTGAAGTAAGAGATGTAACCATGCATCTAA
+AATCAGATGGCCGTGTAACTGCTCGGGCATAGAAATGGTGGGAGAACCTG
+TCCTGGGTACCTGGCATTTCACATGAGCCCAGGGATATGTCTTGTGCCAA
+GGCACACAAGTGTCCATGGACTTGGACAGGTGCCAAGGGTTTTTGTCTCT
+GTTCCTATGTGGGAGGCTGGCTGTGATTTACATTAATTTCTGTATTTCAA
+ACGAAGATGTCTGCAGATCTCCATTTTGATGTTACAGCCTCATTGCCCAG
+GCAGTGGGCAGTGCCCAGACACCCTTTCTGACTAGCCACTGCATTGGGCT
+TCTGTGATTCAAAGTAGTGTATATATTTATTTACTTCTCTGACTGTGGCC
+AACAGCCAAATGCCATTTTATGTTCCTTGTATTCAGTCCATTACCAAAGA
+GGTGTTTGCACTTTGTAATGATACCTTTCAGTTCAAATAAAAGGACCA
+>mm9_chr13_49159495_49159569_+
+ttttcttttggattacttgatttttttttatttgatcttatttatgatga
+ttttgagtacatttttgaacagtt
+>mm9_chr13_100200303_100200330_+
+TCTCATATGAATAGCCACCCTCTTCTG
+>mm9_chr14_31949102_31949152_+
+GGATGCTATCCGCGATGTGCATGTAAAGGGCCTCATGTACCAGTGGATCG
+>mm9_chr14_67604226_67604668_+
+TTCACCGTGAGAGTTTTCTCCATTTCACTCTTCACTGTGCTGTTCTCTGT
+GCCGCTTTCCTCTTGACTTATAAACATCTGAGCCAGTTTTCAATAAACTT
+AAAACGAAGCCTGCTTCTCATCCCAAATTGTAAACAGGAATAAAGCTTTT
+TAAACCTTATCTTAAATTTTAACTTTGTTGAATTCTGCTTTGTGATAGGA
+CAATCTGTTTCACCCAACAAGAATCTGTGTAGGAGGATGAACATCCCGCA
+TGTTGGAGCTGCAAATCAGCACTGTACAAGCTCACTGATGGACAGCTGTT
+CTGTGATGTATTCCATGATTTTACTAATACTTTCAAAAATGGCAAAACTA
+ACTTCAGTTTTAATGTTGAAAGAAAATCATAAATGTTCCCATAGTTCAAT
+GGCACTGTCGATGAAACTGCTACTGAATTTAGAGAGAAAACG
+>mm9_chr14_75165581_75165744_+
+ggccctgggatgataTAACAGAAGAGTCTAAAGGAGGCTTCTGAGATGTG
+CAGTAGGAAAGCCTGGCACATAATAGGTTATTATCTAAATCCCTTCACTA
+CTCTTCAAAGACAGCAGGATGCCTCTGCTCCCATGTTTTATCTCTACTTA
+TGTGGAATTTATG
+>mm9_chr16_57154026_57154067_+
+GTTGAGGTTTATTTAAGTAAAATGATTTTTTAAAAAAGCAA
+>mm9_chr16_74862301_74862560_+
+GCATTGGCAGCAGATATTGGTACCCAGTGGCACTGCAGAGTACTTACAAT
+CAGGACTCGCTACTGTGCTTCATTCTGCTTTTCTCTCTGCTTCTATTACA
+GTTAAAGTGTTGCTAATTATAGAAACTCTCTGTTTATTGAACCTCGGTGT
+TAAGAAAAACTTGTAATCTTCAGATATGATCCGAAAGATTCCCAAACAAA
+TGTAACAAGGTCCACTTTTGTAGCCCTTTCTACCAGAAcactggttatca
+acctgtggg
+>mm9_chr16_98168778_98168914_+
+CCTATTTATTTCACTAAACATCTGCCTGCTAGCTGAGATAAACATTCTCT
+AAAAAACTGTTTACTGCAAAAAGTGATTACTGTTTTTTATTAGTTTCTTA
+GCATTTGAAATAGTTACATGAATGGAAGGATAGAGT
+>mm9_chr17_8483211_8483268_+
+AGACTTGTCAACAGCTCACCCAATGATGGAACTGAGGCTGCCCCTCAAGT
+GGCCAGA
+>mm9_chr17_30355790_30355913_+
+atctcatacccataagctcagaactcggggtggtaacataggaggactgc
+catgagtgtgactaacctgggctataggaggaggatctaccttaagcaaa
+tgaCCAACAAAACTAACAAGCTC
+>mm9_chr18_39571717_39571880_+
+TATAACATTCCATAAATGTACAATAATCTATTTTTGAGAAGCTCATTTTG
+AAACTTAACACTGTCATTGATAATCTTCAAGTGGTATTTCTTAGGCACCA
+TAAATTTCACATCCAGCTGGGTTACAATTATTTTAAAGTACTTTGAGACC
+AATTTAAACCATT
+>mm9_chr19_17633087_17633203_+
+TGGGAAATGAACTGCATGGCAATGAACCCCAGGGAATTTGGTGGTTAATT
+GTCTAAGGATAAGGACATCAGTTTTGTCTTTTGCATCACTGTGACCTTTG
+CCTCTAATTGTATAGA
+>mm9_chr19_41997623_41997859_+
+gctacacaacgactcacatagagggaagcaggcacacatcagataaaaca
+cAAAAGGATGGGTTGGTGATGGGCATAGTTAATGAGGGCCACTAGGTAAA
+TACACCTGATCCAAAAGTCACGCTACTACTTAGATTCTTCTCTCTGCTAA
+AGACAACAGAAgacatgttagccatgcttgtaatccctgcattggggaga
+tggagtcagaaatatcactgcaagttcacccaatag
+>mm9_chr19_56516514_56516684_+
+TGTATTCATTCACTATTCACTGATTTGTCAGATCATCCATCCACACAGGT
+GCTGAAGAGTAACCCATTTCACTTTGTATACAAGATAATGTTTTTGTACT
+TCAAATACATCTGGAATTCTTTCAAATATTCCAAGATTTTTTTTTTTTCT
+GAATAATCTTTGGTTACCTC
+>mm9_chr2_4543773_4543977_+
+gagccatttctccagccccTTTATGTGGAATATTAACAAGAGAAGACAAC
+ATAAAATGACTTACCATGCTGTGTGGCCTAACAGTGGATGAAGAATGAGT
+GATTTGGGCATTTCTGATAGTATTTATAAAGAAGACTTTTATGACCAAAC
+CACATGTCACAGTAGGGATTTGCTGCACATCTTATGAGAGTTTCTTCTTT
+GTCA
+>mm9_chr2_30200330_30200938_+
+CGCACACAAAGGATTTATTTGCCAGAGAGCAAGCAGACAGGCAGAGGTCA
+GAATGTTAGTTAGAAACTGAAGGAATGACTGCTGTAGCCACTGTGCCCAG
+CCAGAGCCATGAGGGAAGTGGGAGGCAGCACTTGGTGCTGCTGCTCTGGC
+TGACCCTTCTGGTTTCCTGCCACACTCCTAGCCCTGCCTGTGTGCTGCTG
+TCCCCCTCAACCTTCCACAGCCAGAAGGCAGATGTTCTTTCATGCCAAGA
+GCATCCATCCCCAGCATATCCTGGGCCCATGGTGGTGTCAAATGTAGTGA
+CCCTTCTGCCTTAAGGGAGCTGGGAAGCCTGGGGTGTGCAGGGTTGCAGG
+TCAGAAGCAGGACTAGCAGAGGGGCCTGGGGCCATTCTGTCTTGTGGGCT
+CTTTAATAGCTGAATGACGGGCACAGCCAGAAAAGGGTTAGGTCCCTTAT
+CCTAAGCAGCTCTGTGGCCAGCAGACGACTCTAAGTGGCAGAGCCTGGGA
+AGGGGCTGCTTAGCTGAGAAGTTCCAGGTAGGTGACAGGAACCTTGCCCT
+TCTTGTTGCCTCTCTCACCAATGAGCCAGTCGGGATCCATGCCTGGCAGG
+CTGTAGAC
+>mm9_chr2_106644219_106644341_+
+attcttaaggtaaatacctaggagtgatgtaacccagtcatagggaagaa
+ctacttttaatttgttgagcaacccccaacctgattttgacacaggtttg
+agtagtttacacttctactaac
+>mm9_chr2_125388930_125389219_+
+AGAGCACACAGCACATCACTTAGGCCTCCAACATTAAGGCAGCGCAAGTG
+CCTCAAGTAACTGAGAATACTTTACTCAGATACAAGGGTATCAAAAACAT
+GAGAACTGGCAGGAAGACCTCACAATGGTTTGTTAGCATCAAGTATTACC
+ATCCAGTTTCCTGTTTAAATAGTAATTAATGACTATTCTGAAATAAGGCA
+AATAATTACTCAAGCGGGCTGTCAAAGCCACTATCCTGTTGGCTGGGCAT
+CGGAGCAGTTAACTTTATCAAAGGCTTCTGACACAATGA
+>mm9_chr3_130936638_130936898_+
+CGAGGCTGCAGGCTGCAAATGTTCCCAGGCAGGCAAGACCTCACGTCCTA
+CTGGCTGCTGCCCTTGGGTGCATCTGTAGGCCCCGTGGCTCCTGCCCCTG
+GGGTTCAACACCGATAAACATAGAATACTCATTTTCAGAAGACCTGAGGG
+AATGAGTCTAAGCAACGCTTTTTACAAAAAGTGGCAAGGTTCAGGAAAAA
+AAAAAAAAAAGATGTTGCTCCAAGGCACCAAGGGTGTAATTTTTTTTCAG
+AAAAAGTCAG
+>mm9_chr3_136592670_136592771_+
+TGTCAGCCCATCACATTTTAGTGACAACAGTCATAGCCTTTATTTTCAGA
+TGACTTTCCTCTAAAACCACTGTCTATGAGTTGCCCCCCAAAACTCAAAA
+A
+>mm9_chr3_152861373_152861508_+
+ATCAAAAGCGACATGCAAGCATCTTGCTCTCACCACAGATCACTGAGACA
+TTAAGAGTGACGTCTCTTGAACTGTTGGCACGCCTAAGTTATTTCAGCAT
+TTCTTGCTCAGCAGTTGTTCTCTTGGCTTCCTCTG
+>mm9_chr4_13715309_13715630_+
+AACACATGGCCACATCATGTGATATTTTCAAAACACTTACACATAGCTTT
+GAGAAGGTCCCTGCAGGAATGATCCATCCTCTCACAGTTGGCCCATTTTT
+TAACAGCATATCTGCATTTTCCATTTAGGAGAGCTATATATTATTAGCTT
+ACATTTTTGGGTAGTAAAACAGTGCATTGCTGATTGTAAAACATGGACTT
+TATTATCTGCTGAAAATTGATTTGGCATTTATAGCCACTGTGTATTAGAC
+TGTTTTTCTGTTTTTAACATCAATGCTTAAAAGCGATGATTTGTGTTTaa
+aaaaattaaaaaaataaaata
+>mm9_chr4_147515028_147515097_+
+GCTGACGTGCTCTCCGAGTTCCTGGAGGTGGCCGTGCACCTGATTCTCTA
+TGTGCGCGAGGTCTACCCG
+>mm9_chr5_3949521_3949685_+
+AGTCCCAACCACCCCCTTGTTTAATGTATAACTTTCTGAAATGGGAGCGT
+TAGAATGGATTAAAATGGTTGGTAGGTGGTTGGATCACCAACCAAGACCA
+GAAATAGAGGGGTAGGCTGCTCAGGAGAGTATTGGGAGGGTAGCTATTAT
+TTGCATTTTGTGCT
+>mm9_chr5_68089693_68089831_+
+CAATGATAGAGAAGACTAAAATAAAAGCAGGCATGCTGGCACAAGCGACA
+GAAGGAAAAAGCCTCACCCGGCCCTGTTTGAGGCCACTCCTGGTGGCTCC
+TTTTCCAAGGACCATGCGGTCAAGCCTCTGAGTTGTTC
+>mm9_chr5_122819525_122819619_+
+CTTTAGAAAAGATGCATCTGTCATTGATTTAGGGATATGAATTGTTTGGA
+TTTGAGTAGTTTTCCATAACTCCTGCAGTTTGGCAATGTGTGCG
+>mm9_chr5_145619547_145619710_+
+CGGCGTTCTGAAAACTGTGCTCCGGGATGAGATCATTGCTTGGCACAAAA
+AGACACAGGAGGACACTTCCTCTCCACTGTCGGCCGCAGGGCAGCCTGAG
+AACATGGACAGCCAGCAGCTGGTTTCCTTAGTTCAGAAAGCCGTCACTGC
+CATCATGACCCGC
+>mm9_chr6_83928983_83929105_+
+ACAGGAACCATTATTTACATTTAATTTGGATGAATTTGTTACTGTGGATG
+AAGTCATAGAAGAAGTAAATCCTTCTCAAGCCAAGCAGAATCCATTAAAA
+GGAAAAAGAAAGGAAGCCCTCA
+>mm9_chr6_118857948_118858148_+
+CCAGGCTTGCTAGTTGGTGCAGTTAGCTACATCTCAGGACAGAGACAAGG
+TACTCTGAGCTCCCCTTGAACTGCCACACAAGCTGTCTCCTGGATGCCAA
+GCAGAGAAACCTGGAGACAACAATCATCATACTCAAAACCAGGATCTCTT
+TCTTAAGACTTTTGTATTTTGTCCCAGCCCTAACCCTGAGTTCTGCTGAA
+>mm9_chr7_85554209_85554343_+
+GTGAAACATCATGCTTCTGCATCAAGTTATTAGTGGGAAACCTGTAAAAG
+TTGACATTGAATGCTGATAACAAATTACTTTCATCCTGTCTCATAATGAA
+TCCTACATCAAGACAAGGCAAGTGAGAAAGAGGG
+>mm9_chr7_104055490_104055589_+
+ACATTTCTCCTCTCTTGGGGGAGCGCATCTCCTTGGGTGTGTCCACATCC
+GCCCCTAGGTACCCAGTGTGATGTGAGACACGAGTGTCTGTGCTAACTT
+>mm9_chr8_9970397_9970545_+
+AGTCTTCACCAAAATTAAGTCTCAGCTAACTTAAAAGTTGCAAGGATTTT
+TTTCAATAAAATTAATATCTTAAGTGTTTGGTGTTTAGATGATTCTCTCT
+CAACTTCCCCCACATTATCAAAAAACATTTGATGAACCTTAAAAACTC
+>mm9_chr9_20449845_20449932_+
+CCAGCACCGATGACACCATCGGCGACTTGAAGAAACTGATAGCTGCTCAA
+ACTGGCACCCGCTGGAACAAGATCGTTCTTAAAAAGT
+>mm9_chr9_107445869_107445930_+
+CAAGCAGAAGCTGGTGCCCATCATGACCATCCTGCTGGAAGAGCTGAATG
+CCTCCGGCCGC
+>mm9_chr9_120860475_120860606_+
+CTGCCATTGTACGCACCATGCAGAATACAAATGATGTAGAGACAGCTCGT
+TGTACTGCTGGGACTCTGCACAACCTTTCTCACCACCGCGAGGGCTTGCT
+GGCCATCTTTAAGTCTGGTGGCATCCCAGCG
+>mm9_chrX_10274056_10274087_+
+ACTTCGCTGTCATCATTTGTACAAACTCTTT
+>mm9_chrX_39881430_39881678_+
+AGCTAAAAAGAGTCCTTTTCTGACAGAAAGGCTGGACTTCTCCTTTTCAC
+CGTTTCTCTTACTGATGCTTTTGCCAGAAGAACAGTAAAGATTTAGACAC
+TGTCATGATTCATACACGTAAAATATTTTTCAAGGACACAATCTGATATA
+CTAACATTTATTTAAGAGGTTAAAGTCCACCACTAAATCTAAGGAAAGAT
+TTTTAACTGCCAAACACATTTCCTTTGACAAATAATGTAAGATGACAA
+>mm9_chrX_148249671_148249713_+
+AATGCTAGTATGAACAGTGGGAGGAATGAGCAAAATGTTACA
+>mm9_chrX_148481504_148482455_+
+CGCCACAACCTGCTACAGGCCTGTAAGATGCAGGACATCAAACTGCCACT
+GTCAAAGGGCACCATGGATGATATTAGTCAGGAAGAAGTGAGTATTATGG
+TGGGTGGTAGGAGTCATCTATGAATATTTAACCAGTAATGGGAGATTACA
+GATGGCCAGGAAGGGCAGGCAACAGATAGGACCACATAGAGTTGTGAGGG
+GCATAAAGATGGATGCAGAAGAAATGTGGCAAGGTGGAAGTAGTGAAGTC
+AGGCTTTGGTATGAGAGAGACATTGATTTGAGAGGAGAGCTGCAAGCCAG
+TGAGTACTCAGAAAGACCAAGAATGGGTCATTAATCTTAAGGATTTGAGC
+TCTTAGCTGCAGCAGATACTGGGCATGGGTAGGAGTGAGAATTGAGGAGC
+AGAGGAAGATGGGAAACTGGAGAACCTAAGGAGACTGATAGCTTAGCTGC
+AGTAAGGGAGGTTGGCCAGAAGAGGGTTGGGTAGGGGACTCAGCAAGGCA
+GAACTAAGGAAGCTTAGGTGGAGGGGAAGGAACAACATCTGAGCAACTAA
+AGCACTCTATCAACTGGAAGTGCAAGATGGTAGTGAGGGGTGGACAGGTG
+TAACTGAGTAACTCTTTGTAGGTAGCCTTTCAGTTTAATTCAGTAAAATA
+TTTTGAACACTAGTATTCCAGATACTGGTAGGCCATGACTTAACCATTCC
+TAATGTTAATCTCAGCTGTGCTAGCTGAGCTTGTGTTCACATTAGACATG
+AAGAAACTTAGTAAAAGGTAGAGCCCAGTTTTCGGTTTGGACCTTCCTGT
+TGGCCTCTGCTTCCGTGCCATCTAGCAAAGGAGTTCCTAATCTCTAGAGG
+GATACAAATGACTAGTCTGCTCCATCTGCCTCTTCCAACATTGCAGGGTA
+GCTCCCAGGGAGAAGAGTCAGTGAGTGGTTCCCAGAGAACATCCAGTATC
+T
--- a/tools/new_operations/gops_intersect.py
+++ b/tools/new_operations/gops_intersect.py
@@ -70,7 +70,7 @@ def main():
for line in intersect( [g1,g2], pieces=pieces, mincols=mincols ):
if type( line ) == GenomicInterval:
if in1_gff_format:
- line = convert_to_gff_coordinates( line )
+ line = convert_bed_coords_to_gff( line )
out_file.write( "%s\n" % "\t".join( line.fields ) )
else:
out_file.write( "%s\n" % line )
--- a/tools/extract/extract_genomic_dna.py
+++ b/tools/extract/extract_genomic_dna.py
@@ -5,6 +5,7 @@ usage: %prog $input $out_file1
-d, --dbkey=N: Genome build of input file
-o, --output_format=N: the data type of the output file
-g, --GALAXY_DATA_INDEX_DIR=N: the directory containing alignseq.loc
+ -G, --gff: input and output file, when it is interval, coordinates are treated as GFF format (1-based, closed) rather than 'traditional' 0-based, half-open format.
"""
from galaxy import eggs
import pkg_resources
@@ -14,6 +15,7 @@ from bx.cookbook import doc_optparse
import bx.seq.nib
import bx.seq.twobit
from galaxy.tools.util.galaxyops import *
+from galaxy.tools.util.gff_util import *
assert sys.version_info[:2] >= ( 2, 4 )
@@ -50,6 +52,7 @@ def __main__():
chrom_col, start_col, end_col, strand_col = parse_cols_arg( options.cols )
dbkey = options.dbkey
output_format = options.output_format
+ gff_format = options.gff
GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR
input_filename, output_filename = args
except:
@@ -80,6 +83,8 @@ def __main__():
chrom = fields[chrom_col]
start = int( fields[start_col] )
end = int( fields[end_col] )
+ if gff_format:
+ start, end = convert_gff_coords_to_bed( [start, end] )
if includes_strand_col:
strand = fields[strand_col]
except:
@@ -162,7 +167,11 @@ def __main__():
c = b
else: # output_format == "interval"
meta_data = "\t".join( fields )
- fout.write( "%s\t%s\n" % ( meta_data, str( sequence ) ) )
+ if gff_format:
+ format_str = "%s seq \"%s\";\n"
+ else:
+ format_str = "%s\t%s\n"
+ fout.write( format_str % ( meta_data, str( sequence ) ) )
fout.close()
--- a/tools/new_operations/gops_subtract.py
+++ b/tools/new_operations/gops_subtract.py
@@ -71,7 +71,7 @@ def main():
for line in subtract( [g1,g2], pieces=pieces, mincols=mincols ):
if type( line ) is GenomicInterval:
if in1_gff_format:
- line = convert_to_gff_coordinates( line )
+ line = convert_bed_coords_to_gff( line )
out_file.write( "%s\n" % "\t".join( line.fields ) )
else:
out_file.write( "%s\n" % line )
--- /dev/null
+++ b/test-data/extract_genomic_dna_out4.gff
@@ -0,0 +1,46 @@
+chr10 Cufflinks transcript 62044837 62045189 1000 . . gene_id "CUFF.23531"; transcript_id "CUFF.23531.1"; FPKM "19.5178121606"; frac "1.000000"; conf_lo "9.264456"; conf_hi "29.771168"; cov "1.108611"; seq "AATTACAAGATCGACACACCAAGATAGGCAGATCCATGGTTGGTTTTACTTTGTAAATCTAAAAGTATGTTGGAAAACGATGCAATGAATTCTTATCCTTTTTCAAAATGAAGAATTTGTGATGGTTAGTGGACAGTTCAGAAGCCTCTCTGCAAGAAAGGGGGCGCTGAGAAGTGGTAAAAAAAGGAAGGAAGCACTCGGGCTTTGTCAGCAGGGTGGACCCTGGGGTCCACAGTGGGAACAGTCCCTTCTGGCCTCTACTCACTGACCAAACGCTTTACTAAAACTCCGCTTCTGGCCTCTGTTGCCACCTCCTGGTCGCTGTCCTCGGAAGTTTCTACTTCCTCCTCGCT";
+chr10 Cufflinks transcript 75372919 75373002 1000 . . gene_id "CUFF.24985"; transcript_id "CUFF.24985.1"; FPKM "124.4970510798"; frac "1.000000"; conf_lo "71.411330"; conf_hi "177.582772"; cov "7.071429"; seq "GCGTCTCGCAGCTTCTGCCCGTCGATCTCCATGTCGAGCCGGATGGGCACCAGCACCTCAGGCTGTGACGCATTCTCATGGATC";
+chr10 Cufflinks transcript 80362428 80363292 1000 - . gene_id "CUFF.26065"; transcript_id "CUFF.26065.1"; FPKM "43.6170921216"; frac "1.000000"; conf_lo "32.260169"; conf_hi "54.974016"; cov "2.477449"; seq "ATGACGGACAAGTGTTTCCGGAAGTGCATCGGGAAGCCCGGGGGCTCCTTGGATAACTCGGAGCAGGTGAGACATCTCGGGAACCCGGGGTGGTGAGGGGCGCGGGGTCAGGAGCGTCTAGGAGGTTGAGAGATGTGCGCGTGCGCGGCCTCTAGCCTTAGCTACTGAGGAAGTTGTGCGCGTGCGCGGGGTGAGGACCCGGCTTCTGTGCCTAGATCGGTGCAGCCTTCATGGGTGATCCTCGGGTCGTGTGACCGTCAGTCAGGGATCCCCCTCCACGCTTTGCAGAAATGCATCGCCATGTGCATGGACCGCTACATGGACGCCTGGAATACCGTGTCCCGCGCCTACAACTCTCGACTGCAGCGGGAACGAGCCAACATGTGACCGGGACCTGTGCCTCGGGACACCGTGCTTATGGTCTGAACTGTTTTCCCTGCCAGTTAGGGTGTCTCCTCCTAGCCGCCCTGAAGTCTGGCAGCATGGAGGGCTTGGGGATCGAGGCCTCTCCCCTGGGTTGCTGCGTCCAGCTCAATCTCAGAAGAGAGTGAGGACCCGACAGAGCACAGGGATCTGGCTGGCCCCACTGACCTGTGACCTCAGGAGAGCAGGCCAATAAATCGCTGCTGGGGCAGTAAAGCAGGCGTGTCACCTCACTGCTTCAGGTCCCTTCCCCTGAGTAGGCCCAGACCTCCCAGGGTATCTTTCCCCTTGGGGTCAGTGGGCTGCTGGCTCTCAGGGAATTCGGAGCATGATCTCAGGTGTTTGGTCATCCCGGGGAGACCAGCCGAGGTTAAGAAGCAAGGCTTCATGTagccttcacctatcatgcatgaggcccagggtgctgaccttaactctgaat";
+chr11 Cufflinks transcript 7904565 7904642 1000 . . gene_id "CUFF.33508"; transcript_id "CUFF.33508.1"; FPKM "61.6484988869"; frac "1.000000"; conf_lo "22.882428"; conf_hi "100.414569"; cov "3.501633"; seq "CATCTTCTATTTGAGCCTCCATCCAGGCACCTCTGAAACAAAGGTGCACTCACTGCATGTCCACTTGTCACAGGAGCC";
+chr11 Cufflinks exon 78140156 78140259 1000 . . gene_id "CUFF.43148"; transcript_id "CUFF.43148.1"; exon_number "1"; FPKM "54.8483511750"; frac "1.000000"; conf_lo "23.181641"; conf_hi "86.515061"; cov "3.115385"; seq "CTGCTTGCTAATTTTCTCTCTTGGGATCAGGGGGACGTGAACTCCAGCCCTGACTCGTGCTCCTTATGCTCTGAGTACATAGCAAATAAATGAGAGCAAAACAC";
+chr11 Cufflinks exon 105616462 105616737 1000 . . gene_id "CUFF.48385"; transcript_id "CUFF.48385.1"; exon_number "1"; FPKM "18.9452034252"; frac "1.000000"; conf_lo "7.520816"; conf_hi "30.369591"; cov "1.076087"; seq "TAGGTGTAATAGTGGAAAACAATAGTTTTTAAACTTCAGAGTCCAGGGCTGTAACTCAGTAGTAACAGTGTTCTCTAAGTATGTTATTCTTCCTCTACATGCTGAAATTTTTCATATTTGGAGCATTCACTGTTCCATGTATCAGTAAATTATATTGTGAGCTGTCATCATATCTAAGCACCATATTGAATATTTTTCATGATTAAAATTTGTTGAAACAACAATTCTATGACCGAAAAAAGCAAGGCTTTGTAAATAACATGTTTGTTACTAGTA";
+chr12 Cufflinks exon 30701762 30702509 1000 . . gene_id "CUFF.53897"; transcript_id "CUFF.53897.1"; exon_number "1"; FPKM "48.9333329111"; frac "1.000000"; conf_lo "37.780391"; conf_hi "60.086275"; cov "2.779412"; seq "TGTGGAGTGTACTTATATGATCCCTATGCTGATAGGATTACCTTCCTAGACATAGCTAGACGCAAAGCCACATGTGTAAGGCTGCTGAGCAAAGACAGCATCCCAGCATGGGTGTGTTCACGGTGGATTCACCACGTTGCATATGTAAAGTGGTCCCCTTGGCTTACCCTTCACTTTGCTCATGAGATTCAGAAGCTGGTGGTCCAGCAGGGGTGAGCATTTGTGAAATAGTAAGCTGAACTTAGTGGTGAGATTTCAGAACAGACTTCTGTGAAGTAAGAGATGTAACCATGCATCTAAAATCAGATGGCCGTGTAACTGCTCGGGCATAGAAATGGTGGGAGAACCTGTCCTGGGTACCTGGCATTTCACATGAGCCCAGGGATATGTCTTGTGCCAAGGCACACAAGTGTCCATGGACTTGGACAGGTGCCAAGGGTTTTTGTCTCTGTTCCTATGTGGGAGGCTGGCTGTGATTTACATTAATTTCTGTATTTCAAACGAAGATGTCTGCAGATCTCCATTTTGATGTTACAGCCTCATTGCCCAGGCAGTGGGCAGTGCCCAGACACCCTTTCTGACTAGCCACTGCATTGGGCTTCTGTGATTCAAAGTAGTGTATATATTTATTTACTTCTCTGACTGTGGCCAACAGCCAAATGCCATTTTATGTTCCTTGTATTCAGTCCATTACCAAAGAGGTGTTTGCACTTTGTAATGATACCTTTCAGTTCAAATAAAAGGACCA";
+chr13 Cufflinks exon 49159496 49159569 1000 . . gene_id "CUFF.67788"; transcript_id "CUFF.67788.1"; exon_number "1"; FPKM "44.9657653777"; frac "1.000000"; conf_lo "10.974842"; conf_hi "78.956689"; cov "2.554054"; seq "ttttcttttggattacttgatttttttttatttgatcttatttatgatgattttgagtacatttttgaacagtt";
+chr13 Cufflinks transcript 100200304 100200330 1000 . . gene_id "CUFF.73108"; transcript_id "CUFF.73108.1"; FPKM "123.2395051093"; frac "1.000000"; conf_lo "30.079196"; conf_hi "216.399814"; cov "7.000000"; seq "TCTCATATGAATAGCCACCCTCTTCTG";
+chr14 Cufflinks transcript 31949103 31949152 1000 . . gene_id "CUFF.77316"; transcript_id "CUFF.77316.1"; FPKM "85.5634278330"; frac "1.000000"; conf_lo "28.521143"; conf_hi "142.605713"; cov "4.860000"; seq "GGATGCTATCCGCGATGTGCATGTAAAGGGCCTCATGTACCAGTGGATCG";
+chr14 Cufflinks exon 67604227 67604668 1000 . . gene_id "CUFF.81446"; transcript_id "CUFF.81446.1"; exon_number "1"; FPKM "123.6776546104"; frac "1.000000"; conf_lo "100.611653"; conf_hi "146.743656"; cov "7.024887"; seq "TTCACCGTGAGAGTTTTCTCCATTTCACTCTTCACTGTGCTGTTCTCTGTGCCGCTTTCCTCTTGACTTATAAACATCTGAGCCAGTTTTCAATAAACTTAAAACGAAGCCTGCTTCTCATCCCAAATTGTAAACAGGAATAAAGCTTTTTAAACCTTATCTTAAATTTTAACTTTGTTGAATTCTGCTTTGTGATAGGACAATCTGTTTCACCCAACAAGAATCTGTGTAGGAGGATGAACATCCCGCATGTTGGAGCTGCAAATCAGCACTGTACAAGCTCACTGATGGACAGCTGTTCTGTGATGTATTCCATGATTTTACTAATACTTTCAAAAATGGCAAAACTAACTTCAGTTTTAATGTTGAAAGAAAATCATAAATGTTCCCATAGTTCAATGGCACTGTCGATGAAACTGCTACTGAATTTAGAGAGAAAACG";
+chr14 Cufflinks exon 75165582 75165744 1000 . . gene_id "CUFF.82088"; transcript_id "CUFF.82088.1"; exon_number "1"; FPKM "20.4139057543"; frac "1.000000"; conf_lo "4.982443"; conf_hi "35.845368"; cov "1.159509"; seq "ggccctgggatgataTAACAGAAGAGTCTAAAGGAGGCTTCTGAGATGTGCAGTAGGAAAGCCTGGCACATAATAGGTTATTATCTAAATCCCTTCACTACTCTTCAAAGACAGCAGGATGCCTCTGCTCCCATGTTTTATCTCTACTTATGTGGAATTTATG";
+chr16 Cufflinks transcript 57154027 57154067 1000 . . gene_id "CUFF.103364"; transcript_id "CUFF.103364.1"; FPKM "162.3154457537"; frac "1.000000"; conf_lo "75.554191"; conf_hi "249.076701"; cov "9.219512"; seq "GTTGAGGTTTATTTAAGTAAAATGATTTTTTAAAAAAGCAA";
+chr16 Cufflinks exon 74862302 74862560 1000 . . gene_id "CUFF.105450"; transcript_id "CUFF.105450.1"; exon_number "1"; FPKM "11.0120241741"; frac "1.000000"; conf_lo "2.020744"; conf_hi "20.003304"; cov "0.625483"; seq "GCATTGGCAGCAGATATTGGTACCCAGTGGCACTGCAGAGTACTTACAATCAGGACTCGCTACTGTGCTTCATTCTGCTTTTCTCTCTGCTTCTATTACAGTTAAAGTGTTGCTAATTATAGAAACTCTCTGTTTATTGAACCTCGGTGTTAAGAAAAACTTGTAATCTTCAGATATGATCCGAAAGATTCCCAAACAAATGTAACAAGGTCCACTTTTGTAGCCCTTTCTACCAGAAcactggttatcaacctgtggg";
+chr16 Cufflinks transcript 98168779 98168914 1000 . . gene_id "CUFF.107834"; transcript_id "CUFF.107834.1"; FPKM "24.4666664555"; frac "1.000000"; conf_lo "5.971605"; conf_hi "42.961728"; cov "1.389706"; seq "CCTATTTATTTCACTAAACATCTGCCTGCTAGCTGAGATAAACATTCTCTAAAAAACTGTTTACTGCAAAAAGTGATTACTGTTTTTTATTAGTTTCTTAGCATTTGAAATAGTTACATGAATGGAAGGATAGAGT";
+chr17 Cufflinks exon 8483212 8483268 1000 . . gene_id "CUFF.108498"; transcript_id "CUFF.108498.1"; exon_number "1"; FPKM "50.0370923000"; frac "1.000000"; conf_lo "9.181978"; conf_hi "90.892207"; cov "2.842105"; seq "AGACTTGTCAACAGCTCACCCAATGATGGAACTGAGGCTGCCCCTCAAGTGGCCAGA";
+chr17 Cufflinks exon 30355791 30355913 1000 . . gene_id "CUFF.111759"; transcript_id "CUFF.111759.1"; exon_number "1"; FPKM "19.3232673516"; frac "1.000000"; conf_lo "2.040012"; conf_hi "36.606523"; cov "1.097561"; seq "atctcatacccataagctcagaactcggggtggtaacataggaggactgccatgagtgtgactaacctgggctataggaggaggatctaccttaagcaaatgaCCAACAAAACTAACAAGCTC";
+chr18 Cufflinks transcript 39571718 39571880 1000 . . gene_id "CUFF.123569"; transcript_id "CUFF.123569.1"; FPKM "20.4139057543"; frac "1.000000"; conf_lo "4.982443"; conf_hi "35.845368"; cov "1.159509"; seq "TATAACATTCCATAAATGTACAATAATCTATTTTTGAGAAGCTCATTTTGAAACTTAACACTGTCATTGATAATCTTCAAGTGGTATTTCTTAGGCACCATAAATTTCACATCCAGCTGGGTTACAATTATTTTAAAGTACTTTGAGACCAATTTAAACCATT";
+chr19 Cufflinks exon 17633088 17633203 1000 . . gene_id "CUFF.131333"; transcript_id "CUFF.131333.1"; exon_number "1"; FPKM "20.4893265884"; frac "1.000000"; conf_lo "2.163116"; conf_hi "38.815537"; cov "1.163793"; seq "TGGGAAATGAACTGCATGGCAATGAACCCCAGGGAATTTGGTGGTTAATTGTCTAAGGATAAGGACATCAGTTTTGTCTTTTGCATCACTGTGACCTTTGCCTCTAATTGTATAGA";
+chr19 Cufflinks transcript 41997624 41997859 1000 . . gene_id "CUFF.133569"; transcript_id "CUFF.133569.1"; FPKM "28.1988698132"; frac "1.000000"; conf_lo "13.125940"; conf_hi "43.271800"; cov "1.601695"; seq "gctacacaacgactcacatagagggaagcaggcacacatcagataaaacacAAAAGGATGGGTTGGTGATGGGCATAGTTAATGAGGGCCACTAGGTAAATACACCTGATCCAAAAGTCACGCTACTACTTAGATTCTTCTCTCTGCTAAAGACAACAGAAgacatgttagccatgcttgtaatccctgcattggggagatggagtcagaaatatcactgcaagttcacccaatag";
+chr19 Cufflinks exon 56516515 56516684 1000 . . gene_id "CUFF.135203"; transcript_id "CUFF.135203.1"; exon_number "1"; FPKM "33.5542854247"; frac "1.000000"; conf_lo "14.181710"; conf_hi "52.926861"; cov "1.905882"; seq "TGTATTCATTCACTATTCACTGATTTGTCAGATCATCCATCCACACAGGTGCTGAAGAGTAACCCATTTCACTTTGTATACAAGATAATGTTTTTGTACTTCAAATACATCTGGAATTCTTTCAAATATTCCAAGATTTTTTTTTTTTCTGAATAATCTTTGGTTACCTC";
+chr2 Cufflinks transcript 4543774 4543977 1000 . . gene_id "CUFF.136435"; transcript_id "CUFF.136435.1"; FPKM "37.2825393608"; frac "1.000000"; conf_lo "18.641270"; conf_hi "55.923809"; cov "2.117647"; seq "gagccatttctccagccccTTTATGTGGAATATTAACAAGAGAAGACAACATAAAATGACTTACCATGCTGTGTGGCCTAACAGTGGATGAAGAATGAGTGATTTGGGCATTTCTGATAGTATTTATAAAGAAGACTTTTATGACCAAACCACATGTCACAGTAGGGATTTGCTGCACATCTTATGAGAGTTTCTTCTTTGTCA";
+chr2 Cufflinks transcript 30200331 30200938 1000 . . gene_id "CUFF.140289"; transcript_id "CUFF.140289.1"; FPKM "100.0741846001"; frac "1.000000"; conf_lo "82.383401"; conf_hi "117.764968"; cov "5.684211"; seq "CGCACACAAAGGATTTATTTGCCAGAGAGCAAGCAGACAGGCAGAGGTCAGAATGTTAGTTAGAAACTGAAGGAATGACTGCTGTAGCCACTGTGCCCAGCCAGAGCCATGAGGGAAGTGGGAGGCAGCACTTGGTGCTGCTGCTCTGGCTGACCCTTCTGGTTTCCTGCCACACTCCTAGCCCTGCCTGTGTGCTGCTGTCCCCCTCAACCTTCCACAGCCAGAAGGCAGATGTTCTTTCATGCCAAGAGCATCCATCCCCAGCATATCCTGGGCCCATGGTGGTGTCAAATGTAGTGACCCTTCTGCCTTAAGGGAGCTGGGAAGCCTGGGGTGTGCAGGGTTGCAGGTCAGAAGCAGGACTAGCAGAGGGGCCTGGGGCCATTCTGTCTTGTGGGCTCTTTAATAGCTGAATGACGGGCACAGCCAGAAAAGGGTTAGGTCCCTTATCCTAAGCAGCTCTGTGGCCAGCAGACGACTCTAAGTGGCAGAGCCTGGGAAGGGGCTGCTTAGCTGAGAAGTTCCAGGTAGGTGACAGGAACCTTGCCCTTCTTGTTGCCTCTCTCACCAATGAGCCAGTCGGGATCCATGCCTGGCAGGCTGTAGAC";
+chr2 Cufflinks transcript 106644220 106644341 1000 . . gene_id "CUFF.148977"; transcript_id "CUFF.148977.1"; FPKM "27.2743167045"; frac "1.000000"; conf_lo "6.656871"; conf_hi "47.891762"; cov "1.549180"; seq "attcttaaggtaaatacctaggagtgatgtaacccagtcatagggaagaactacttttaatttgttgagcaacccccaacctgattttgacacaggtttgagtagtttacacttctactaac";
+chr2 Cufflinks exon 125388931 125389219 1000 . . gene_id "CUFF.151331"; transcript_id "CUFF.151331.1"; exon_number "1"; FPKM "23.0274507817"; frac "1.000000"; conf_lo "10.718761"; conf_hi "35.336141"; cov "1.307958"; seq "AGAGCACACAGCACATCACTTAGGCCTCCAACATTAAGGCAGCGCAAGTGCCTCAAGTAACTGAGAATACTTTACTCAGATACAAGGGTATCAAAAACATGAGAACTGGCAGGAAGACCTCACAATGGTTTGTTAGCATCAAGTATTACCATCCAGTTTCCTGTTTAAATAGTAATTAATGACTATTCTGAAATAAGGCAAATAATTACTCAAGCGGGCTGTCAAAGCCACTATCCTGTTGGCTGGGCATCGGAGCAGTTAACTTTATCAAAGGCTTCTGACACAATGA";
+chr3 Cufflinks transcript 130936639 130936898 1000 . . gene_id "CUFF.171349"; transcript_id "CUFF.171349.1"; FPKM "20.1110620975"; frac "1.000000"; conf_lo "7.983635"; conf_hi "32.238489"; cov "1.142308"; seq "CGAGGCTGCAGGCTGCAAATGTTCCCAGGCAGGCAAGACCTCACGTCCTACTGGCTGCTGCCCTTGGGTGCATCTGTAGGCCCCGTGGCTCCTGCCCCTGGGGTTCAACACCGATAAACATAGAATACTCATTTTCAGAAGACCTGAGGGAATGAGTCTAAGCAACGCTTTTTACAAAAAGTGGCAAGGTTCAGGAAAAAAAAAAAAAAAGATGTTGCTCCAAGGCACCAAGGGTGTAATTTTTTTTCAGAAAAAGTCAG";
+chr3 Cufflinks exon 136592671 136592771 1000 . . gene_id "CUFF.171861"; transcript_id "CUFF.171861.1"; exon_number "1"; FPKM "32.9452142371"; frac "1.000000"; conf_lo "8.040973"; conf_hi "57.849455"; cov "1.871287"; seq "TGTCAGCCCATCACATTTTAGTGACAACAGTCATAGCCTTTATTTTCAGATGACTTTCCTCTAAAACCACTGTCTATGAGTTGCCCCCCAAAACTCAAAAA";
+chr3 Cufflinks transcript 152861374 152861508 1000 . . gene_id "CUFF.173007"; transcript_id "CUFF.173007.1"; FPKM "24.6479010219"; frac "1.000000"; conf_lo "6.015839"; conf_hi "43.279963"; cov "1.400000"; seq "ATCAAAAGCGACATGCAAGCATCTTGCTCTCACCACAGATCACTGAGACATTAAGAGTGACGTCTCTTGAACTGTTGGCACGCCTAAGTTATTTCAGCATTTCTTGCTCAGCAGTTGTTCTCTTGGCTTCCTCTG";
+chr4 Cufflinks exon 13715310 13715630 1000 . . gene_id "CUFF.174817"; transcript_id "CUFF.174817.1"; exon_number "1"; FPKM "19.2510308382"; frac "1.000000"; conf_lo "8.572480"; conf_hi "29.929581"; cov "1.093458"; seq "AACACATGGCCACATCATGTGATATTTTCAAAACACTTACACATAGCTTTGAGAAGGTCCCTGCAGGAATGATCCATCCTCTCACAGTTGGCCCATTTTTTAACAGCATATCTGCATTTTCCATTTAGGAGAGCTATATATTATTAGCTTACATTTTTGGGTAGTAAAACAGTGCATTGCTGATTGTAAAACATGGACTTTATTATCTGCTGAAAATTGATTTGGCATTTATAGCCACTGTGTATTAGACTGTTTTTCTGTTTTTAACATCAATGCTTAAAAGCGATGATTTGTGTTTaaaaaaattaaaaaaataaaata";
+chr4 Cufflinks exon 147515029 147515097 1000 . . gene_id "CUFF.190627"; transcript_id "CUFF.190627.1"; exon_number "1"; FPKM "34.4458244094"; frac "1.000000"; conf_lo "3.636542"; conf_hi "65.255106"; cov "1.956522"; seq "GCTGACGTGCTCTCCGAGTTCCTGGAGGTGGCCGTGCACCTGATTCTCTATGTGCGCGAGGTCTACCCG";
+chr5 Cufflinks exon 3949522 3949685 1000 . . gene_id "CUFF.192485"; transcript_id "CUFF.192485.1"; exon_number "1"; FPKM "23.1879208220"; frac "1.000000"; conf_lo "6.791585"; conf_hi "39.584257"; cov "1.317073"; seq "AGTCCCAACCACCCCCTTGTTTAATGTATAACTTTCTGAAATGGGAGCGTTAGAATGGATTAAAATGGTTGGTAGGTGGTTGGATCACCAACCAAGACCAGAAATAGAGGGGTAGGCTGCTCAGGAGAGTATTGGGAGGGTAGCTATTATTTGCATTTTGTGCT";
+chr5 Cufflinks transcript 68089694 68089831 1000 . . gene_id "CUFF.199409"; transcript_id "CUFF.199409.1"; FPKM "17.2229122047"; frac "1.000000"; conf_lo "1.818271"; conf_hi "32.627553"; cov "0.978261"; seq "CAATGATAGAGAAGACTAAAATAAAAGCAGGCATGCTGGCACAAGCGACAGAAGGAAAAAGCCTCACCCGGCCCTGTTTGAGGCCACTCCTGGTGGCTCCTTTTCCAAGGACCATGCGGTCAAGCCTCTGAGTTGTTC";
+chr5 Cufflinks exon 122819526 122819619 1000 . . gene_id "CUFF.205487"; transcript_id "CUFF.205487.1"; exon_number "1"; FPKM "25.2486782797"; frac "1.000000"; conf_lo "2.649470"; conf_hi "47.847887"; cov "1.434124"; seq "CTTTAGAAAAGATGCATCTGTCATTGATTTAGGGATATGAATTGTTTGGATTTGAGTAGTTTTCCATAACTCCTGCAGTTTGGCAATGTGTGCG";
+chr5 Cufflinks transcript 145619548 145619710 1000 . . gene_id "CUFF.209965"; transcript_id "CUFF.209965.1"; FPKM "40.8278115086"; frac "1.000000"; conf_lo "19.004428"; conf_hi "62.651195"; cov "2.319018"; seq "CGGCGTTCTGAAAACTGTGCTCCGGGATGAGATCATTGCTTGGCACAAAAAGACACAGGAGGACACTTCCTCTCCACTGTCGGCCGCAGGGCAGCCTGAGAACATGGACAGCCAGCAGCTGGTTTCCTTAGTTCAGAAAGCCGTCACTGCCATCATGACCCGC";
+chr6 Cufflinks exon 83928984 83929105 1000 . . gene_id "CUFF.219317"; transcript_id "CUFF.219317.1"; exon_number "1"; FPKM "46.7559714935"; frac "1.000000"; conf_lo "19.761399"; conf_hi "73.750544"; cov "2.655738"; seq "ACAGGAACCATTATTTACATTTAATTTGGATGAATTTGTTACTGTGGATGAAGTCATAGAAGAAGTAAATCCTTCTCAAGCCAAGCAGAATCCATTAAAAGGAAAAAGAAAGGAAGCCCTCA";
+chr6 Cufflinks exon 118857949 118858148 1000 . . gene_id "CUFF.223543"; transcript_id "CUFF.223543.1"; exon_number "1"; FPKM "19.0140950740"; frac "1.000000"; conf_lo "5.569100"; conf_hi "32.459091"; cov "1.080000"; seq "CCAGGCTTGCTAGTTGGTGCAGTTAGCTACATCTCAGGACAGAGACAAGGTACTCTGAGCTCCCCTTGAACTGCCACACAAGCTGTCTCCTGGATGCCAAGCAGAGAAACCTGGAGACAACAATCATCATACTCAAAACCAGGATCTCTTTCTTAAGACTTTTGTATTTTGTCCCAGCCCTAACCCTGAGTTCTGCTGAA";
+chr7 Cufflinks transcript 85554210 85554343 1000 . . gene_id "CUFF.235778"; transcript_id "CUFF.235778.1"; FPKM "17.7370289869"; frac "1.000000"; conf_lo "1.872548"; conf_hi "33.601510"; cov "1.007463"; seq "GTGAAACATCATGCTTCTGCATCAAGTTATTAGTGGGAAACCTGTAAAAGTTGACATTGAATGCTGATAACAAATTACTTTCATCCTGTCTCATAATGAATCCTACATCAAGACAAGGCAAGTGAGAAAGAGGG";
+chr7 Cufflinks exon 104055491 104055589 1000 . . gene_id "CUFF.238474"; transcript_id "CUFF.238474.1"; exon_number "1"; FPKM "28.8092349606"; frac "1.000000"; conf_lo "5.286593"; conf_hi "52.331877"; cov "1.636364"; seq "ACATTTCTCCTCTCTTGGGGGAGCGCATCTCCTTGGGTGTGTCCACATCCGCCCCTAGGTACCCAGTGTGATGTGAGACACGAGTGTCTGTGCTAACTT";
+chr8 Cufflinks exon 9970398 9970545 1000 . . gene_id "CUFF.245320"; transcript_id "CUFF.245320.1"; exon_number "1"; FPKM "22.4828826889"; frac "1.000000"; conf_lo "5.487421"; conf_hi "39.478345"; cov "1.277027"; seq "AGTCTTCACCAAAATTAAGTCTCAGCTAACTTAAAAGTTGCAAGGATTTTTTTCAATAAAATTAATATCTTAAGTGTTTGGTGTTTAGATGATTCTCTCTCAACTTCCCCCACATTATCAAAAAACATTTGATGAACCTTAAAAACTC";
+chr9 Cufflinks transcript 20449846 20449932 1000 . . gene_id "CUFF.260747"; transcript_id "CUFF.260747.1"; FPKM "234.9313045507"; frac "1.000000"; conf_lo "163.275950"; conf_hi "306.586659"; cov "13.344091"; seq "CCAGCACCGATGACACCATCGGCGACTTGAAGAAACTGATAGCTGCTCAAACTGGCACCCGCTGGAACAAGATCGTTCTTAAAAAGT";
+chr9 Cufflinks exon 107445870 107445930 1000 . . gene_id "CUFF.272761"; transcript_id "CUFF.272761.1"; exon_number "1"; FPKM "38.9633095779"; frac "1.000000"; conf_lo "4.113466"; conf_hi "73.813153"; cov "2.213115"; seq "CAAGCAGAAGCTGGTGCCCATCATGACCATCCTGCTGGAAGAGCTGAATGCCTCCGGCCGC";
+chr9 Cufflinks transcript 120860476 120860606 1000 . . gene_id "CUFF.275115"; transcript_id "CUFF.275115.1"; FPKM "25.4005086867"; frac "1.000000"; conf_lo "6.199529"; conf_hi "44.601488"; cov "1.442748"; seq "CTGCCATTGTACGCACCATGCAGAATACAAATGATGTAGAGACAGCTCGTTGTACTGCTGGGACTCTGCACAACCTTTCTCACCACCGCGAGGGCTTGCTGGCCATCTTTAAGTCTGGTGGCATCCCAGCG";
+chrX Cufflinks exon 10274057 10274087 1000 . . gene_id "CUFF.276147"; transcript_id "CUFF.276147.1"; exon_number "1"; FPKM "99.5432248142"; frac "1.000000"; conf_lo "21.405127"; conf_hi "177.681323"; cov "5.654052"; seq "ACTTCGCTGTCATCATTTGTACAAACTCTTT";
+chrX Cufflinks transcript 39881431 39881678 1000 . . gene_id "CUFF.277419"; transcript_id "CUFF.277419.1"; FPKM "42.1683560109"; frac "1.000000"; conf_lo "24.187709"; conf_hi "60.149003"; cov "2.395161"; seq "AGCTAAAAAGAGTCCTTTTCTGACAGAAAGGCTGGACTTCTCCTTTTCACCGTTTCTCTTACTGATGCTTTTGCCAGAAGAACAGTAAAGATTTAGACACTGTCATGATTCATACACGTAAAATATTTTTCAAGGACACAATCTGATATACTAACATTTATTTAAGAGGTTAAAGTCCACCACTAAATCTAAGGAAAGATTTTTAACTGCCAAACACATTTCCTTTGACAAATAATGTAAGATGACAA";
+chrX Cufflinks transcript 148249672 148249713 1000 . . gene_id "CUFF.282847"; transcript_id "CUFF.282847.1"; FPKM "56.5895686726"; frac "1.000000"; conf_lo "5.974320"; conf_hi "107.204818"; cov "3.214286"; seq "AATGCTAGTATGAACAGTGGGAGGAATGAGCAAAATGTTACA";
+chrX Cufflinks transcript 148481505 148482455 1000 + . gene_id "CUFF.282965"; transcript_id "CUFF.282965.1"; FPKM "40.1706233958"; frac "1.000000"; conf_lo "16.978103"; conf_hi "63.363144"; cov "2.281690"; seq "CGCCACAACCTGCTACAGGCCTGTAAGATGCAGGACATCAAACTGCCACTGTCAAAGGGCACCATGGATGATATTAGTCAGGAAGAAGTGAGTATTATGGTGGGTGGTAGGAGTCATCTATGAATATTTAACCAGTAATGGGAGATTACAGATGGCCAGGAAGGGCAGGCAACAGATAGGACCACATAGAGTTGTGAGGGGCATAAAGATGGATGCAGAAGAAATGTGGCAAGGTGGAAGTAGTGAAGTCAGGCTTTGGTATGAGAGAGACATTGATTTGAGAGGAGAGCTGCAAGCCAGTGAGTACTCAGAAAGACCAAGAATGGGTCATTAATCTTAAGGATTTGAGCTCTTAGCTGCAGCAGATACTGGGCATGGGTAGGAGTGAGAATTGAGGAGCAGAGGAAGATGGGAAACTGGAGAACCTAAGGAGACTGATAGCTTAGCTGCAGTAAGGGAGGTTGGCCAGAAGAGGGTTGGGTAGGGGACTCAGCAAGGCAGAACTAAGGAAGCTTAGGTGGAGGGGAAGGAACAACATCTGAGCAACTAAAGCACTCTATCAACTGGAAGTGCAAGATGGTAGTGAGGGGTGGACAGGTGTAACTGAGTAACTCTTTGTAGGTAGCCTTTCAGTTTAATTCAGTAAAATATTTTGAACACTAGTATTCCAGATACTGGTAGGCCATGACTTAACCATTCCTAATGTTAATCTCAGCTGTGCTAGCTGAGCTTGTGTTCACATTAGACATGAAGAAACTTAGTAAAAGGTAGAGCCCAGTTTTCGGTTTGGACCTTCCTGTTGGCCTCTGCTTCCGTGCCATCTAGCAAAGGAGTTCCTAATCTCTAGAGGGATACAAATGACTAGTCTGCTCCATCTGCCTCTTCCAACATTGCAGGGTAGCTCCCAGGGAGAAGAGTCAGTGAGTGGTTCCCAGAGAACATCCAGTATCT";
--- a/lib/galaxy/tools/util/gff_util.py
+++ b/lib/galaxy/tools/util/gff_util.py
@@ -6,23 +6,37 @@ from bx.intervals.io import NiceReaderWr
class GFFReaderWrapper( NiceReaderWrapper ):
"""
- Reader wrapper converts GFF format--starting and ending coordinates are 1-based, closed--to the 'traditional' interval format--0 based,
- half-open. This is useful when using GFF files as inputs to tools that expect traditional interval format.
+ Reader wrapper converts GFF format--starting and ending coordinates are 1-based, closed--to the
+ 'traditional'/BED interval format--0 based, half-open. This is useful when using GFF files as inputs
+ to tools that expect traditional interval format.
"""
def parse_row( self, line ):
- interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, self.end_col, self.strand_col, self.default_strand, fix_strand=self.fix_strand )
- # Change from 1-based to 0-based format.
- interval.start -= 1
- # Add 1 to end to move from closed to open format for end coordinate.
- interval.end += 1
+ interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, self.end_col, \
+ self.strand_col, self.default_strand, fix_strand=self.fix_strand )
+ interval = convert_gff_coords_to_bed( interval )
return interval
-def convert_to_gff_coordinates( interval ):
+def convert_bed_coords_to_gff( interval ):
"""
- Converts a GenomicInterval's coordinates to GFF format.
+ Converts an interval object's coordinates from BED format to GFF format. Accepted object types include
+ GenomicInterval and list (where the first element in the list is the interval's start, and the second
+ element is the interval's end).
"""
if type( interval ) is GenomicInterval:
interval.start += 1
- interval.end -= 1
- return interval
+ elif type ( interval ) is list:
+ interval[ 0 ] += 1
return interval
+
+def convert_gff_coords_to_bed( interval ):
+ """
+ Converts an interval object's coordinates from GFF format to BED format. Accepted object types include
+ GenomicInterval and list (where the first element in the list is the interval's start, and the second
+ element is the interval's end).
+ """
+ if type( interval ) is GenomicInterval:
+ interval.start -= 1
+ elif type ( interval ) is list:
+ interval[ 0 ] -= 1
+ return interval
+
More information about the galaxy-commits
mailing list