Skip to content

Commit

Permalink
Merge pull request galaxyproject#2683 from mvdbeek/test_twobit_data_manager
Browse files Browse the repository at this point in the history

Fix tests for data_manager_twobit_builder and python 3 compatibility
  • Loading branch information
bgruening authored and mvdbeek committed Nov 20, 2019
1 parent 8e79c16 commit 30456c9
Show file tree
Hide file tree
Showing 10 changed files with 172 additions and 34 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
CHUNK_SIZE = 2**20 # 1mb


def get_id_name( params, dbkey, fasta_description=None):
def get_id_name(params, dbkey, fasta_description=None):
# TODO: ensure sequence_id is unique and does not already appear in location file
sequence_id = params['param_dict']['sequence_id']
if not sequence_id:
Expand All @@ -26,73 +26,73 @@ def get_id_name( params, dbkey, fasta_description=None):
return sequence_id, sequence_name


def build_twobit( data_manager_dict, fasta_filename, params, target_directory, dbkey, sequence_id, sequence_name ):
twobit_base_name = "%s.2bit" % ( sequence_id )
twobit_filename = os.path.join( target_directory, twobit_base_name )
def build_twobit(data_manager_dict, fasta_filename, params, target_directory, dbkey, sequence_id, sequence_name):
twobit_base_name = "%s.2bit" % (sequence_id)
twobit_filename = os.path.join(target_directory, twobit_base_name)

args = [ 'faToTwoBit', fasta_filename, twobit_filename ]
tmp_stderr = tempfile.NamedTemporaryFile( prefix="tmp-data-manager-twobit-builder-stderr" )
proc = subprocess.Popen( args=args, shell=False, cwd=target_directory, stderr=tmp_stderr.fileno() )
args = ['faToTwoBit', fasta_filename, twobit_filename]
tmp_stderr = tempfile.NamedTemporaryFile(prefix="tmp-data-manager-twobit-builder-stderr")
proc = subprocess.Popen(args=args, shell=False, cwd=target_directory, stderr=tmp_stderr.fileno())
return_code = proc.wait()
if return_code:
tmp_stderr.flush()
tmp_stderr.seek(0)
print("Error building index:", file=sys.stderr)
while True:
chunk = tmp_stderr.read( CHUNK_SIZE )
chunk = tmp_stderr.read(CHUNK_SIZE)
if not chunk:
break
sys.stderr.write( chunk )
sys.exit( return_code )
sys.stderr.write(chunk)
sys.exit(return_code)
tmp_stderr.close()
# lastz_seqs
data_table_entry = dict( value=sequence_id, name=sequence_name, path=twobit_base_name )
data_table_entry = dict(value=sequence_id, name=sequence_name, path=twobit_base_name)

_add_data_table_entry( data_manager_dict, "lastz_seqs", data_table_entry )
_add_data_table_entry(data_manager_dict, "lastz_seqs", data_table_entry)
# twobit.loc
data_table_entry = dict( value=sequence_id, path=twobit_base_name )
data_table_entry = dict(value=sequence_id, path=twobit_base_name)

_add_data_table_entry( data_manager_dict, "twobit", data_table_entry )
_add_data_table_entry(data_manager_dict, "twobit", data_table_entry)
# alignseq
data_table_entry = dict( type="seq", value=sequence_id, path=twobit_base_name )
data_table_entry = dict(type="seq", value=sequence_id, path=twobit_base_name)

_add_data_table_entry( data_manager_dict, "alignseq_seq", data_table_entry )
_add_data_table_entry(data_manager_dict, "alignseq_seq", data_table_entry)


def _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ):
data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} )
data_manager_dict['data_tables'][ data_table_name ] = data_manager_dict['data_tables'].get( data_table_name, [] )
data_manager_dict['data_tables'][ data_table_name ].append( data_table_entry )
def _add_data_table_entry(data_manager_dict, data_table_name, data_table_entry):
data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get(data_table_name, [])
data_manager_dict['data_tables'][data_table_name].append(data_table_entry)
return data_manager_dict


def main():
parser = optparse.OptionParser()
parser.add_option( '-f', '--fasta_filename', dest='fasta_filename', action='store', type="string", default=None, help='fasta_filename' )
parser.add_option( '-d', '--fasta_dbkey', dest='fasta_dbkey', action='store', type="string", default=None, help='fasta_dbkey' )
parser.add_option( '-t', '--fasta_description', dest='fasta_description', action='store', type="string", default=None, help='fasta_description' )
parser.add_option('-f', '--fasta_filename', dest='fasta_filename', action='store', type="string", default=None, help='fasta_filename')
parser.add_option('-d', '--fasta_dbkey', dest='fasta_dbkey', action='store', type="string", default=None, help='fasta_dbkey')
parser.add_option('-t', '--fasta_description', dest='fasta_description', action='store', type="string", default=None, help='fasta_description')
(options, args) = parser.parse_args()

filename = args[0]

params = loads( open( filename ).read() )
params = loads(open(filename).read())

target_directory = params[ 'output_data' ][0]['extra_files_path']
os.mkdir( target_directory )
target_directory = params['output_data'][0]['extra_files_path']
os.mkdir(target_directory)
data_manager_dict = {}

dbkey = options.fasta_dbkey

if dbkey in [ None, '', '?' ]:
raise Exception( '"%s" is not a valid dbkey. You must specify a valid dbkey.' % ( dbkey ) )
if dbkey in [None, '', '?']:
raise Exception('"%s" is not a valid dbkey. You must specify a valid dbkey.' % (dbkey))

sequence_id, sequence_name = get_id_name( params, dbkey=dbkey, fasta_description=options.fasta_description )
sequence_id, sequence_name = get_id_name(params, dbkey=dbkey, fasta_description=options.fasta_description)

# build the index
build_twobit( data_manager_dict, options.fasta_filename, params, target_directory, dbkey, sequence_id, sequence_name )
build_twobit(data_manager_dict, options.fasta_filename, params, target_directory, dbkey, sequence_id, sequence_name)

# save info to json file
open( filename, 'wb' ).write( dumps( data_manager_dict ) )
open(filename, 'w').write(dumps(data_manager_dict, sort_keys=True))


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<tool id="twobit_builder_data_manager" name="TwoBit" tool_type="manage_data" version="0.0.2">
<tool id="twobit_builder_data_manager" name="TwoBit" tool_type="manage_data" version="0.0.3">
<requirements>
<requirement type="package" version="324">ucsc-fatotwobit</requirement>
<requirement type="package" version="377">ucsc-fatotwobit</requirement>
</requirements>
<description>builder</description>
<command detect_errors="exit_code"><![CDATA[
Expand All @@ -25,7 +25,7 @@
<test>
<param name="all_fasta_source" value="sacCer2"/>
<param name="sequence_name" value=""/>
<output name="sequence_id" file="data_manager_two_bit_out_1.dat"/>
<output name="out_file" file="data_manager_twobit.json"/>
</test>
</tests>

Expand Down
57 changes: 57 additions & 0 deletions data_managers/data_manager_twobit_builder/test-data/alignseq.loc
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#This is a sample file distributed with Galaxy that enables tools
#to use alignment data stored as axt files (lines starting with "align")
#or nib files (lines starting with "seq"). You will need to index
#them and then create an alignseq.loc file similar to this one (store
#it in this directory) that points to the directories in which those
#alignments are stored. The "align" data referred to by the alignseq.loc
#file has this format (white space characters are TAB characters):
#
#align <build1> <build2> <dir>
#
#So, for example, if you had hg18/bosTau2 alignment files stored in
#/depot/data2/galaxy/hg18/align/bosTau2, then the alignseq.loc entry
#would look like this:
#
#align hg18 bosTau2 /depot/data2/galaxy/hg18/align/bosTau2
#
#and your /depot/data2/galaxy/hg18/align/bosTau2 directory would
#contain all of your alignment files (e.g.):
#
#-rw-rw-r-- 1 nate galaxy 151842783 2006-01-08 01:00 chr10.axt
#-rw-rw-r-- 1 nate galaxy 79575 2006-01-08 01:00 chr10_random.axt
#-rw-rw-r-- 1 nate galaxy 155015634 2006-01-08 01:01 chr11.axt
#...etc...
#
#Your alignseq.loc file should include an entry per line for each alignment
#file you have stored. For example:
#
#align anoGam1 dm1 /depot/data2/galaxy/anoGam1/align/dm1
#align anoGam1 dm2 /depot/data2/galaxy/anoGam1/align/dm2
#align canFam1 hg17 /depot/data2/galaxy/canFam1/align/hg17
#...etc...
#
#The "seq" data referred to by the alignseq.loc file has this
#format (white space characters are TAB characters):
#
#seq <build1> <dir>
#
#So, for example, if you had anoGam1 sequence files stored in
#/depot/data2/galaxy/anoGam1/seq, then the alignseq.loc entry
#would look like this:
#
#seq anoGam1 /depot/data2/galaxy/anoGam1/seq
#and your /depot/data2/galaxy/anoGam1/seq directory would
#contain all of your sequence files (e.g.):
#
#-rw-rw-r-- 1 nate galaxy 24397551 2006-06-26 12:51 chr2L.nib
#-rw-rw-r-- 1 nate galaxy 31362964 2006-06-26 12:51 chr2R.nib
#-rw-rw-r-- 1 nate galaxy 20642013 2006-06-26 12:51 chr3L.nib
#-rw-rw-r-- 1 nate galaxy 26636071 2006-06-26 12:51 chr3R.nib
#
#Your alignseq.loc file should include an entry per line for each sequence
#file you have stored. For example:
#
#seq anoGam1 /depot/data2/galaxy/anoGam1/seq
#seq bosTau2 /depot/data2/galaxy/bosTau2/seq
#seq bosTau3 /depot/data2/galaxy/bosTau3/seq
#...etc...
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
sacCer2 sacCer2 sacCer2 ${__HERE__}/sacCer2.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"data_tables": {"alignseq_seq": [{"path": "sacCer2.2bit", "type": "seq", "value": "sacCer2"}], "lastz_seqs": [{"name": "sacCer2", "path": "sacCer2.2bit", "value": "sacCer2"}], "twobit": [{"path": "sacCer2.2bit", "value": "sacCer2"}]}}
30 changes: 30 additions & 0 deletions data_managers/data_manager_twobit_builder/test-data/lastz_seqs.loc
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#This is a sample file distributed with Galaxy that enables tools
#to use a directory of 2bit genome files for use with Lastz. You will
#need to supply these files and then create a lastz_seqs.loc file
#similar to this one (store it in this directory) that points to
#the directories in which those files are stored. The lastz_seqs.loc
#file has this format (white space characters are TAB characters):
#
#<unique_build_id> <display_name> <file_path>
#
#So, for example, if your lastz_seqs.loc began like this:
#
#hg18 Human (Homo sapiens): hg18 /depot/data2/galaxy/twobit/hg18.2bit
#hg19 Human (Homo sapiens): hg19 /depot/data2/galaxy/twobit/hg19.2bit
#mm9 Mouse (Mus musculus): mm9 /depot/data2/galaxy/twobit/mm9.2bit
#
#then your /depot/data2/galaxy/twobit/ directory
#would need to contain the following 2bit files:
#
#-rw-r--r-- 1 james universe 830134 2005-09-13 10:12 hg18.2bit
#-rw-r--r-- 1 james universe 527388 2005-09-13 10:12 hg19.2bit
#-rw-r--r-- 1 james universe 269808 2005-09-13 10:12 mm9.2bit
#
#Your lastz_seqs.loc file should include an entry per line for
#each file you have stored that you want to be available. Note that
#your files should all have the extension '2bit'.
#
#Note that for backwards compatibility with workflows, the unique ID of
#an entry must be the path that was in the original loc file, because that
#is the value stored in the workflow for that parameter.
#
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>chr1
ATGCATCGATCGATCGCATCGACTACGACTACGATCAGTCACTACACTACGTACAGCTACGACTACGACTACGATCGACTACGATCAGCTACGACA
26 changes: 26 additions & 0 deletions data_managers/data_manager_twobit_builder/test-data/twobit.loc
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#This is a sample file distributed with Galaxy that is used by some
#tools. The twobit.loc file has this format (white space characters
#are TAB characters):
#
#<Build> <FullPathToFile>
#
#So, for example, if you had droPer1 twobit files stored in
#/depot/data2/galaxy/droPer1/, then the twobit.loc entry
#would look like this:
#
#droPer1 /depot/data2/galaxy/droPer1/droPer1.2bit
#
#and your /depot/data2/galaxy/droPer1/ directory would
#contain all of your twobit files (e.g.):
#
#-rw-rw-r-- 1 nate galaxy 48972650 2007-05-04 11:27 droPer1.2bit
#...etc...
#
#Your twobit.loc file should include an entry per line for each twobit
#file you have stored. For example:
#
#droPer1 /depot/data2/galaxy/droPer1/droPer1.2bit
#apiMel2 /depot/data2/galaxy/apiMel2/apiMel2.2bit
#droAna1 /depot/data2/galaxy/droAna1/droAna1.2bit
#droAna2 /depot/data2/galaxy/droAna2/droAna2.2bit
#...etc...
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc-->
<tables>
<!-- Locations of all fasta files under genome directory -->
<table name="all_fasta" comment_char="#">
<columns>value, dbkey, name, path</columns>
<file path="${__HERE__}/test-data/all_fasta.loc" />
</table>
<!-- Locations of 2bit sequence files for use in Lastz -->
<table name="lastz_seqs" comment_char="#">
<columns>value, name, path</columns>
<file path="${__HERE__}/test-data/lastz_seqs.loc" />
</table>
<table name="twobit" comment_char="#">
<columns>value, path</columns>
<file path="${__HERE__}/test-data/twobit.loc" />
</table>
<table name="alignseq_seq" comment_char="#">
<columns>type, value, path</columns>
<file path="${__HERE__}/test-data/alignseq.loc" />
</table>
</tables>

0 comments on commit 30456c9

Please sign in to comment.