From 2d48a6ee275f51f6ff221cfd6e17b594ebe06fc9 Mon Sep 17 00:00:00 2001 From: Siavash Mirarab Date: Tue, 4 Mar 2014 17:05:04 -0600 Subject: [PATCH] version 1.4.0 --- README.md | 18 +++++------ resources/scripts/hmmeralign | 60 +++++++++++++++++++++--------------- sate/__init__.py | 2 +- sate/alignment.py | 2 +- sate/mainsate.py | 2 +- 5 files changed, 45 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 00904a7..01bff77 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,7 @@ All questions and inquires should be addressed to our user email group: pasta-us Acknowledgment === -The current version of this code is heavily based on the SATe code (http://phylo.bio.ku.edu/software/sate/sate.html). Refer to sate-doc -directory for documentation of the SATe code. +The current version of this code is heavily based on the SATe code (http://phylo.bio.ku.edu/software/sate/sate.html). Refer to sate-doc directory for documentation of the SATe code. INSTALLATION === @@ -35,7 +34,7 @@ Insallation steps: 2. Clone the PASTA repository from our [github repository](https://github.com/smirarab/pasta). For example you can use `git clone https://github.com/smirarab/pasta.git`. If you don't have git, you can directly download a [zip file from the repository](https://github.com/smirarab/pasta/archive/master.zip) and decompress it into your desired directory. -3. Clone the relevant "tools" directory from the SATe project. Note that there are different repositories for [linux](https://github.com/sate-dev/sate-tools-linux) and [MAC](https://github.com/sate-dev/sate-tools-mac). e.g., you can use `git clone git@github.com:sate-dev/sate-tools-linux.git` on Linux or `git@github.com:sate-dev/sate-tools-mac.git` on MAC. Or you can directly download these as zip files for [Linux](https://github.com/sate-dev/sate-tools-linux/archive/master.zip) or [MAC](https://github.com/sate-dev/sate-tools-mac/archive/master.zip) and decompress them in your target directory for PASTA code. +3. Clone the relevant "tools" directory (these are also forked from the SATe project). Note that there are different repositories for [linux](https://github.com/smirarab/sate-tools-linux) and [MAC](https://github.com/smirarab/sate-tools-mac). e.g., you can use `git clone git@github.com:smirarab/sate-tools-linux.git` on Linux or `git@github.com:smirarab/sate-tools-mac.git` on MAC. Or you can directly download these as zip files for [Linux](https://github.com/smirarab/sate-tools-linux/archive/master.zip) or [MAC](https://github.com/smirarab/sate-tools-mac/archive/master.zip) and decompress them in your target directory for PASTA code. 4. `cd pasta` @@ -73,19 +72,16 @@ python run_pasta_gui.py Starting Trees ------- -NOTE that current version of the PASTA code does NOT compute the starting tree through a -process similar to what is described in the paper. Instead, it simply uses a FastTree on -the input, if input is aligned, or else runs MAFFT on the input to align it, and then runs FastTree. - -The preferred approach for getting the PASTA starting tree is very simple, and is described below: +Since version 1.4.0, PASTA uses the procedure described in the paper for estimating the starting alignment and trees +if none is given. +The PASTA approach for getting the starting tree can be summarized as: 1. Choose a random subset of your sequences (size 100). -2. Get a SATe alignment on this subset (you need to install SATe for this; alternatively just run PASTA on it). +2. Get a MAFFT-linsi alignment on the subset. 3. Build a HMMER model on the alignment of 100 subsets. -4. Use HMMAlign to align the remaining sequences into the small subset. +4. Use hmmalign to align the remaining sequences into the small subset. 5. Run FastTree on the output of step 4. -We do have a separate program that computes this simple starting tree. See https://github.com/smirarab/sepp and use UPP. Make sure you set `-A 100 -P 100` to get the starting tree described in the PASTA paper. LICENSE === diff --git a/resources/scripts/hmmeralign b/resources/scripts/hmmeralign index a71a230..00f9115 100755 --- a/resources/scripts/hmmeralign +++ b/resources/scripts/hmmeralign @@ -5,6 +5,7 @@ import subprocess import re import os import inspect +import shutil def read_stderr(stderrdata,stderr): ''' @@ -19,7 +20,7 @@ def read_stderr(stderrdata,stderr): def log(job, name, stdoutdata, stderrdata, stderrfile): if job.returncode == 0: - print "Finished %s Job with return code: %s\n output: %s [continued]" %(name,job.returncode,stdoutdata[0:50]) + print "Finished %s Job with return code: %s\n output: %s [continued]" %(name,job.returncode,stdoutdata[0:100]) else: print ("Finished %s Job with return code: %s\n output: %s\n error:%s" @@ -88,30 +89,39 @@ if __name__ == "__main__": loc = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) - model_file = "%s.hmm" % backbone_fn - invoc = [os.path.join(loc,'hmmbuild'), "--symfrac" ,"0.0" ,"--%s" % dt, - '--informat', 'afa', model_file , backbone_fn] - buildjob = Popen(invoc, stderr = subprocess.PIPE, stdout = subprocess.PIPE) - (stdoutdata, stderrdata) = buildjob.communicate() - log(buildjob,"hmmbuild", stdoutdata, stderrdata, "") - - query_out_fn = "%s.aligned" %query_fn - invoc = [os.path.join(loc,'hmmalign'),"--%s" % dt, - #"--cpu", cpus, - "--mapali", backbone_fn, - model_file, query_fn] - alignjob = Popen(invoc, stderr = subprocess.PIPE, stdout = subprocess.PIPE) - (stdoutdata, stderrdata) = alignjob.communicate() - log(buildjob,"hmmalign", stdoutdata, stderrdata, "") - - alg = dict() - insertion = read_sto(stdoutdata, alg) - out=open("%s" % out_fn,'w') - for k,v in alg.iteritems(): - out.write(">%s\n%s\n" %(k,v)) - out.close() - - os.remove(model_file) + + if os.stat(query_fn)[6] == 0: + shutil.copy(backbone_fn, out_fn) + else: + model_file = "%s.hmm" % backbone_fn + invoc = [os.path.join(loc,'hmmbuild'), "--symfrac" ,"0.0" ,"--%s" % dt, + '--informat', 'afa', model_file , backbone_fn] + print " ".join(invoc) + buildjob = Popen(invoc, stderr = subprocess.PIPE, stdout = subprocess.PIPE) + (stdoutdata, stderrdata) = buildjob.communicate() + log(buildjob,"hmmbuild", stdoutdata, stderrdata, "") + + query_out_fn = "%s.aligned" %query_fn + invoc = [os.path.join(loc,'hmmalign'),"--%s" % dt, + #"--cpu", cpus, + "--mapali", backbone_fn, + model_file, query_fn] + print " ".join(invoc) + alignjob = Popen(invoc, stderr = subprocess.PIPE, stdout = subprocess.PIPE) + (stdoutdata, stderrdata) = alignjob.communicate() + log(buildjob,"hmmalign", stdoutdata, stderrdata, "") + + alg = dict() + insertion = read_sto(stdoutdata, alg) + out=open("%s" % out_fn,'w') + for k,v in alg.iteritems(): + out.write(">%s\n%s\n" %(k,v)) + out.close() + + if os.stat(out_fn)[6] == 0: + raise Exception("Unknown issue in generating initial alignment. Output alignment is empty. ") + + os.remove(model_file) # remove_insertion_columns(alg, insertion) # out=open("%s.masked" % out_fn,'w') # for k,v in alg.iteritems(): diff --git a/sate/__init__.py b/sate/__init__.py index 25b0120..33c3893 100644 --- a/sate/__init__.py +++ b/sate/__init__.py @@ -12,7 +12,7 @@ PROGRAM_NAME = "PASTA" PROGRAM_AUTHOR = ["Siavash Mirarab","Jiaye Yu", "Mark T. Holder", "Jeet Sukumaran", "Jamie Oaks"] PROGRAM_LICENSE = "GNU General Public License, version 3" -PROGRAM_VERSION = "1.3.0" +PROGRAM_VERSION = "1.4.0" PROGRAM_YEAR = "2013-2014" PROGRAM_DESCRIPTION = "Practical Alignment using SATe and TrAnsitivity" PROGRAM_WEBSITE = "http://phylo.bio.ku.edu/software/sate/sate.html" diff --git a/sate/alignment.py b/sate/alignment.py index 448335c..8a05158 100644 --- a/sate/alignment.py +++ b/sate/alignment.py @@ -484,7 +484,7 @@ def mask_gapy_sites(self,minimum_seq_requirement): r -= 1 if r == 0: break - if r != 0: + if r > 0: masked.append(c) _LOG.debug("%d Columns identified for masking" %len(masked)) diff --git a/sate/mainsate.py b/sate/mainsate.py index c66c772..1a8978d 100644 --- a/sate/mainsate.py +++ b/sate/mainsate.py @@ -273,7 +273,7 @@ def finish_sate_execution(sate_team, aln_job_list = [] query_fns = [] for unaligned_seqs in multilocus_dataset: - backbone = sample(unaligned_seqs.keys(), 10) + backbone = sample(unaligned_seqs.keys(), min(100,len(unaligned_seqs))) backbone_seqs = unaligned_seqs.sub_alignment(backbone) query_seq=set(unaligned_seqs.keys()) - set(backbone)