From 2d48a6ee275f51f6ff221cfd6e17b594ebe06fc9 Mon Sep 17 00:00:00 2001
From: Siavash Mirarab <toranj>
Date: Tue, 4 Mar 2014 17:05:04 -0600
Subject: [PATCH] version 1.4.0

---
 README.md                    | 18 +++++------
 resources/scripts/hmmeralign | 60 +++++++++++++++++++++---------------
 sate/__init__.py             |  2 +-
 sate/alignment.py            |  2 +-
 sate/mainsate.py             |  2 +-
 5 files changed, 45 insertions(+), 39 deletions(-)
diff --git a/README.md b/README.md
index 00904a7..01bff77 100644
--- a/README.md
+++ b/README.md
@@ -6,8 +6,7 @@ All questions and inquires should be addressed to our user email group: pasta-us
 
 Acknowledgment 
 ===
-The current version of this code is heavily based on the SATe code (http://phylo.bio.ku.edu/software/sate/sate.html). Refer to sate-doc
-directory for documentation of the SATe code. 
+The current version of this code is heavily based on the SATe code (http://phylo.bio.ku.edu/software/sate/sate.html). Refer to sate-doc directory for documentation of the SATe code. 
 
 INSTALLATION
 ===
@@ -35,7 +34,7 @@ Insallation steps:
 2. Clone the PASTA repository from our [github repository](https://github.com/smirarab/pasta). For example you can use `git clone https://github.com/smirarab/pasta.git`.
 If you don't have git, you can directly download a [zip file from the repository](https://github.com/smirarab/pasta/archive/master.zip) and decompress it into your desired directory. 
 
-3.  Clone the relevant "tools" directory from the SATe project. Note that there are different repositories for [linux](https://github.com/sate-dev/sate-tools-linux) and [MAC](https://github.com/sate-dev/sate-tools-mac). e.g., you can use `git clone git@github.com:sate-dev/sate-tools-linux.git` on Linux or `git@github.com:sate-dev/sate-tools-mac.git` on MAC. Or you can directly download these as zip files for [Linux](https://github.com/sate-dev/sate-tools-linux/archive/master.zip) or [MAC](https://github.com/sate-dev/sate-tools-mac/archive/master.zip) and decompress them in your target directory for PASTA code.
+3.  Clone the relevant "tools" directory (these are also forked from the SATe project). Note that there are different repositories for [linux](https://github.com/smirarab/sate-tools-linux) and [MAC](https://github.com/smirarab/sate-tools-mac). e.g., you can use `git clone git@github.com:smirarab/sate-tools-linux.git` on Linux or `git@github.com:smirarab/sate-tools-mac.git` on MAC. Or you can directly download these as zip files for [Linux](https://github.com/smirarab/sate-tools-linux/archive/master.zip) or [MAC](https://github.com/smirarab/sate-tools-mac/archive/master.zip) and decompress them in your target directory for PASTA code.
 
 4. `cd pasta`
 
@@ -73,19 +72,16 @@ python run_pasta_gui.py
 
 Starting Trees
 -------
-NOTE that current version of the PASTA code does NOT compute the starting tree through a
-process similar to what is described in the paper. Instead, it simply uses a FastTree on
-the input, if input is aligned, or else runs MAFFT on the input to align it, and then runs FastTree.
-
-The preferred approach for getting the PASTA starting tree is very simple, and is described below:
+Since version 1.4.0, PASTA uses the procedure described in the paper for estimating the starting alignment and trees
+if none is given. 
 
+The PASTA approach for getting the starting tree can be summarized as:
 1. Choose a random subset of your sequences (size 100).
-2. Get a SATe alignment on this subset (you need to install SATe for this; alternatively just run PASTA on it).
+2. Get a MAFFT-linsi alignment on the subset.
 3. Build a HMMER model on the alignment of 100 subsets.
-4. Use HMMAlign to align the remaining sequences into the small subset. 
+4. Use hmmalign to align the remaining sequences into the small subset. 
 5. Run FastTree on the output of step 4.
 
-We do have a separate program that computes this simple starting tree. See https://github.com/smirarab/sepp and use UPP. Make sure you set `-A 100 -P 100` to get the starting tree described in the PASTA paper. 
 
 LICENSE
 ===
diff --git a/resources/scripts/hmmeralign b/resources/scripts/hmmeralign
index a71a230..00f9115 100755
--- a/resources/scripts/hmmeralign
+++ b/resources/scripts/hmmeralign
@@ -5,6 +5,7 @@ import subprocess
 import re
 import os
 import inspect
+import shutil
 
 def read_stderr(stderrdata,stderr):    
         '''
@@ -19,7 +20,7 @@ def read_stderr(stderrdata,stderr):
         
 def log(job, name, stdoutdata, stderrdata, stderrfile):
     if job.returncode == 0:
-        print "Finished %s Job with return code: %s\n output: %s [continued]" %(name,job.returncode,stdoutdata[0:50])
+        print "Finished %s Job with return code: %s\n output: %s [continued]" %(name,job.returncode,stdoutdata[0:100])
     
     else:         
         print ("Finished %s Job with return code: %s\n output: %s\n error:%s" 
@@ -88,30 +89,39 @@ if __name__ == "__main__":
     
     loc = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
     
-    model_file = "%s.hmm" % backbone_fn
-    invoc = [os.path.join(loc,'hmmbuild'), "--symfrac" ,"0.0" ,"--%s" % dt,
-             '--informat', 'afa', model_file , backbone_fn]       
-    buildjob = Popen(invoc, stderr = subprocess.PIPE, stdout = subprocess.PIPE)
-    (stdoutdata, stderrdata) = buildjob.communicate()
-    log(buildjob,"hmmbuild", stdoutdata, stderrdata, "")
-    
-    query_out_fn = "%s.aligned" %query_fn
-    invoc = [os.path.join(loc,'hmmalign'),"--%s" % dt, 
-             #"--cpu", cpus, 
-             "--mapali", backbone_fn,
-             model_file, query_fn]       
-    alignjob = Popen(invoc, stderr = subprocess.PIPE, stdout = subprocess.PIPE)
-    (stdoutdata, stderrdata) = alignjob.communicate()
-    log(buildjob,"hmmalign", stdoutdata, stderrdata, "")
-    
-    alg = dict()
-    insertion = read_sto(stdoutdata, alg)
-    out=open("%s" % out_fn,'w')
-    for k,v in alg.iteritems():
-        out.write(">%s\n%s\n" %(k,v))
-    out.close()
-    
-    os.remove(model_file)
+   
+    if os.stat(query_fn)[6] == 0:
+        shutil.copy(backbone_fn, out_fn)
+    else:   
+        model_file = "%s.hmm" % backbone_fn
+        invoc = [os.path.join(loc,'hmmbuild'), "--symfrac" ,"0.0" ,"--%s" % dt,
+                 '--informat', 'afa', model_file , backbone_fn]       
+        print " ".join(invoc)
+        buildjob = Popen(invoc, stderr = subprocess.PIPE, stdout = subprocess.PIPE)
+        (stdoutdata, stderrdata) = buildjob.communicate()
+        log(buildjob,"hmmbuild", stdoutdata, stderrdata, "")
+        
+        query_out_fn = "%s.aligned" %query_fn
+        invoc = [os.path.join(loc,'hmmalign'),"--%s" % dt, 
+                 #"--cpu", cpus, 
+                 "--mapali", backbone_fn,
+                 model_file, query_fn] 
+        print " ".join(invoc)      
+        alignjob = Popen(invoc, stderr = subprocess.PIPE, stdout = subprocess.PIPE)
+        (stdoutdata, stderrdata) = alignjob.communicate()
+        log(buildjob,"hmmalign", stdoutdata, stderrdata, "")
+        
+        alg = dict()
+        insertion = read_sto(stdoutdata, alg)
+        out=open("%s" % out_fn,'w')
+        for k,v in alg.iteritems():
+            out.write(">%s\n%s\n" %(k,v))
+        out.close()
+        
+        if os.stat(out_fn)[6] == 0:
+            raise Exception("Unknown issue in generating initial alignment. Output alignment is empty. ")
+        
+        os.remove(model_file)
 #     remove_insertion_columns(alg, insertion)
 #     out=open("%s.masked" % out_fn,'w')
 #     for k,v in alg.iteritems():
diff --git a/sate/__init__.py b/sate/__init__.py
index 25b0120..33c3893 100644
--- a/sate/__init__.py
+++ b/sate/__init__.py
@@ -12,7 +12,7 @@
 PROGRAM_NAME = "PASTA"
 PROGRAM_AUTHOR = ["Siavash Mirarab","Jiaye Yu", "Mark T. Holder", "Jeet Sukumaran", "Jamie Oaks"]
 PROGRAM_LICENSE = "GNU General Public License, version 3"
-PROGRAM_VERSION = "1.3.0"
+PROGRAM_VERSION = "1.4.0"
 PROGRAM_YEAR = "2013-2014"
 PROGRAM_DESCRIPTION = "Practical Alignment using SATe and TrAnsitivity"
 PROGRAM_WEBSITE = "http://phylo.bio.ku.edu/software/sate/sate.html"
diff --git a/sate/alignment.py b/sate/alignment.py
index 448335c..8a05158 100644
--- a/sate/alignment.py
+++ b/sate/alignment.py
@@ -484,7 +484,7 @@ def mask_gapy_sites(self,minimum_seq_requirement):
                     r -= 1
                 if r == 0:
                     break
-            if r != 0:
+            if r > 0:
                 masked.append(c)
                  
         _LOG.debug("%d Columns identified for masking" %len(masked))
diff --git a/sate/mainsate.py b/sate/mainsate.py
index c66c772..1a8978d 100644
--- a/sate/mainsate.py
+++ b/sate/mainsate.py
@@ -273,7 +273,7 @@ def finish_sate_execution(sate_team,
                 aln_job_list = []
                 query_fns = []
                 for unaligned_seqs in multilocus_dataset:
-                    backbone = sample(unaligned_seqs.keys(), 10)   
+                    backbone = sample(unaligned_seqs.keys(), min(100,len(unaligned_seqs)))   
                     backbone_seqs = unaligned_seqs.sub_alignment(backbone)
                     
                     query_seq=set(unaligned_seqs.keys()) - set(backbone)