option to validate hmm hits by sequence score

BIONF · Dec 20, 2023 · 0404231 · 0404231
1 parent 9dc4fe9
commit 0404231
Show file tree

Hide file tree

Showing 6 changed files with 60 additions and 33 deletions.
diff --git a/.github/workflows/github_build.yml b/.github/workflows/github_build.yml
@@ -52,7 +52,7 @@ jobs:
         mkdir seeds
         path=$(fdog.setup -d ./ --getSourcepath); a="1 2 3"; for i in ${a[@]}; do cp $path/data/infile.fa seeds/$i.fa; done
         echo "TEST fdogs.run"
-        fdogs.run --seqFolder seeds --jobName test_multi --refspec HUMAN@9606@3 --fasOff --searchTaxa PARTE@5888@3,THAPS@35128@3
+        fdogs.run --seqFolder seeds --jobName test_multi --refspec HUMAN@9606@3 --fasOff --searchTaxa PARTE@5888@3,THAPS@35128@3 --hmmScoreType sequence
         echo "TEST fdog.addTaxon"
         head /home/runner/work/fDOG/fDOG/dt/searchTaxa_dir/HUMAN@9606@3/HUMAN@9606@3.fa > hm.fa
         fdog.addTaxon -f hm.fa -i 9606 -o ./ -c -a

diff --git a/fdog/libs/hmm.py b/fdog/libs/hmm.py
@@ -35,7 +35,7 @@ def create_hmm(aln_file, out_file):
         sys.exit('ERROR: Error running hmmbuild %s' % hmmbuild_cmd)
 
 
-def sort_hmm_hits(hmm_hits, hitLimit = 10, scoreCutoff = 10, debug = False):
+def sort_hmm_hits(hmm_hits, hmm_score_type = 'domain', hitLimit = 10, scoreCutoff = 10, debug = False):
     """ Sort HMM hits
     Keep only n hits (n =< hitLimit), and hits that are not less than
     best_hit_domain_score * (100 - scoreCutoff) / 100
@@ -45,27 +45,48 @@ def sort_hmm_hits(hmm_hits, hitLimit = 10, scoreCutoff = 10, debug = False):
     cutoff = ''
     score_dict = {}
     ori_hits = {}
-    for hit in hmm_hits:
-        ori_hits[hit.name.decode('ASCII')] = len(hit.domains)
-        best_domain_score = -9999 #hit.domains[0].score
-        best_domain_hit = ''
-        if len(hit.domains) > 0:
-            # get domain with best score for this hit
-            for i in hit.domains:
-                if i.score > best_domain_score:
-                    best_domain_score = i.score
-                    best_domain_hit = i.hit.name.decode('ASCII')
-            # add hit to score_dict with increasing domain score
-            if best_domain_score > best_score:
-                best_score = best_domain_score
-            cutoff = best_score/100*(100-scoreCutoff)
-            if best_score < 0:
-                cutoff = best_score/100*(100+scoreCutoff)
-            if best_domain_score >= cutoff:
-                if best_domain_score not in score_dict:
-                    score_dict[best_domain_score] = [best_domain_hit]
-                else:
-                    score_dict[best_domain_score].append(best_domain_hit)
+    best_hit_score = -9999
+    if hmm_score_type == 'domain':
+        for hit in hmm_hits:
+            ori_hits[hit.name.decode('ASCII')] = len(hit.domains)
+            best_domain_score = -9999 #hit.domains[0].score
+            best_domain_hit = ''
+            if len(hit.domains) > 0:
+                # get domain with best score for this hit
+                for i in hit.domains:
+                    if i.score > best_domain_score:
+                        best_domain_score = i.score
+                        best_domain_hit = i.hit.name.decode('ASCII')
+                # add hit to score_dict with increasing domain score
+                if best_domain_score > best_score:
+                    best_score = best_domain_score
+                cutoff = best_score/100*(100-scoreCutoff)
+                if best_score < 0:
+                    cutoff = best_score/100*(100+scoreCutoff)
+                if best_domain_score >= cutoff:
+                    if best_domain_score not in score_dict:
+                        score_dict[best_domain_score] = [best_domain_hit]
+                    else:
+                        score_dict[best_domain_score].append(best_domain_hit)
+    else:
+        for hit in hmm_hits:
+            ori_hits[hit.name.decode('ASCII')] = len(hit.domains)
+            if hit.score > best_hit_score:
+                # get hit with best score
+                best_hit_score = hit.score
+                best_hit = hit.name.decode('ASCII')
+                # add to score_dict
+                if best_hit_score > best_score:
+                    best_score = best_hit_score
+                cutoff = best_score/100*(100-scoreCutoff)
+                if best_score < 0:
+                    cutoff = best_score/100*(100+scoreCutoff)
+                if best_hit_score >= cutoff:
+                    if best_hit_score not in score_dict:
+                        score_dict[best_hit_score] = [best_hit]
+                    else:
+                        score_dict[best_hit_score].append(best_hit)
+
     output_fn.print_debug(debug, 'All HMM hits', ori_hits)
     hmm_cand = {}
     n = 0
@@ -84,7 +105,7 @@ def sort_hmm_hits(hmm_hits, hitLimit = 10, scoreCutoff = 10, debug = False):
 
 def do_hmmsearch(
         hmm_file, search_fa, evalHmmer = 0.00001, scoreCutoff = 10,
-        hitLimit = 10, cpus = os.cpu_count(), debug = False):
+        hitLimit = 10, hmm_score_type = 'domain', cpus = os.cpu_count(), debug = False):
     """ Perform hmmsearch for a hmm file vs a multiple fasta file
     Return a dictionary of hits and their e-value and bit-score
     Only "top" hits are returned. The cutoff is defined by
@@ -100,7 +121,7 @@ def do_hmmsearch(
             for hits in pyhmmer.hmmsearch(
                     hmm_file, sequences, E = evalHmmer, cpus = cpus):
                 if len(hits) > 0:
-                    hmm_hits = sort_hmm_hits(hits, hitLimit, scoreCutoff, debug)
+                    hmm_hits = sort_hmm_hits(hits, hmm_score_type, hitLimit, scoreCutoff, debug)
         except :
             sys.exit(
                 'ERROR: Error running hmmsearch for %s agains %s'

diff --git a/fdog/libs/orthosearch.py b/fdog/libs/orthosearch.py
@@ -34,7 +34,7 @@
 def hamstr(args):
     (seqName, hmmpath, corepath, searchpath, outpath,
         refspec, seed_id, search_taxon,
-        evalHmmer, hitLimit, scoreCutoff,
+        evalHmmer, hitLimit, hmmScoreType, scoreCutoff,
         evalBlast, lowComplexityFilter,
         checkCoorthologsRefOff, rbh, rep,
         aligner, cpus, debug, silentOff, noCleanup) = args
@@ -72,7 +72,7 @@ def hamstr(args):
 
     ### (1) Do hmmsearch for query hmm against search taxon fasta
     hmm_hits = hmm_fn.do_hmmsearch(
-            hmm_file, search_fa, evalHmmer, scoreCutoff, hitLimit, cpus, debug)
+            hmm_file, search_fa, evalHmmer, scoreCutoff, hitLimit, hmmScoreType, cpus, debug)
     output_fn.print_debug(debug, 'Sorted HMM hits', hmm_hits)
     ### (2) Read fasta file of refspec and search taxon
     refspec_seqs = fasta_fn.read_fasta(refspec_fa)
@@ -202,7 +202,7 @@ def run_hamstr(args):
     (seqName, refspec, pathArgs, orthoArgs, otherArgs) = args
     (outpath, hmmpath, corepath, searchpath, annopath) = pathArgs
     (checkCoorthologsRefOff, rbh, rep, evalBlast, lowComplexityFilter,
-                        evalHmmer, hitLimit, scoreCutoff, aligner) = orthoArgs
+                        evalHmmer, hitLimit, hmmScoreType, scoreCutoff, aligner) = orthoArgs
     (searchTaxa, cpus, debug, silentOff, noCleanup, force, append) = otherArgs
 
     hamstr_jobs = []
@@ -225,7 +225,7 @@ def run_hamstr(args):
                 hamstr_jobs.append([
                     seqName, hmmpath, corepath, searchpath, outpath,
                     refspec, seed_id, search_taxon,
-                    evalHmmer, hitLimit, scoreCutoff,
+                    evalHmmer, hitLimit, hmmScoreType, scoreCutoff,
                     evalBlast, lowComplexityFilter,
                     checkCoorthologsRefOff, rbh, rep,
                     aligner, cpus, debug, silentOff, noCleanup
@@ -245,7 +245,7 @@ def run_hamstr(args):
                 hamstr_jobs.append([
                     seqName, hmmpath, corepath, searchpath, outpath,
                     refspec, seed_id, search_taxon,
-                    evalHmmer, hitLimit, scoreCutoff,
+                    evalHmmer, hitLimit, hmmScoreType, scoreCutoff,
                     evalBlast, lowComplexityFilter,
                     checkCoorthologsRefOff, rbh, rep,
                     aligner, cpus, debug, silentOff, noCleanup

diff --git a/fdog/runMulti.py b/fdog/runMulti.py
@@ -221,6 +221,8 @@ def main():
                                 action='store', default=0.0001, type=float)
     ortho_options.add_argument('--hitLimit', help='number of hits of the initial pHMM based search that should be evaluated via a reverse search. Default: 10',
                                 action='store', default=10, type=int)
+    ortho_options.add_argument('--hmmScoreType', help='Choose type of hmm score (best domain or full sequence score) for validating HMM candidates (NOTE: applied also for the core compilation). Default: domain',
+                                action='store', choices=['domain','sequence'], default='domain')
     ortho_options.add_argument('--scoreCutoff', help='Define the percent range of the hmms core of the best hit up to which a candidate of the hmmsearch will be subjected for further evaluation. Default: 10',
                                 action='store', default=10, type=int)
 
@@ -295,6 +297,7 @@ def main():
     evalBlast = args.evalBlast
     evalHmmer = args.evalHmmer
     hitLimit = args.hitLimit
+    hmmScoreType = args.hmmScoreType
     scoreCutoff = args.scoreCutoff
 
     # fas arguments
@@ -417,7 +420,7 @@ def main():
 
             ### do ortholog search
             orthoArgs = [checkCoorthologsRefOff, rbh, rep, evalBlast,
-                        lowComplexityFilter, evalHmmer, hitLimit, scoreCutoff, aligner]
+                        lowComplexityFilter, evalHmmer, hitLimit, hmmScoreType, scoreCutoff, aligner]
             otherArgs = [searchTaxa, cpus, debug, silentOff, noCleanup, force, append]
             ortho_options = [orthoArgs, otherArgs, pathArgs, refspec]
             ortho_runtime = search_ortholog(ortho_options, seeds, inFol, outpath)

diff --git a/fdog/runSingle.py b/fdog/runSingle.py
@@ -93,6 +93,8 @@ def main():
                                 action='store', default=0.0001, type=float)
     ortho_options.add_argument('--hitLimit', help='number of hits of the initial pHMM based search that should be evaluated via a reverse search. Default: 10',
                                 action='store', default=10, type=int)
+    ortho_options.add_argument('--hmmScoreType', help='Choose type of hmm score (best domain or full sequence score) for validating HMM candidates (NOTE: applied also for the core compilation). Default: domain',
+                                action='store', choices=['domain','sequence'], default='domain')
     ortho_options.add_argument('--scoreCutoff', help='Define the percent range of the hmms core of the best hit up to which a candidate of the hmmsearch will be subjected for further evaluation. Default: 10',
                                 action='store', default=10, type=int)
 
@@ -166,6 +168,7 @@ def main():
     evalBlast = args.evalBlast
     evalHmmer = args.evalHmmer
     hitLimit = args.hitLimit
+    hmmScoreType = args.hmmScoreType
     scoreCutoff = args.scoreCutoff
 
     # fas arguments
@@ -275,7 +278,7 @@ def main():
                 searchTaxa = ','.join(searchTaxa)
         # do ortholog search
         orthoArgs = [checkCoorthologsRefOff, rbh, rep, evalBlast,
-                    lowComplexityFilter, evalHmmer, hitLimit, scoreCutoff, aligner]
+                    lowComplexityFilter, evalHmmer, hitLimit, hmmScoreType, scoreCutoff, aligner]
         otherArgs = [searchTaxa, cpus, debug, silentOff, noCleanup, force, append]
         hamstr_out = ortho_fn.run_hamstr([seqName, refspec, pathArgs, orthoArgs, otherArgs])
         output_fn.write_hamstr(hamstr_out, outpath, seqName, force, append)

diff --git a/setup.py b/setup.py
@@ -26,7 +26,7 @@
 
 setup(
     name="fdog",
-    version="0.1.25",
+    version="0.1.26",
     python_requires='>=3.7.0',
     description="Feature-aware Directed OrtholoG search tool",
     long_description=long_description,