Skip to content

Commit

Permalink
option to validate hmm hits by sequence score
Browse files Browse the repository at this point in the history
  • Loading branch information
trvinh committed Dec 20, 2023
1 parent 9dc4fe9 commit 0404231
Show file tree
Hide file tree
Showing 6 changed files with 60 additions and 33 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/github_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ jobs:
mkdir seeds
path=$(fdog.setup -d ./ --getSourcepath); a="1 2 3"; for i in ${a[@]}; do cp $path/data/infile.fa seeds/$i.fa; done
echo "TEST fdogs.run"
fdogs.run --seqFolder seeds --jobName test_multi --refspec HUMAN@9606@3 --fasOff --searchTaxa PARTE@5888@3,THAPS@35128@3
fdogs.run --seqFolder seeds --jobName test_multi --refspec HUMAN@9606@3 --fasOff --searchTaxa PARTE@5888@3,THAPS@35128@3 --hmmScoreType sequence
echo "TEST fdog.addTaxon"
head /home/runner/work/fDOG/fDOG/dt/searchTaxa_dir/HUMAN@9606@3/HUMAN@9606@3.fa > hm.fa
fdog.addTaxon -f hm.fa -i 9606 -o ./ -c -a
Expand Down
69 changes: 45 additions & 24 deletions fdog/libs/hmm.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def create_hmm(aln_file, out_file):
sys.exit('ERROR: Error running hmmbuild %s' % hmmbuild_cmd)


def sort_hmm_hits(hmm_hits, hitLimit = 10, scoreCutoff = 10, debug = False):
def sort_hmm_hits(hmm_hits, hmm_score_type = 'domain', hitLimit = 10, scoreCutoff = 10, debug = False):
""" Sort HMM hits
Keep only n hits (n =< hitLimit), and hits that are not less than
best_hit_domain_score * (100 - scoreCutoff) / 100
Expand All @@ -45,27 +45,48 @@ def sort_hmm_hits(hmm_hits, hitLimit = 10, scoreCutoff = 10, debug = False):
cutoff = ''
score_dict = {}
ori_hits = {}
for hit in hmm_hits:
ori_hits[hit.name.decode('ASCII')] = len(hit.domains)
best_domain_score = -9999 #hit.domains[0].score
best_domain_hit = ''
if len(hit.domains) > 0:
# get domain with best score for this hit
for i in hit.domains:
if i.score > best_domain_score:
best_domain_score = i.score
best_domain_hit = i.hit.name.decode('ASCII')
# add hit to score_dict with increasing domain score
if best_domain_score > best_score:
best_score = best_domain_score
cutoff = best_score/100*(100-scoreCutoff)
if best_score < 0:
cutoff = best_score/100*(100+scoreCutoff)
if best_domain_score >= cutoff:
if best_domain_score not in score_dict:
score_dict[best_domain_score] = [best_domain_hit]
else:
score_dict[best_domain_score].append(best_domain_hit)
best_hit_score = -9999
if hmm_score_type == 'domain':
for hit in hmm_hits:
ori_hits[hit.name.decode('ASCII')] = len(hit.domains)
best_domain_score = -9999 #hit.domains[0].score
best_domain_hit = ''
if len(hit.domains) > 0:
# get domain with best score for this hit
for i in hit.domains:
if i.score > best_domain_score:
best_domain_score = i.score
best_domain_hit = i.hit.name.decode('ASCII')
# add hit to score_dict with increasing domain score
if best_domain_score > best_score:
best_score = best_domain_score
cutoff = best_score/100*(100-scoreCutoff)
if best_score < 0:
cutoff = best_score/100*(100+scoreCutoff)
if best_domain_score >= cutoff:
if best_domain_score not in score_dict:
score_dict[best_domain_score] = [best_domain_hit]
else:
score_dict[best_domain_score].append(best_domain_hit)
else:
for hit in hmm_hits:
ori_hits[hit.name.decode('ASCII')] = len(hit.domains)
if hit.score > best_hit_score:
# get hit with best score
best_hit_score = hit.score
best_hit = hit.name.decode('ASCII')
# add to score_dict
if best_hit_score > best_score:
best_score = best_hit_score
cutoff = best_score/100*(100-scoreCutoff)
if best_score < 0:
cutoff = best_score/100*(100+scoreCutoff)
if best_hit_score >= cutoff:
if best_hit_score not in score_dict:
score_dict[best_hit_score] = [best_hit]
else:
score_dict[best_hit_score].append(best_hit)

output_fn.print_debug(debug, 'All HMM hits', ori_hits)
hmm_cand = {}
n = 0
Expand All @@ -84,7 +105,7 @@ def sort_hmm_hits(hmm_hits, hitLimit = 10, scoreCutoff = 10, debug = False):

def do_hmmsearch(
hmm_file, search_fa, evalHmmer = 0.00001, scoreCutoff = 10,
hitLimit = 10, cpus = os.cpu_count(), debug = False):
hitLimit = 10, hmm_score_type = 'domain', cpus = os.cpu_count(), debug = False):
""" Perform hmmsearch for a hmm file vs a multiple fasta file
Return a dictionary of hits and their e-value and bit-score
Only "top" hits are returned. The cutoff is defined by
Expand All @@ -100,7 +121,7 @@ def do_hmmsearch(
for hits in pyhmmer.hmmsearch(
hmm_file, sequences, E = evalHmmer, cpus = cpus):
if len(hits) > 0:
hmm_hits = sort_hmm_hits(hits, hitLimit, scoreCutoff, debug)
hmm_hits = sort_hmm_hits(hits, hmm_score_type, hitLimit, scoreCutoff, debug)
except :
sys.exit(
'ERROR: Error running hmmsearch for %s agains %s'
Expand Down
10 changes: 5 additions & 5 deletions fdog/libs/orthosearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
def hamstr(args):
(seqName, hmmpath, corepath, searchpath, outpath,
refspec, seed_id, search_taxon,
evalHmmer, hitLimit, scoreCutoff,
evalHmmer, hitLimit, hmmScoreType, scoreCutoff,
evalBlast, lowComplexityFilter,
checkCoorthologsRefOff, rbh, rep,
aligner, cpus, debug, silentOff, noCleanup) = args
Expand Down Expand Up @@ -72,7 +72,7 @@ def hamstr(args):

### (1) Do hmmsearch for query hmm against search taxon fasta
hmm_hits = hmm_fn.do_hmmsearch(
hmm_file, search_fa, evalHmmer, scoreCutoff, hitLimit, cpus, debug)
hmm_file, search_fa, evalHmmer, scoreCutoff, hitLimit, hmmScoreType, cpus, debug)
output_fn.print_debug(debug, 'Sorted HMM hits', hmm_hits)
### (2) Read fasta file of refspec and search taxon
refspec_seqs = fasta_fn.read_fasta(refspec_fa)
Expand Down Expand Up @@ -202,7 +202,7 @@ def run_hamstr(args):
(seqName, refspec, pathArgs, orthoArgs, otherArgs) = args
(outpath, hmmpath, corepath, searchpath, annopath) = pathArgs
(checkCoorthologsRefOff, rbh, rep, evalBlast, lowComplexityFilter,
evalHmmer, hitLimit, scoreCutoff, aligner) = orthoArgs
evalHmmer, hitLimit, hmmScoreType, scoreCutoff, aligner) = orthoArgs
(searchTaxa, cpus, debug, silentOff, noCleanup, force, append) = otherArgs

hamstr_jobs = []
Expand All @@ -225,7 +225,7 @@ def run_hamstr(args):
hamstr_jobs.append([
seqName, hmmpath, corepath, searchpath, outpath,
refspec, seed_id, search_taxon,
evalHmmer, hitLimit, scoreCutoff,
evalHmmer, hitLimit, hmmScoreType, scoreCutoff,
evalBlast, lowComplexityFilter,
checkCoorthologsRefOff, rbh, rep,
aligner, cpus, debug, silentOff, noCleanup
Expand All @@ -245,7 +245,7 @@ def run_hamstr(args):
hamstr_jobs.append([
seqName, hmmpath, corepath, searchpath, outpath,
refspec, seed_id, search_taxon,
evalHmmer, hitLimit, scoreCutoff,
evalHmmer, hitLimit, hmmScoreType, scoreCutoff,
evalBlast, lowComplexityFilter,
checkCoorthologsRefOff, rbh, rep,
aligner, cpus, debug, silentOff, noCleanup
Expand Down
5 changes: 4 additions & 1 deletion fdog/runMulti.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,8 @@ def main():
action='store', default=0.0001, type=float)
ortho_options.add_argument('--hitLimit', help='number of hits of the initial pHMM based search that should be evaluated via a reverse search. Default: 10',
action='store', default=10, type=int)
ortho_options.add_argument('--hmmScoreType', help='Choose type of hmm score (best domain or full sequence score) for validating HMM candidates (NOTE: applied also for the core compilation). Default: domain',
action='store', choices=['domain','sequence'], default='domain')
ortho_options.add_argument('--scoreCutoff', help='Define the percent range of the hmms core of the best hit up to which a candidate of the hmmsearch will be subjected for further evaluation. Default: 10',
action='store', default=10, type=int)

Expand Down Expand Up @@ -295,6 +297,7 @@ def main():
evalBlast = args.evalBlast
evalHmmer = args.evalHmmer
hitLimit = args.hitLimit
hmmScoreType = args.hmmScoreType
scoreCutoff = args.scoreCutoff

# fas arguments
Expand Down Expand Up @@ -417,7 +420,7 @@ def main():

### do ortholog search
orthoArgs = [checkCoorthologsRefOff, rbh, rep, evalBlast,
lowComplexityFilter, evalHmmer, hitLimit, scoreCutoff, aligner]
lowComplexityFilter, evalHmmer, hitLimit, hmmScoreType, scoreCutoff, aligner]
otherArgs = [searchTaxa, cpus, debug, silentOff, noCleanup, force, append]
ortho_options = [orthoArgs, otherArgs, pathArgs, refspec]
ortho_runtime = search_ortholog(ortho_options, seeds, inFol, outpath)
Expand Down
5 changes: 4 additions & 1 deletion fdog/runSingle.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ def main():
action='store', default=0.0001, type=float)
ortho_options.add_argument('--hitLimit', help='number of hits of the initial pHMM based search that should be evaluated via a reverse search. Default: 10',
action='store', default=10, type=int)
ortho_options.add_argument('--hmmScoreType', help='Choose type of hmm score (best domain or full sequence score) for validating HMM candidates (NOTE: applied also for the core compilation). Default: domain',
action='store', choices=['domain','sequence'], default='domain')
ortho_options.add_argument('--scoreCutoff', help='Define the percent range of the hmms core of the best hit up to which a candidate of the hmmsearch will be subjected for further evaluation. Default: 10',
action='store', default=10, type=int)

Expand Down Expand Up @@ -166,6 +168,7 @@ def main():
evalBlast = args.evalBlast
evalHmmer = args.evalHmmer
hitLimit = args.hitLimit
hmmScoreType = args.hmmScoreType
scoreCutoff = args.scoreCutoff

# fas arguments
Expand Down Expand Up @@ -275,7 +278,7 @@ def main():
searchTaxa = ','.join(searchTaxa)
# do ortholog search
orthoArgs = [checkCoorthologsRefOff, rbh, rep, evalBlast,
lowComplexityFilter, evalHmmer, hitLimit, scoreCutoff, aligner]
lowComplexityFilter, evalHmmer, hitLimit, hmmScoreType, scoreCutoff, aligner]
otherArgs = [searchTaxa, cpus, debug, silentOff, noCleanup, force, append]
hamstr_out = ortho_fn.run_hamstr([seqName, refspec, pathArgs, orthoArgs, otherArgs])
output_fn.write_hamstr(hamstr_out, outpath, seqName, force, append)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

setup(
name="fdog",
version="0.1.25",
version="0.1.26",
python_requires='>=3.7.0',
description="Feature-aware Directed OrtholoG search tool",
long_description=long_description,
Expand Down

0 comments on commit 0404231

Please sign in to comment.