Skip to content

Commit

Permalink
Reduce profile size, capture diamond error
Browse files Browse the repository at this point in the history
  • Loading branch information
davidemms committed Apr 17, 2023
1 parent e449f1f commit 365397e
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 23 deletions.
2 changes: 1 addition & 1 deletion scripts_of/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1007,7 +1007,7 @@ def NewSpeciesCladesWorkflow(speciesInfoObj, seqsInfo, options, prog_caller, spe
astral_fn = files.FileHandler.GetAstralFilename()
astral.create_input_file(files.FileHandler.GetOGsTreeDir(), astral_fn)
species_tree_unrooted_fn = files.FileHandler.GetSpeciesTreeUnrootedFN()
parallel_task_manager.RunCommand(astral.get_astral_command(astral_fn, species_tree_unrooted_fn))
parallel_task_manager.RunCommand(astral.get_astral_command(astral_fn, species_tree_unrooted_fn, options.nBlast))

# Root it
core_rooted_species_tree = tree.Tree(files.FileHandler.GetCoreSpeciesTreeIDsRootedFN(), format=1)
Expand Down
31 changes: 15 additions & 16 deletions scripts_of/accelerate.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@

class XcelerateConfig(object):
def __init__(self):
# use kmeans clustering to select 1/kmeans_divide genes per orthogroup (to a minimum of 20). Otherwise, use all.
self.kmeans_divide: Optional[int] = 10 # 10 is a good value if using this option
self.n_for_profiles: Optional[int] = 10 # 10 is a good value if using this option


xcelerate_config = XcelerateConfig()
Expand All @@ -31,23 +30,26 @@ def check_for_orthoxcelerate(input_dir, speciesInfoObj):


def prepare_accelerate_database(input_dir, wd_list, nSpAll):
if xcelerate_config.kmeans_divide is None:
if xcelerate_config.n_for_profiles is None:
# create_shoot_db.create_full_database(input_dir, q_ids=True, subtrees_dir="")
fn_diamond_db = create_profiles_database(input_dir, wd_list, nSpAll, selection="all", q_ids=True, subtrees_dir="")
else:
fn_diamond_db = create_profiles_database(input_dir, wd_list, nSpAll, selection="kmeans", q_ids=True, divide=xcelerate_config.kmeans_divide, subtrees_dir="")
fn_diamond_db = create_profiles_database(input_dir, wd_list, nSpAll, selection="kmeans", q_ids=True, n_for_profile=xcelerate_config.n_for_profiles, subtrees_dir="")
return fn_diamond_db


def RunSearch(options, speciessInfoObj, fn_diamond_db, prog_caller, q_one_query=True):
name_to_print = "BLAST" if options.search_program == "blast" else options.search_program
util.PrintUnderline("Running %s profiles search" % name_to_print)
commands, results_files = GetOrderedSearchCommands(speciessInfoObj, fn_diamond_db, prog_caller, q_one_query=q_one_query, threads=options.nBlast)
if options.qStopAfterPrepare:
for command in commands:
print(command)
util.Success()
print("Using %d thread(s)" % options.nBlast)
if q_one_query:
return_code = parallel_task_manager.RunCommand(commands[0], qPrintOnError=True)
if return_code != 0:
print("ERROR: DIAMOND search failed, see messages above")
util.Fail()
util.PrintTime("Done profiles search\n")
return results_files
cmd_queue = mp.Queue()
for iCmd, cmd in enumerate(commands):
cmd_queue.put((iCmd+1, cmd))
Expand All @@ -58,7 +60,7 @@ def RunSearch(options, speciessInfoObj, fn_diamond_db, prog_caller, q_one_query=
while proc.is_alive():
proc.join()
print("")
util.PrintTime("Done profiles search\n")
util.PrintTime(" Done profiles search\n")
return results_files


Expand Down Expand Up @@ -191,15 +193,14 @@ def write_all_orthogroups(ogs, ogs_new_species, ogs_clade_specific):
return clustersFilename_pairs


def create_profiles_database(din, wd_list, nSpAll, selection="kmeans", min_for_profile=20, q_ids=True, divide=10,
def create_profiles_database(din, wd_list, nSpAll, selection="kmeans", n_for_profile=20, q_ids=True,
subtrees_dir="", q_hogs=True):
"""
Create a fasta file with profile genes from each orthogroup
Args:
din - Input OrthoFinder results directory
selection - "kmeans|random|all" - method to use for sampling genes from orthogroups
min_for_profile - The min number of genes to use from each orthogroup, when available
divide - Select `|genes| / divide` genes per orthogroup, as long as the number is greater than min_for_profile
n_for_profile - The number of genes to use from each orthogroup, when available
q_ids - Convert subtrees (with user gene accessions) back to IDs for profiles database
Notes:
If the trees have been split into subtrees then profiles will be created for
Expand All @@ -217,7 +218,7 @@ def create_profiles_database(din, wd_list, nSpAll, selection="kmeans", min_for_p
if selection == "all":
fn_fasta = wd + "profile_sequences%s.all.fa" % subtrees_label
else:
fn_fasta = wd + "profile_sequences%s.d%d_min%d_%s.fa" % (subtrees_label, divide, min_for_profile, selection)
fn_fasta = wd + "profile_sequences%s.%d_%s.fa" % (subtrees_label, n_for_profile, selection)
fn_diamond_db = fn_fasta + ".dmnd"
if os.path.exists(fn_diamond_db):
print("Profiles database already exists and will be reused: %s" % fn_diamond_db)
Expand Down Expand Up @@ -267,9 +268,7 @@ def create_profiles_database(din, wd_list, nSpAll, selection="kmeans", min_for_p
i_part = os.path.basename(fn).rsplit(".", 2)[1] if q_subtrees else None
fw_temp = fasta_writer.FastaWriter(fn)
n_in_og = len(fw_temp.SeqLists)
n_for_profile = n_in_og // divide
if n_for_profile < min_for_profile:
n_for_profile = min_for_profile
n_for_profile = min(n_in_og, n_for_profile)
if selection == "kmeans" and len(fw_temp.SeqLists) > n_for_profile:
if q_subtrees:
# MSA needs to be modified
Expand Down
4 changes: 2 additions & 2 deletions scripts_of/astral.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ def create_input_file(d, output_fn, n_skip=50):
outfile.write(t.write(format=9) + "\n")


def get_astral_command(astral_input, species_tree):
return " ".join(["astral-pro", "-i", astral_input, "-o", species_tree])
def get_astral_command(astral_input, species_tree, threads):
return " ".join(["astral-pro", "-i", astral_input, "-o", species_tree, "-t", str(threads)])


if __name__ == "__main__":
Expand Down
11 changes: 7 additions & 4 deletions scripts_of/sample_genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,10 +230,13 @@ def select_from_aligned(infn, n_sample, q_trim=True):
M = embed(m)
# print("embedding successful")
# Cluster
if use_n_auto:
kmeans = cluster.KMeans(n_clusters=n_sample, random_state=0, n_init='auto').fit(M)
else:
kmeans = cluster.KMeans(n_clusters=n_sample, random_state=0).fit(M)
# Kmeans will warn if it finds fewer clusters than requested, ignore these warnings
with warnings.catch_warnings():
warnings.simplefilter("ignore")
if use_n_auto:
kmeans = cluster.KMeans(n_clusters=n_sample, random_state=0, n_init='auto').fit(M)
else:
kmeans = cluster.KMeans(n_clusters=n_sample, random_state=0).fit(M)
# print("clustering successful")
labels = kmeans.predict(M)
n_keep = len(ikeep)
Expand Down

0 comments on commit 365397e

Please sign in to comment.