Skip to content

Commit

Permalink
Merge pull request #3 from mtisza1/update2
Browse files Browse the repository at this point in the history
Update2: to v3.2.1
  • Loading branch information
mtisza1 authored Feb 9, 2024
2 parents 9b48a74 + 51f9173 commit 74e2d77
Show file tree
Hide file tree
Showing 12 changed files with 359 additions and 273 deletions.
1 change: 1 addition & 0 deletions environment/ct3_env.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ dependencies:
- sed
- grep
- samtools
- pyarrow
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "cenotetaker3"
version = "3.2.0"
version = "3.2.1"
authors = [
{ name="Mike Tisza", email="michael.tisza@gmail.com" },
]
Expand Down
378 changes: 204 additions & 174 deletions src/cenote/cenote_main.sh

Large diffs are not rendered by default.

39 changes: 24 additions & 15 deletions src/cenote/cenotetaker3.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,12 @@ def cenotetaker3():

parentpath = Path(pathname).parents[1]

__version__ = "3.2.0"
__version__ = "3.2.1"

Def_CPUs = os.cpu_count()

#def runner():
Def_workdir = os.getcwd()

parser = argparse.ArgumentParser(description='Cenote-Taker 3 is a pipeline for virus discovery \
and thorough annotation of viral contigs and genomes. Visit \
https://github.com/mtisza1/Cenote-Taker3 for help. \
Expand Down Expand Up @@ -116,6 +117,9 @@ def cenotetaker3():
optional_args.add_argument("-am", "--annotation_mode", dest="ANNOTATION_MODE", type=str2bool, default="False",
help='Default: False -- Annotate sequences only (skip discovery). Only use if you believe \
each provided sequence is viral')
optional_args.add_argument("-wd", "--working_directory", dest="c_workdir", type=str, default=Def_workdir,
help=f"Default: {Def_workdir} -- Set working directory with absolute or relative path. \
run directory will be created within.")
optional_args.add_argument("--template_file", dest="template_file", type=str,
default=str(cenote_script_path) + '/dummy_template.sbt',
help='Template file with some metadata. Real one required for GenBank submission. Takes a \
Expand Down Expand Up @@ -221,7 +225,8 @@ def cenotetaker3():
optional_args.add_argument("--wrap", dest="WRAP", type=str2bool, default="True",
help='Default: True -- Wrap/rotate DTR/circular contigs so the start codon of an ORF is \
the first nucleotide in the contig/genome')

optional_args.add_argument("--genbank", dest="GENBANK", type=str2bool, default="True",
help='Default: True -- Make GenBank files (.gbf, .sqn, .fsa, .tbl, .cmt, etc)?')

args = parser.parse_args()

Expand All @@ -236,12 +241,16 @@ def cenotetaker3():
HALL_TYPE = ' '.join(map(str,args.HALL_LIST))

## make out directory (rename any existing directory)
if not os.path.isdir(str(args.run_title)):
os.makedirs(str(args.run_title))

out_directory = os.path.join(str(args.c_workdir), str(args.run_title))

if not os.path.isdir(out_directory):
os.makedirs(out_directory)
else:
randID = ''.join(random.choices(string.ascii_uppercase + string.digits, k=5))
os.rename(str(args.run_title), f"{str(args.run_title)}_old_{randID}")
os.makedirs(str(args.run_title))
movdir = f"{out_directory}_old_{randID}"
os.rename(out_directory, movdir)
os.makedirs(out_directory)

#### define logger #####
logger = logging.getLogger("cenote_logger")
Expand All @@ -250,7 +259,7 @@ def cenotetaker3():
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.DEBUG)

file_handler = logging.FileHandler(os.path.join(str(args.run_title), f"{str(args.run_title)}_cenotetaker.log"))
file_handler = logging.FileHandler(os.path.join(out_directory, f"{str(args.run_title)}_cenotetaker.log"))
file_handler.setLevel(logging.DEBUG)

logger.addHandler(file_handler)
Expand Down Expand Up @@ -283,7 +292,7 @@ def check_ct3_dbs():
sys.exit()
## checking hmm dir
if not os.path.isdir(os.path.join(str(args.C_DBS), 'hmmscan_DBs', str(args.HMM_DBS))):
logger.warning(f"hmm db directory is not found at")
logger.warning(f"hmm db directory version {str(args.HMM_DBS)} is not found at")
logger.warning(f"{os.path.join(str(args.C_DBS), 'hmmscan_DBs', str(args.HMM_DBS))}")
logger.warning(f"looking for others in {os.path.join(str(args.C_DBS), 'hmmscan_DBs')}")
for path, subdirs, files in os.walk(os.path.join(str(args.C_DBS), 'hmmscan_DBs')):
Expand Down Expand Up @@ -380,15 +389,15 @@ def log_subprocess_output(pipe):
### run the main script
process = Popen(['bash', str(cenote_script_path) + '/cenote_main.sh', str(cenote_script_path),
str(args.original_contigs), str(args.run_title), str(args.PROPHAGE), str(args.CPU),
str(__version__), str(args.ANNOTATION_MODE), str(args.template_file),
str(READS), str(args.circ_length_cutoff), str(args.linear_length_cutoff),
str(args.CIRC_MINIMUM_DOMAINS), str(args.LIN_MINIMUM_DOMAINS),
str(HALL_TYPE), str(args.C_DBS), str(args.HMM_DBS), str(args.WRAP),
str(args.CALLER), str(args.HHSUITE_TOOL),
str(__version__), str(args.ANNOTATION_MODE), str(out_directory),
str(args.template_file), str(READS), str(args.circ_length_cutoff),
str(args.linear_length_cutoff), str(args.CIRC_MINIMUM_DOMAINS),
str(args.LIN_MINIMUM_DOMAINS), str(HALL_TYPE), str(args.C_DBS), str(args.HMM_DBS),
str(args.WRAP), str(args.CALLER), str(args.HHSUITE_TOOL),
str(args.isolation_source), str(args.collection_date), str(args.metagenome_type),
str(args.srr_number), str(args.srx_number), str(args.biosample),
str(args.bioproject), str(args.ASSEMBLER), str(args.MOLECULE_TYPE),
str(args.DATA_SOURCE)],
str(args.DATA_SOURCE), str(args.GENBANK)],
stdout=PIPE, stderr=STDOUT)

with process.stdout:
Expand Down
42 changes: 21 additions & 21 deletions src/cenote/get_ct3_databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,39 +117,39 @@ def is_tool(name):

if str(args.HHCDD) == "True":
print ("running hhsuite CDD database update/install")
subprocess.call(['rm', '-r', '-f', str(args.C_DBS) + '/NCBI_CD/'])
isExist = os.path.exists(str(args.C_DBS) + '/NCBI_CD')
subprocess.call(['rm', '-r', '-f', str(args.C_DBS) + '/hhsearch_DBs/NCBI_CD/'])
isExist = os.path.exists(str(args.C_DBS) + '/hhsearch_DBs/NCBI_CD')
if not isExist:
os.makedirs(str(args.C_DBS) + '/NCBI_CD/', exist_ok=True)
subprocess.call(['wget', '--directory-prefix=' + str(args.C_DBS),
os.makedirs(str(args.C_DBS) + '/hhsearch_DBs/NCBI_CD/', exist_ok=True)
subprocess.call(['wget', '--directory-prefix=' + str(args.C_DBS) + '/hhsearch_DBs',
'https://zenodo.org/records/3660537/files/NCBI_CD_hhsuite.tgz'])
subprocess.call(['tar', '-xvf', f'{str(args.C_DBS)}/NCBI_CD_hhsuite.tgz',
'--directory', str(args.C_DBS)])
subprocess.call(['rm', '-f', f'{str(args.C_DBS)}/NCBI_CD_hhsuite.tgz'])
subprocess.call(['tar', '-xvf', f'{str(args.C_DBS)}/hhsearch_DBs/NCBI_CD_hhsuite.tgz',
'--directory', str(args.C_DBS) + '/hhsearch_DBs'])
subprocess.call(['rm', '-f', f'{str(args.C_DBS)}/hhsearch_DBs/NCBI_CD_hhsuite.tgz'])

if str(args.HHPFAM) == "True":
print ("running PFAM database update/install")
subprocess.call(['rm', '-r', '-f', str(args.C_DBS) + '/pfam_32_db/'])
isExist = os.path.exists(str(args.C_DBS) + '/pfam_32_db')
subprocess.call(['rm', '-r', '-f', str(args.C_DBS) + '/hhsearch_DBs/pfam_32_db/'])
isExist = os.path.exists(str(args.C_DBS) + '/hhsearch_DBs/pfam_32_db')
if not isExist:
os.makedirs(str(args.C_DBS) + '/pfam_32_db/', exist_ok=True)
subprocess.call(['wget', '--directory-prefix=' + str(args.C_DBS),
os.makedirs(str(args.C_DBS) + '/hhsearch_DBs/pfam_32_db/', exist_ok=True)
subprocess.call(['wget', '--directory-prefix=' + str(args.C_DBS) + '/hhsearch_DBs',
'http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/pfamA_32.0.tar.gz'])
subprocess.call(['tar', '-xvf', f'{str(args.C_DBS)}/pfamA_32.0.tar.gz',
'--directory', str(args.C_DBS) + '/pfam_32_db'])
subprocess.call(['rm', '-f', f'{str(args.C_DBS)}/pfamA_32.0.tar.gz'])
subprocess.call(['tar', '-xvf', f'{str(args.C_DBS)}/hhsearch_DBs/pfamA_32.0.tar.gz',
'--directory', str(args.C_DBS) + '/hhsearch_DBs/pfam_32_db'])
subprocess.call(['rm', '-f', f'{str(args.C_DBS)}/hhsearch_DBs/pfamA_32.0.tar.gz'])

if str(args.HHPDB) == "True":
print ("running PDB database update/install. This could take around 2 hours.")
subprocess.call(['rm', '-r', '-f', str(args.C_DBS) + '/pdb70/'])
isExist = os.path.exists(str(args.C_DBS) + '/pdb70')
subprocess.call(['rm', '-r', '-f', str(args.C_DBS) + '/hhsearch_DBs/pdb70/'])
isExist = os.path.exists(str(args.C_DBS) + '/hhsearch_DBs/pdb70')
if not isExist:
os.makedirs(str(args.C_DBS) + '/pdb70/', exist_ok=True)
subprocess.call(['wget', '--directory-prefix=' + str(args.C_DBS),
os.makedirs(str(args.C_DBS) + '/hhsearch_DBs/pdb70/', exist_ok=True)
subprocess.call(['wget', '--directory-prefix=' + str(args.C_DBS) + '/hhsearch_DBs',
'http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/pdb70_from_mmcif_latest.tar.gz'])
subprocess.call(['tar', '-xvf', f'{str(args.C_DBS)}/pdb70_from_mmcif_latest.tar.gz',
'--directory', str(args.C_DBS) + '/pdb70'])
subprocess.call(['rm', '-f', f'{str(args.C_DBS)}/pdb70_from_mmcif_latest.tar.gz'])
subprocess.call(['tar', '-xvf', f'{str(args.C_DBS)}/hhsearch_DBs/pdb70_from_mmcif_latest.tar.gz',
'--directory', str(args.C_DBS) + '/hhsearch_DBs/pdb70'])
subprocess.call(['rm', '-f', f'{str(args.C_DBS)}/hhsearch_DBs/pdb70_from_mmcif_latest.tar.gz'])

if __name__ == "__main__":
get_ct3_dbs()
2 changes: 1 addition & 1 deletion src/cenote/python_modules/combine_hallmark_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@

merge_dt = pd.merge(hits_dt, names_dt, on = 'contig', how = 'outer')

merge_dt = merge_dt.fillna(0)
merge_dt = merge_dt.infer_objects(copy=False).fillna(0)



Expand Down
35 changes: 33 additions & 2 deletions src/cenote/python_modules/hhpred_to_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,39 @@
#with this logic, only files with results (lines starting with >) append the df
if line.startswith('>'):
full_desc = line.rstrip('\n').strip('>')
accession = full_desc.split(' ; ')[0]
annotation = full_desc.split(' ; ')[2]

## CDD models
if line.startswith('>cd') or line.startswith('>sd'):
accession = full_desc.split(' ')[0]

try:
#annotation = full_desc.split('; ')[1].split('. ')[0]
annotation = full_desc.split(' ')[1].split(';')[0]
except:
annotation = 'hypothetical protein'

## PFAM models
elif line.startswith('>PF'):
accession = full_desc.split(' ; ')[0]

try:
annotation = full_desc.split('; ')[2]
except:
annotation = 'hypothetical protein'

## PDB models
else:
accession = full_desc.split(' ')[0]

try:
annotationpd = full_desc.split(' ')[1:]
annotationpd = ' '.join(annotationpd)
if ']' in annotationpd:
annotation = annotationpd.split(']')[1].split(';')[0]
else:
annotation = annotationpd.split(';')[0]
except:
annotation = 'hypothetical protein'

full_stats = next(hrh, '').strip()
probability = full_stats.split(' ')[0].split('=')[1]
Expand Down
46 changes: 37 additions & 9 deletions src/cenote/python_modules/make_sequin_fsas.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@

DATA_SOURCE = sys.argv[16]

GENBANK = sys.argv[17]

# make out dir
if not os.path.isdir(out_dir):
os.makedirs(out_dir)
Expand Down Expand Up @@ -84,6 +86,8 @@
repeat_df = pd.read_csv(repeat_file, sep = "\t")

#### loop each virus

desc_list = []
for seq_record in SeqIO.parse(final_contig_file, "fasta"):

#make a random alphanumeric ID 5 characters in length
Expand Down Expand Up @@ -131,14 +135,38 @@
topology = "circular"
else:
topology = "linear"

# here's the whole header string
header = (f">{str(seq_record.id)} [organism={organism} sp. ct{randID}] [gcode={gcode}] "
f"[topology={topology}] [note: taxonomic lineage {lineage}] [isolation_source={ISO_SOURCE}] "
f"[collection_date={COLLECT_DATE}] [metagenome_source={META_TYPE}] [SRA={SRR}] "
f"[note=genome binned from sequencing reads available in {SRX}] [Biosample={BIOSAMP}] "
f"[Bioproject={PRJ}] [moltype={MOL_TYPE}] [isolation_source={DATA_SOURCE}]")

seq_output_file = os.path.join(out_dir, str(seq_record.id) + ".fsa")
if GENBANK == 'True':
# here's the whole header string
header = (f">{str(seq_record.id)} [organism={organism} sp. ct{randID}] [gcode={gcode}] "
f"[topology={topology}] [note: taxonomic lineage {lineage}] [isolation_source={ISO_SOURCE}] "
f"[collection_date={COLLECT_DATE}] [metagenome_source={META_TYPE}] [SRA={SRR}] "
f"[note=genome binned from sequencing reads available in {SRX}] [Biosample={BIOSAMP}] "
f"[Bioproject={PRJ}] [moltype={MOL_TYPE}] [isolation_source={DATA_SOURCE}]")

seq_output_file = os.path.join(out_dir, str(seq_record.id) + ".fsa")

print(f"{header}\n{seq_record.seq}", file = open(seq_output_file, "a"))

## add outtable with contig, chunk, organism name (full)
try:
if "@" in seq_record.id:
contig = seq_record.id.split("@")[0]
chunkq = seq_record.id.split("@")[1]
else:
contig = seq_record.id
chunkq = None

fullorg = f'{organism} sp. ct{randID}'

desc_list.append([contig, chunkq, fullorg])
except:
print(f"{os.path.basename(__file__)}: seq record info parse failed.")


desc_df = pd.DataFrame(desc_list, columns=["contig", "chunk_name", "organism"])

if not desc_df.empty:
org_output_file = os.path.join(temp_dir, "contig_to_organism.tsv")

print(f"{header}\n{seq_record.seq}", file = open(seq_output_file, "a"))
desc_df.to_csv(org_output_file, sep = "\t", index = False)
15 changes: 11 additions & 4 deletions src/cenote/python_modules/make_sequin_tbls.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,15 @@

out_dir = sys.argv[5]

GENBANK = sys.argv[6]

if not os.path.isdir(out_dir):
os.makedirs(out_dir)

# load gene/contig table
gene_contig_df = pd.read_csv(gene_contig_file, sep = "\t")

gene_contig_df['chunk_name'] = gene_contig_df['chunk_name'].fillna("NaN")
gene_contig_df['chunk_name'] = gene_contig_df['chunk_name'].infer_objects(copy=False).fillna("NaN")

## quick_df is for adding chunk info to trnascan table via merge
quick_df = gene_contig_df[['contig', 'chunk_name', 'chunk_length',
Expand Down Expand Up @@ -68,8 +70,8 @@
tRNA_df = tRNA_df[['contig', 'chunk_name', 'gene_start', 'gene_stop', 'gene_name',
'gene_orient', 'evidence_description', 'evidence_acession', 'Evidence_source']]

tRNA_df['chunk_name'] = tRNA_df['chunk_name'].fillna("NaN")
quick_df['chunk_name'] = quick_df['chunk_name'].fillna("NaN")
tRNA_df['chunk_name'] = tRNA_df['chunk_name'].infer_objects(copy=False).fillna("NaN")
quick_df['chunk_name'] = quick_df['chunk_name'].infer_objects(copy=False).fillna("NaN")

tRNA_plus_df = pd.merge(tRNA_df, quick_df, on = ["contig", "chunk_name"], how = "left")

Expand Down Expand Up @@ -173,7 +175,7 @@


## need a chunk value for all viruses to properly group
merged_df['chunk_name'] = merged_df['chunk_name'].fillna("NaN")
merged_df['chunk_name'] = merged_df['chunk_name'].infer_objects(copy=False).fillna("NaN")

chunk_grouped_df = merged_df.groupby(['contig', 'chunk_name'], dropna = False)

Expand All @@ -185,7 +187,12 @@ def tbl_first_second(gstart, gstop, gorient):
else:
return gstop, gstart, gorient


if GENBANK == 'False':
sys.exit()

#### loop each virus

for name, seq_group in chunk_grouped_df:

# header line for tbl
Expand Down
7 changes: 5 additions & 2 deletions src/cenote/python_modules/summary_statement.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
prune_sum = sys.argv[4]
genes_sum = sys.argv[5]
ann_mode = sys.argv[6]
prune = sys.argv[7]

###

Expand Down Expand Up @@ -50,6 +51,8 @@
f'searched and {pc}{num_virus_contigs}{lc} viruses were detected and annotated.{endc}')
print(f'>>> {lc}In all, {pc}{nonhypo_genes / all_genes:.0%}{lc} of '
f'virus genes were annotated with functional information.{endc}')
print(f'>>> {pc}{over_10kb}{lc} contig(s) over 10kb went through pruning module '
f'and {pc}{pruned_seqs}{lc} were shortened by pruning.{endc}')
if prune == 'True':
print(f'>>> {pc}{over_10kb}{lc} contig(s) over 10kb went through pruning module '
f'and {pc}{pruned_seqs}{lc} were shortened by pruning.{endc}')

print('')
Loading

0 comments on commit 74e2d77

Please sign in to comment.