Merge pull request #3 from mtisza1/update2

Update2: to v3.2.1
mtisza1 · Feb 9, 2024 · 74e2d77 · 74e2d77
2 parents 9b48a74 + 51f9173
commit 74e2d77
Show file tree

Hide file tree

Showing 12 changed files with 359 additions and 273 deletions.
diff --git a/environment/ct3_env.yaml b/environment/ct3_env.yaml
@@ -22,3 +22,4 @@ dependencies:
   - sed
   - grep
   - samtools
+  - pyarrow
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "cenotetaker3"
-version = "3.2.0"
+version = "3.2.1"
 authors = [
   { name="Mike Tisza", email="michael.tisza@gmail.com" },
 ]

diff --git a/src/cenote/cenote_main.sh b/src/cenote/cenote_main.sh
diff --git a/src/cenote/cenotetaker3.py b/src/cenote/cenotetaker3.py
@@ -79,11 +79,12 @@ def cenotetaker3():
 
     parentpath = Path(pathname).parents[1]
 
-    __version__ = "3.2.0"
+    __version__ = "3.2.1"
 
     Def_CPUs = os.cpu_count()
 
-    #def runner():
+    Def_workdir = os.getcwd()
+
     parser = argparse.ArgumentParser(description='Cenote-Taker 3 is a pipeline for virus discovery \
                                     and thorough annotation of viral contigs and genomes. Visit \
                                     https://github.com/mtisza1/Cenote-Taker3 for help. \
@@ -116,6 +117,9 @@ def cenotetaker3():
     optional_args.add_argument("-am", "--annotation_mode", dest="ANNOTATION_MODE", type=str2bool, default="False", 
                             help='Default: False -- Annotate sequences only (skip discovery). Only use if you believe \
                                 each provided sequence is viral')
+    optional_args.add_argument("-wd", "--working_directory", dest="c_workdir", type=str, default=Def_workdir, 
+                            help=f"Default: {Def_workdir} -- Set working directory with absolute or relative path. \
+                                run directory will be created within.")
     optional_args.add_argument("--template_file", dest="template_file", type=str, 
                             default=str(cenote_script_path) + '/dummy_template.sbt', 
                             help='Template file with some metadata. Real one required for GenBank submission. Takes a \
@@ -221,7 +225,8 @@ def cenotetaker3():
     optional_args.add_argument("--wrap", dest="WRAP", type=str2bool, default="True", 
                             help='Default: True -- Wrap/rotate DTR/circular contigs so the start codon of an ORF is \
                                 the first nucleotide in the contig/genome')
-
+    optional_args.add_argument("--genbank", dest="GENBANK", type=str2bool, default="True", 
+                            help='Default: True -- Make GenBank files (.gbf, .sqn, .fsa, .tbl, .cmt, etc)?')
 
     args = parser.parse_args()
 
@@ -236,12 +241,16 @@ def cenotetaker3():
     HALL_TYPE = ' '.join(map(str,args.HALL_LIST))
 
     ## make out directory (rename any existing directory)
-    if not os.path.isdir(str(args.run_title)):
-        os.makedirs(str(args.run_title))
+
+    out_directory = os.path.join(str(args.c_workdir), str(args.run_title))
+
+    if not os.path.isdir(out_directory):
+        os.makedirs(out_directory)
     else:
         randID = ''.join(random.choices(string.ascii_uppercase + string.digits, k=5))
-        os.rename(str(args.run_title), f"{str(args.run_title)}_old_{randID}")
-        os.makedirs(str(args.run_title))
+        movdir = f"{out_directory}_old_{randID}"
+        os.rename(out_directory, movdir)
+        os.makedirs(out_directory)
 
     #### define logger #####
     logger = logging.getLogger("cenote_logger")
@@ -250,7 +259,7 @@ def cenotetaker3():
     stream_handler = logging.StreamHandler()
     stream_handler.setLevel(logging.DEBUG)
 
-    file_handler = logging.FileHandler(os.path.join(str(args.run_title), f"{str(args.run_title)}_cenotetaker.log"))
+    file_handler = logging.FileHandler(os.path.join(out_directory, f"{str(args.run_title)}_cenotetaker.log"))
     file_handler.setLevel(logging.DEBUG)
 
     logger.addHandler(file_handler)
@@ -283,7 +292,7 @@ def check_ct3_dbs():
             sys.exit()
         ## checking hmm dir
         if not os.path.isdir(os.path.join(str(args.C_DBS), 'hmmscan_DBs', str(args.HMM_DBS))):
-            logger.warning(f"hmm db directory is not found at")
+            logger.warning(f"hmm db directory version {str(args.HMM_DBS)} is not found at")
             logger.warning(f"{os.path.join(str(args.C_DBS), 'hmmscan_DBs', str(args.HMM_DBS))}")
             logger.warning(f"looking for others in {os.path.join(str(args.C_DBS), 'hmmscan_DBs')}")
             for path, subdirs, files in os.walk(os.path.join(str(args.C_DBS), 'hmmscan_DBs')):
@@ -380,15 +389,15 @@ def log_subprocess_output(pipe):
     ### run the main script
     process = Popen(['bash', str(cenote_script_path) + '/cenote_main.sh', str(cenote_script_path), 
                     str(args.original_contigs), str(args.run_title), str(args.PROPHAGE), str(args.CPU),  
-                    str(__version__), str(args.ANNOTATION_MODE), str(args.template_file),
-                    str(READS), str(args.circ_length_cutoff), str(args.linear_length_cutoff),
-                    str(args.CIRC_MINIMUM_DOMAINS), str(args.LIN_MINIMUM_DOMAINS), 
-                    str(HALL_TYPE), str(args.C_DBS), str(args.HMM_DBS), str(args.WRAP), 
-                    str(args.CALLER), str(args.HHSUITE_TOOL), 
+                    str(__version__), str(args.ANNOTATION_MODE), str(out_directory), 
+                    str(args.template_file), str(READS), str(args.circ_length_cutoff), 
+                    str(args.linear_length_cutoff), str(args.CIRC_MINIMUM_DOMAINS), 
+                    str(args.LIN_MINIMUM_DOMAINS), str(HALL_TYPE), str(args.C_DBS), str(args.HMM_DBS), 
+                    str(args.WRAP), str(args.CALLER), str(args.HHSUITE_TOOL), 
                     str(args.isolation_source), str(args.collection_date), str(args.metagenome_type), 
                     str(args.srr_number), str(args.srx_number), str(args.biosample), 
                     str(args.bioproject), str(args.ASSEMBLER), str(args.MOLECULE_TYPE), 
-                    str(args.DATA_SOURCE)],
+                    str(args.DATA_SOURCE), str(args.GENBANK)],
                     stdout=PIPE, stderr=STDOUT)
 
     with process.stdout:

diff --git a/src/cenote/get_ct3_databases.py b/src/cenote/get_ct3_databases.py
@@ -117,39 +117,39 @@ def is_tool(name):
 
     if str(args.HHCDD) == "True":
         print ("running hhsuite CDD database update/install")
-        subprocess.call(['rm', '-r', '-f', str(args.C_DBS) + '/NCBI_CD/'])
-        isExist = os.path.exists(str(args.C_DBS) + '/NCBI_CD')
+        subprocess.call(['rm', '-r', '-f', str(args.C_DBS) + '/hhsearch_DBs/NCBI_CD/'])
+        isExist = os.path.exists(str(args.C_DBS) + '/hhsearch_DBs/NCBI_CD')
         if not isExist:
-            os.makedirs(str(args.C_DBS) + '/NCBI_CD/', exist_ok=True)
-        subprocess.call(['wget', '--directory-prefix=' + str(args.C_DBS), 
+            os.makedirs(str(args.C_DBS) + '/hhsearch_DBs/NCBI_CD/', exist_ok=True)
+        subprocess.call(['wget', '--directory-prefix=' + str(args.C_DBS) + '/hhsearch_DBs', 
                          'https://zenodo.org/records/3660537/files/NCBI_CD_hhsuite.tgz'])
-        subprocess.call(['tar', '-xvf', f'{str(args.C_DBS)}/NCBI_CD_hhsuite.tgz', 
-                         '--directory', str(args.C_DBS)])
-        subprocess.call(['rm', '-f', f'{str(args.C_DBS)}/NCBI_CD_hhsuite.tgz'])
+        subprocess.call(['tar', '-xvf', f'{str(args.C_DBS)}/hhsearch_DBs/NCBI_CD_hhsuite.tgz', 
+                         '--directory', str(args.C_DBS) + '/hhsearch_DBs'])
+        subprocess.call(['rm', '-f', f'{str(args.C_DBS)}/hhsearch_DBs/NCBI_CD_hhsuite.tgz'])
 
     if str(args.HHPFAM) == "True":
         print ("running PFAM database update/install")
-        subprocess.call(['rm', '-r', '-f', str(args.C_DBS) + '/pfam_32_db/'])
-        isExist = os.path.exists(str(args.C_DBS) + '/pfam_32_db')
+        subprocess.call(['rm', '-r', '-f', str(args.C_DBS) + '/hhsearch_DBs/pfam_32_db/'])
+        isExist = os.path.exists(str(args.C_DBS) + '/hhsearch_DBs/pfam_32_db')
         if not isExist:
-            os.makedirs(str(args.C_DBS) + '/pfam_32_db/', exist_ok=True)
-        subprocess.call(['wget', '--directory-prefix=' + str(args.C_DBS), 
+            os.makedirs(str(args.C_DBS) + '/hhsearch_DBs/pfam_32_db/', exist_ok=True)
+        subprocess.call(['wget', '--directory-prefix=' + str(args.C_DBS) + '/hhsearch_DBs', 
                          'http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/pfamA_32.0.tar.gz'])
-        subprocess.call(['tar', '-xvf', f'{str(args.C_DBS)}/pfamA_32.0.tar.gz', 
-                         '--directory', str(args.C_DBS) + '/pfam_32_db'])
-        subprocess.call(['rm', '-f', f'{str(args.C_DBS)}/pfamA_32.0.tar.gz'])
+        subprocess.call(['tar', '-xvf', f'{str(args.C_DBS)}/hhsearch_DBs/pfamA_32.0.tar.gz', 
+                         '--directory', str(args.C_DBS) + '/hhsearch_DBs/pfam_32_db'])
+        subprocess.call(['rm', '-f', f'{str(args.C_DBS)}/hhsearch_DBs/pfamA_32.0.tar.gz'])
 
     if str(args.HHPDB) == "True":
         print ("running PDB database update/install. This could take around 2 hours.")
-        subprocess.call(['rm', '-r', '-f', str(args.C_DBS) + '/pdb70/'])
-        isExist = os.path.exists(str(args.C_DBS) + '/pdb70')
+        subprocess.call(['rm', '-r', '-f', str(args.C_DBS) + '/hhsearch_DBs/pdb70/'])
+        isExist = os.path.exists(str(args.C_DBS) + '/hhsearch_DBs/pdb70')
         if not isExist:
-            os.makedirs(str(args.C_DBS) + '/pdb70/', exist_ok=True)
-        subprocess.call(['wget', '--directory-prefix=' + str(args.C_DBS), 
+            os.makedirs(str(args.C_DBS) + '/hhsearch_DBs/pdb70/', exist_ok=True)
+        subprocess.call(['wget', '--directory-prefix=' + str(args.C_DBS) + '/hhsearch_DBs', 
                          'http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/pdb70_from_mmcif_latest.tar.gz'])
-        subprocess.call(['tar', '-xvf', f'{str(args.C_DBS)}/pdb70_from_mmcif_latest.tar.gz', 
-                         '--directory', str(args.C_DBS) + '/pdb70'])
-        subprocess.call(['rm', '-f', f'{str(args.C_DBS)}/pdb70_from_mmcif_latest.tar.gz'])
+        subprocess.call(['tar', '-xvf', f'{str(args.C_DBS)}/hhsearch_DBs/pdb70_from_mmcif_latest.tar.gz', 
+                         '--directory', str(args.C_DBS) + '/hhsearch_DBs/pdb70'])
+        subprocess.call(['rm', '-f', f'{str(args.C_DBS)}/hhsearch_DBs/pdb70_from_mmcif_latest.tar.gz'])
 
 if __name__ == "__main__":
     get_ct3_dbs()
diff --git a/src/cenote/python_modules/combine_hallmark_counts.py b/src/cenote/python_modules/combine_hallmark_counts.py
@@ -63,7 +63,7 @@
 
 merge_dt = pd.merge(hits_dt, names_dt, on = 'contig', how = 'outer')
 
-merge_dt = merge_dt.fillna(0)
+merge_dt = merge_dt.infer_objects(copy=False).fillna(0)
 
 
 

diff --git a/src/cenote/python_modules/hhpred_to_table.py b/src/cenote/python_modules/hhpred_to_table.py
@@ -32,8 +32,39 @@
             #with this logic, only files with results (lines starting with >) append the df
             if line.startswith('>'):
                 full_desc = line.rstrip('\n').strip('>')
-                accession = full_desc.split(' ; ')[0]
-                annotation = full_desc.split(' ; ')[2]
+
+                ## CDD models
+                if line.startswith('>cd') or line.startswith('>sd'):
+                    accession = full_desc.split(' ')[0]
+
+                    try:
+                        #annotation = full_desc.split('; ')[1].split('. ')[0]
+                        annotation = full_desc.split(' ')[1].split(';')[0]
+                    except:
+                        annotation = 'hypothetical protein'
+
+                ## PFAM models
+                elif line.startswith('>PF'):
+                    accession = full_desc.split(' ; ')[0]
+
+                    try:
+                        annotation = full_desc.split('; ')[2]
+                    except:
+                        annotation = 'hypothetical protein'
+
+                ## PDB models
+                else:
+                    accession = full_desc.split(' ')[0]
+
+                    try:
+                        annotationpd = full_desc.split(' ')[1:]
+                        annotationpd = ' '.join(annotationpd)
+                        if ']' in annotationpd:
+                            annotation = annotationpd.split(']')[1].split(';')[0]
+                        else:
+                            annotation = annotationpd.split(';')[0]
+                    except:
+                        annotation = 'hypothetical protein'
 
                 full_stats = next(hrh, '').strip()
                 probability = full_stats.split('  ')[0].split('=')[1]

diff --git a/src/cenote/python_modules/make_sequin_fsas.py b/src/cenote/python_modules/make_sequin_fsas.py
@@ -43,6 +43,8 @@
 
 DATA_SOURCE = sys.argv[16]
 
+GENBANK = sys.argv[17]
+
 # make out dir
 if not os.path.isdir(out_dir):
     os.makedirs(out_dir)
@@ -84,6 +86,8 @@
 repeat_df = pd.read_csv(repeat_file, sep = "\t")
 
 #### loop each virus
+
+desc_list = []
 for seq_record in SeqIO.parse(final_contig_file, "fasta"):
 
     #make a random alphanumeric ID 5 characters in length
@@ -131,14 +135,38 @@
             topology = "circular"
         else:
             topology = "linear"
-
-    # here's the whole header string
-    header = (f">{str(seq_record.id)} [organism={organism} sp. ct{randID}] [gcode={gcode}] "
-        f"[topology={topology}] [note: taxonomic lineage {lineage}] [isolation_source={ISO_SOURCE}] "
-   	    f"[collection_date={COLLECT_DATE}] [metagenome_source={META_TYPE}] [SRA={SRR}] "
-        f"[note=genome binned from sequencing reads available in {SRX}] [Biosample={BIOSAMP}] "
-        f"[Bioproject={PRJ}] [moltype={MOL_TYPE}] [isolation_source={DATA_SOURCE}]")
 
-    seq_output_file = os.path.join(out_dir, str(seq_record.id) + ".fsa")
+    if GENBANK == 'True':
+        # here's the whole header string
+        header = (f">{str(seq_record.id)} [organism={organism} sp. ct{randID}] [gcode={gcode}] "
+            f"[topology={topology}] [note: taxonomic lineage {lineage}] [isolation_source={ISO_SOURCE}] "
+            f"[collection_date={COLLECT_DATE}] [metagenome_source={META_TYPE}] [SRA={SRR}] "
+            f"[note=genome binned from sequencing reads available in {SRX}] [Biosample={BIOSAMP}] "
+            f"[Bioproject={PRJ}] [moltype={MOL_TYPE}] [isolation_source={DATA_SOURCE}]")
+
+        seq_output_file = os.path.join(out_dir, str(seq_record.id) + ".fsa")
+
+        print(f"{header}\n{seq_record.seq}", file = open(seq_output_file, "a"))
+
+    ## add outtable with contig, chunk, organism name (full)
+    try:
+        if "@" in seq_record.id:
+            contig = seq_record.id.split("@")[0]
+            chunkq = seq_record.id.split("@")[1]
+        else:
+            contig = seq_record.id
+            chunkq = None
+
+        fullorg = f'{organism} sp. ct{randID}'
+
+        desc_list.append([contig, chunkq, fullorg])
+    except:
+        print(f"{os.path.basename(__file__)}: seq record info parse failed.")
+
+
+desc_df = pd.DataFrame(desc_list, columns=["contig", "chunk_name", "organism"])
+
+if not desc_df.empty:
+    org_output_file = os.path.join(temp_dir, "contig_to_organism.tsv")
 
-    print(f"{header}\n{seq_record.seq}", file = open(seq_output_file, "a"))
+    desc_df.to_csv(org_output_file, sep = "\t", index = False)
diff --git a/src/cenote/python_modules/make_sequin_tbls.py b/src/cenote/python_modules/make_sequin_tbls.py
@@ -18,13 +18,15 @@
 
 out_dir = sys.argv[5]
 
+GENBANK = sys.argv[6]
+
 if not os.path.isdir(out_dir):
     os.makedirs(out_dir)
 
 # load gene/contig table
 gene_contig_df = pd.read_csv(gene_contig_file, sep = "\t")
 
-gene_contig_df['chunk_name'] = gene_contig_df['chunk_name'].fillna("NaN")
+gene_contig_df['chunk_name'] = gene_contig_df['chunk_name'].infer_objects(copy=False).fillna("NaN")
 
 ## quick_df is for adding chunk info to trnascan table via merge
 quick_df = gene_contig_df[['contig', 'chunk_name', 'chunk_length', 
@@ -68,8 +70,8 @@
     tRNA_df = tRNA_df[['contig', 'chunk_name', 'gene_start', 'gene_stop', 'gene_name', 
                     'gene_orient', 'evidence_description', 'evidence_acession', 'Evidence_source']]
 
-    tRNA_df['chunk_name'] = tRNA_df['chunk_name'].fillna("NaN")
-    quick_df['chunk_name'] = quick_df['chunk_name'].fillna("NaN")
+    tRNA_df['chunk_name'] = tRNA_df['chunk_name'].infer_objects(copy=False).fillna("NaN")
+    quick_df['chunk_name'] = quick_df['chunk_name'].infer_objects(copy=False).fillna("NaN")
 
     tRNA_plus_df = pd.merge(tRNA_df, quick_df, on = ["contig", "chunk_name"], how = "left")
 
@@ -173,7 +175,7 @@
 
 
 ## need a chunk value for all viruses to properly group
-merged_df['chunk_name'] = merged_df['chunk_name'].fillna("NaN")
+merged_df['chunk_name'] = merged_df['chunk_name'].infer_objects(copy=False).fillna("NaN")
 
 chunk_grouped_df = merged_df.groupby(['contig', 'chunk_name'], dropna = False)
 
@@ -185,7 +187,12 @@ def tbl_first_second(gstart, gstop, gorient):
     else:
         return gstop, gstart, gorient
 
+
+if GENBANK == 'False':
+    sys.exit()
+
 #### loop each virus 
+
 for name, seq_group in chunk_grouped_df:
 
     # header line for tbl

diff --git a/src/cenote/python_modules/summary_statement.py b/src/cenote/python_modules/summary_statement.py
@@ -14,6 +14,7 @@
 prune_sum = sys.argv[4]
 genes_sum = sys.argv[5]
 ann_mode = sys.argv[6]
+prune = sys.argv[7]
 
 ###
 
@@ -50,6 +51,8 @@
             f'searched and {pc}{num_virus_contigs}{lc} viruses were detected and annotated.{endc}')
       print(f'>>> {lc}In all, {pc}{nonhypo_genes / all_genes:.0%}{lc} of '
             f'virus genes were annotated with functional information.{endc}')
-      print(f'>>> {pc}{over_10kb}{lc} contig(s) over 10kb went through pruning module '
-            f'and {pc}{pruned_seqs}{lc} were shortened by pruning.{endc}')
+      if prune == 'True':
+            print(f'>>> {pc}{over_10kb}{lc} contig(s) over 10kb went through pruning module '
+                  f'and {pc}{pruned_seqs}{lc} were shortened by pruning.{endc}')
+
 print('')