CobiontID · prototaxites · Nov 6, 2025 · Oct 8, 2025 · Oct 8, 2025 · Oct 9, 2025
diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml
@@ -18,7 +18,6 @@ permissions:
 
 jobs:
   build:
-
     runs-on: ubuntu-latest
 
     steps:

diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+test/
+**.pyc
+**/__pycache__/
diff --git a/envs/busco.yaml b/envs/busco.yaml
@@ -3,4 +3,4 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
-  - busco=5.2.2
+  - busco=6.0.0
diff --git a/envs/dataset.yaml b/envs/dataset.yaml
@@ -6,5 +6,4 @@ dependencies:
   - python=3.9
   - pip=21.2.1
   - kraken2
-  - pip:
-    - ncbi-datasets-pylib==12.6.0
+  - requests=2.32
diff --git a/envs/minimap.yaml b/envs/minimap.yaml
@@ -2,5 +2,5 @@ name: minimap
 channels:
   - bioconda
 dependencies:
-  - minimap2=2.17
-  - seqtk=1.3
+  - minimap2=2.30
+  - seqtk=1.5
diff --git a/scripts/BuscoConfig.py b/scripts/BuscoConfig.py
@@ -5,76 +5,121 @@
 import sys
 
 parser = argparse.ArgumentParser()
-parser.add_argument("-na", type=str, action='store', dest='namesfile', metavar='NAMES',help='NCBI names.dmp')
-parser.add_argument("-no", type=str, action='store', dest='nodesfile', metavar='NODES',help='NCBI nodes.dmp')
-parser.add_argument("-f", type=str, action='store', dest='genome', metavar='GENOME FASTA',help='fasta genome assembly file')
-parser.add_argument("-d", type=str, action='store', dest='dir', metavar='WORKDIR',help='define working directory for busco')
-parser.add_argument("-db", type=str, action='store', dest='db', help='define available dbs file')
-parser.add_argument("-dl", type=str, action='store', dest='download',help='define directory to store busco dbs')
-parser.add_argument("-c", type=int, action='store', dest='cpu',help='define cpus')
-parser.add_argument("-o", type=str, action='store', dest='out', metavar='OUTFILE',help='define configfile name')
-parser.add_argument('--version', action='version', version='%(prog)s 1.0')
+parser.add_argument(
+    "-na",
+    type=str,
+    action="store",
+    dest="namesfile",
+    metavar="NAMES",
+    help="NCBI names.dmp",
+)
+parser.add_argument(
+    "-no",
+    type=str,
+    action="store",
+    dest="nodesfile",
+    metavar="NODES",
+    help="NCBI nodes.dmp",
+)
+parser.add_argument(
+    "-f",
+    type=str,
+    action="store",
+    dest="genome",
+    metavar="GENOME FASTA",
+    help="fasta genome assembly file",
+)
+parser.add_argument(
+    "-d",
+    type=str,
+    action="store",
+    dest="dir",
+    metavar="WORKDIR",
+    help="define working directory for busco",
+)
+parser.add_argument(
+    "-db", type=str, action="store", dest="db", help="define available dbs file"
+)
+parser.add_argument(
+    "-dl",
+    type=str,
+    action="store",
+    dest="download",
+    help="define directory to store busco dbs",
+)
+parser.add_argument("-c", type=int, action="store", dest="cpu", help="define cpus")
+parser.add_argument(
+    "-o",
+    type=str,
+    action="store",
+    dest="out",
+    metavar="OUTFILE",
+    help="define configfile name",
+)
+parser.add_argument("--version", action="version", version="%(prog)s 1.0")
 args = parser.parse_args()
 
+
 def readNames(names_tax_file):
-    '''
+    """
     input:
     - name.dmp (NCBI Taxonomy)
     output:
     - dictionary of form {node: name}
     - dictionary of form {sci name: node}
-    '''
+    """
     tax_names = {}
-    tax_names_reverse= {}
-    with open(names_tax_file, 'r') as nodes_tax:
+    tax_names_reverse = {}
+    with open(names_tax_file, "r") as nodes_tax:
         for line in nodes_tax:
-            node = [field.strip() for field in line.split('|')]
-            if 'scientific' in line:
+            node = [field.strip() for field in line.split("|")]
+            if "scientific" in line:
                 tax_names[node[1]] = node[0]
                 tax_names_reverse[node[0]] = node[1]
-    return tax_names_reverse,tax_names
+    return tax_names_reverse, tax_names
 
-def readNodes(nodes_tax_file):
 
-    '''
+def readNodes(nodes_tax_file):
+    """
     input:
     - nodes.dmp (NCBI Taxonomy)
     output:
     - dictionary of form {parent: node}
     - dictionary of form {node: type}
-    '''
+    """
 
     tax_nodes = {}
     tax_types = {}
-    with open(nodes_tax_file, 'r') as nodes_tax:
+    with open(nodes_tax_file, "r") as nodes_tax:
         for line in nodes_tax:
-            node = [field.strip() for field in line.split('|')]         #make list of line
-            tax_nodes[node[0]] = node[1]                                #couple node with parent
-            tax_types[node[0]] = node[2]                                #couple node with rank
+            node = [field.strip() for field in line.split("|")]  # make list of line
+            tax_nodes[node[0]] = node[1]  # couple node with parent
+            tax_types[node[0]] = node[2]  # couple node with rank
     return tax_nodes
 
-taxparents=readNodes(args.nodesfile)
-taxnames,namestax=readNames(args.namesfile)
 
-genus=args.out.split('/config')[0].split('/')[-1]
-if '_' in genus:
-    genus=genus.replace('_',' ')
+taxparents = readNodes(args.nodesfile)
+taxnames, namestax = readNames(args.namesfile)
 
-busco_dbs=[]
-busco_short=[]
-m=open(args.db,'r')
+genus = args.out.split("/config")[0].split("/")[-1]
+if "_" in genus:
+    genus = genus.replace("_", " ")
+
+busco_dbs = []
+busco_short = []
+m = open(args.db, "r")
 for line in m:
-    line=line.strip()
-    if 'db' in line:
-        #if 'eukaryota' in line:
+    line = line.strip()
+    if "db" in line:
+        # if 'eukaryota' in line:
         #    break
-        dbname=line.split(' ')[-1].split('_odb')[0]
-        #print(dbname)
-        dbname2=line.split(' ')[-1].split('_')[0]
+        dbname = line.split(" ")[-1].split("_odb")[0]
+        # print(dbname)
+        dbname2 = line.split(" ")[-1].split("_")[0]
         busco_short.append(dbname2)
         busco_dbs.append(dbname)
 
-buscoset = 'Bacteria'
+buscoset = "Bacteria"
 if genus in namestax:
     taxid = namestax[genus]
     parent = taxparents[taxid]
@@ -83,61 +128,63 @@ def readNodes(nodes_tax_file):
         if taxnames[parent].lower() in busco_short:
             buscoset = taxnames[parent]
             print(buscoset)
-            if taxnames[parent].lower()+"_phylum" in busco_dbs:
-                buscoset = taxnames[parent].lower()+"_phylum"
+            if taxnames[parent].lower() + "_phylum" in busco_dbs:
+                buscoset = taxnames[parent].lower() + "_phylum"
             break
         parent = taxparents[parent]
 
-print(genus+'\t'+buscoset)
+print(genus + "\t" + buscoset)
+
+condadir = os.environ["CONDA_DEFAULT_ENV"]
 
-condadir = os.environ['CONDA_DEFAULT_ENV']
+config_content = f"""[busco_run]
+# Input file
+in = {args.genome}
+# Run name, used in output files and folder
+out = busco
+# Where to store the output directory
+out_path = {args.dir}
+# Path to the BUSCO dataset
+lineage_dataset = {buscoset.lower()}
+# Which mode to run (genome / proteins / transcriptome)
+mode = genome
+# How many threads to use for multithreaded steps
+cpu = {args.cpu}
+# Force rewrite if files already exist (True/False)
+;force = False
+# Local destination path for downloaded lineage datasets
+download_path = {args.download}
+;[tblastn]
+;path = {condadir}/bin/
+;command = tblastn
+;[makeblastdb]
+;path = {condadir}/bin/
+;command = makeblastdb
+;[augustus]
+;path = {condadir}/bin/
+;command = augustus
+;[etraining]
+;path = {condadir}/bin/
+;command = etraining
+;[gff2gbSmallDNA.pl]
+;path = {condadir}/bin/
+;command = gff2gbSmallDNA.pl
+;[new_species.pl]
+;path = {condadir}/bin/
+;command = new_species.pl
+;[optimize_augustus.pl]
+;path = {condadir}/bin/
+;command = optimize_augustus.pl
+;[hmmsearch]
+;path = {condadir}/bin/
+;command = hmmsearch
+;[sepp]
+;path = {condadir}/bin/
+;command = run_sepp.py
+;[prodigal]
+;path = {condadir}/bin/
+;command = prodigal
+"""
 
-l=open(args.out,'w')
-l.write('[busco_run]'+'\n')
-l.write('# Input file'+'\n')
-l.write('in = '+args.genome+'\n')
-l.write('# Run name, used in output files and folder'+'\n')
-l.write('out = busco'+'\n')
-l.write('# Where to store the output directory'+'\n')
-l.write('out_path = '+args.dir+'\n')
-l.write('# Path to the BUSCO dataset'+'\n')
-l.write('lineage_dataset =  '+buscoset.lower()+'\n')
-l.write('# Which mode to run (genome / proteins / transcriptome)'+'\n')
-l.write('mode = genome'+'\n')
-l.write('# How many threads to use for multithreaded steps'+'\n')
-l.write('cpu = '+str(args.cpu)+'\n')
-l.write('# Force rewrite if files already exist (True/False)'+'\n')
-l.write(';force = False'+'\n')
-l.write('# Local destination path for downloaded lineage datasets'+'\n')
-l.write('download_path = '+args.download+'\n')
-l.write('[tblastn]'+'\n')
-l.write('path = '+condadir+'/bin/'+'\n')
-l.write('command = tblastn'+'\n')
-l.write('[makeblastdb]'+'\n')
-l.write('path = '+condadir+'/bin/'+'\n')
-l.write('command = makeblastdb'+'\n')
-l.write('[augustus]'+'\n')
-l.write('path = '+condadir+'/bin/'+'\n')
-l.write('command = augustus'+'\n')
-l.write('[etraining]'+'\n')
-l.write('path = '+condadir+'/bin/'+'\n')
-l.write('command = etraining'+'\n')
-l.write('[gff2gbSmallDNA.pl]'+'\n')
-l.write('path = '+condadir+'/bin/'+'\n')
-l.write('command = gff2gbSmallDNA.pl'+'\n')
-l.write('[new_species.pl]'+'\n')
-l.write('path = '+condadir+'/bin/'+'\n')
-l.write('command = new_species.pl'+'\n')
-l.write('[optimize_augustus.pl]'+'\n')
-l.write('path = '+condadir+'/bin/'+'\n')
-l.write('command = optimize_augustus.pl'+'\n')
-l.write('[hmmsearch]'+'\n')
-l.write('path = '+condadir+'/bin/'+'\n')
-l.write('command = hmmsearch'+'\n')
-l.write('[sepp]'+'\n')
-l.write('path = '+condadir+'/bin/'+'\n')
-l.write('command = run_sepp.py'+'\n')
-l.write('[prodigal]'+'\n')
-l.write('path = '+condadir+'/bin/'+'\n')
-l.write('command = prodigal'+'\n')
-l.close()
+with open(args.out, "w") as l:
+    l.write(config_content)
-Original file line number
+Diff line change
@@ Expand Up / @@ -18,7 +18,6 @@ permissions: @@
     jobs:
       build:
         runs-on: ubuntu-latest
         steps:
@@ Expand Down @@