Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/workflows/docker-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ permissions:

jobs:
build:

runs-on: ubuntu-latest

steps:
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
test/
**.pyc
**/__pycache__/
2 changes: 1 addition & 1 deletion envs/busco.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ channels:
- conda-forge
- bioconda
dependencies:
- busco=5.2.2
- busco=6.0.0
3 changes: 1 addition & 2 deletions envs/dataset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,4 @@ dependencies:
- python=3.9
- pip=21.2.1
- kraken2
- pip:
- ncbi-datasets-pylib==12.6.0
- requests=2.32
4 changes: 2 additions & 2 deletions envs/minimap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ name: minimap
channels:
- bioconda
dependencies:
- minimap2=2.17
- seqtk=1.3
- minimap2=2.30
- seqtk=1.5
229 changes: 138 additions & 91 deletions scripts/BuscoConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,76 +5,121 @@
import sys

# Command-line interface. Each option is registered exactly once: registering
# the same flag twice makes argparse raise a "conflicting option string" error.
# Options without an explicit type are stored as plain strings (the argparse
# default), so `type=str` / `action="store"` are not spelled out.
parser = argparse.ArgumentParser()
parser.add_argument("-na", dest="namesfile", metavar="NAMES", help="NCBI names.dmp")
parser.add_argument("-no", dest="nodesfile", metavar="NODES", help="NCBI nodes.dmp")
parser.add_argument(
    "-f", dest="genome", metavar="GENOME FASTA", help="fasta genome assembly file"
)
parser.add_argument(
    "-d", dest="dir", metavar="WORKDIR", help="define working directory for busco"
)
parser.add_argument("-db", dest="db", help="define available dbs file")
parser.add_argument(
    "-dl", dest="download", help="define directory to store busco dbs"
)
parser.add_argument("-c", type=int, dest="cpu", help="define cpus")
parser.add_argument(
    "-o", dest="out", metavar="OUTFILE", help="define configfile name"
)
parser.add_argument("--version", action="version", version="%(prog)s 1.0")
args = parser.parse_args()


def readNames(names_tax_file):
    """Read the scientific names from an NCBI Taxonomy ``names.dmp`` file.

    Each line is split on ``|`` and every field is stripped. Only rows whose
    name-class field (the fourth field) is exactly ``scientific name`` are
    kept; matching the word anywhere on the line would also pick up synonyms
    or common names that merely contain "scientific". Lines with fewer than
    four fields (e.g. blank lines) are ignored.

    input:
        - names_tax_file: path to names.dmp (NCBI Taxonomy)
    output:
        - dictionary of form {node: name}
        - dictionary of form {sci name: node}
    """
    tax_names = {}
    tax_names_reverse = {}
    with open(names_tax_file, "r") as names_tax:
        for line in names_tax:
            fields = [field.strip() for field in line.split("|")]
            if len(fields) < 4 or fields[3] != "scientific name":
                continue
            tax_names[fields[1]] = fields[0]
            tax_names_reverse[fields[0]] = fields[1]
    return tax_names_reverse, tax_names

def readNodes(nodes_tax_file):
    """Read the parent of every node from an NCBI Taxonomy ``nodes.dmp`` file.

    Each line is split on ``|`` and every field is stripped; the first field
    is used as the node and the second as its parent. Lines with fewer than
    two fields (e.g. a trailing blank line) are skipped instead of raising
    ``IndexError``.

    input:
        - nodes_tax_file: path to nodes.dmp (NCBI Taxonomy)
    output:
        - dictionary of form {node: parent}
    """
    tax_nodes = {}
    with open(nodes_tax_file, "r") as nodes_tax:
        for line in nodes_tax:
            fields = [field.strip() for field in line.split("|")]
            if len(fields) < 2:
                continue
            tax_nodes[fields[0]] = fields[1]  # couple node with parent
    return tax_nodes

# Load the taxonomy: node -> parent from nodes.dmp, plus the two name lookups
# returned by readNames (node -> scientific name, scientific name -> node).
taxparents = readNodes(args.nodesfile)
taxnames, namestax = readNames(args.namesfile)

# The genus is the last path component in front of the first "/config" in the
# output path; underscores in it are turned into spaces before it is looked up
# among the scientific names.
genus = args.out.split("/config")[0].split("/")[-1]
genus = genus.replace("_", " ")

# Collect the dataset names listed in the "available dbs" file. Only lines
# containing "db" are considered, and the last space-separated token of such a
# line is taken as the dataset identifier.
# NOTE(review): presumably this file is a BUSCO dataset listing with names
# like "<lineage>_odb<N>" -- confirm against the caller that produces it.
busco_dbs = []  # identifier with the "_odb..." suffix removed
busco_short = []  # identifier cut at its first underscore
with open(args.db, "r") as db_listing:
    for line in db_listing:
        line = line.strip()
        if "db" in line:
            dbname = line.split(" ")[-1].split("_odb")[0]
            dbname2 = line.split(" ")[-1].split("_")[0]
            busco_short.append(dbname2)
            busco_dbs.append(dbname)

buscoset = 'Bacteria'
buscoset = "Bacteria"
if genus in namestax:
taxid = namestax[genus]
parent = taxparents[taxid]
Expand All @@ -83,61 +128,63 @@ def readNodes(nodes_tax_file):
if taxnames[parent].lower() in busco_short:
buscoset = taxnames[parent]
print(buscoset)
if taxnames[parent].lower()+"_phylum" in busco_dbs:
buscoset = taxnames[parent].lower()+"_phylum"
if taxnames[parent].lower() + "_phylum" in busco_dbs:
buscoset = taxnames[parent].lower() + "_phylum"
break
parent = taxparents[parent]

print(genus+'\t'+buscoset)
print(genus + "\t" + buscoset)

# CONDA_DEFAULT_ENV is only interpolated into the commented-out (";") tool
# sections of the template below, so a missing variable must not abort the
# script; an empty string is substituted instead.
condadir = os.environ.get("CONDA_DEFAULT_ENV", "")

# BUSCO configuration, rendered in one piece. The lineage dataset is written
# lower-cased; the tool sections are emitted commented out.
config_content = f"""[busco_run]
# Input file
in = {args.genome}
# Run name, used in output files and folder
out = busco
# Where to store the output directory
out_path = {args.dir}
# Path to the BUSCO dataset
lineage_dataset = {buscoset.lower()}
# Which mode to run (genome / proteins / transcriptome)
mode = genome
# How many threads to use for multithreaded steps
cpu = {args.cpu}
# Force rewrite if files already exist (True/False)
;force = False
# Local destination path for downloaded lineage datasets
download_path = {args.download}
;[tblastn]
;path = {condadir}/bin/
;command = tblastn
;[makeblastdb]
;path = {condadir}/bin/
;command = makeblastdb
;[augustus]
;path = {condadir}/bin/
;command = augustus
;[etraining]
;path = {condadir}/bin/
;command = etraining
;[gff2gbSmallDNA.pl]
;path = {condadir}/bin/
;command = gff2gbSmallDNA.pl
;[new_species.pl]
;path = {condadir}/bin/
;command = new_species.pl
;[optimize_augustus.pl]
;path = {condadir}/bin/
;command = optimize_augustus.pl
;[hmmsearch]
;path = {condadir}/bin/
;command = hmmsearch
;[sepp]
;path = {condadir}/bin/
;command = run_sepp.py
;[prodigal]
;path = {condadir}/bin/
;command = prodigal
"""

# Write the config once; the context manager closes the file even on error.
with open(args.out, "w") as config_file:
    config_file.write(config_content)
Loading
Loading