Skip to content

Commit 0bcd2b9

Browse files
author
Tobias Andermann
committed
cleaning works with zipped files
1 parent 5682d7b commit 0bcd2b9

33 files changed

+5739
-4
lines changed

build/lib/secapr/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Resolve the package version once at import time; the helper is deleted
# afterwards so that only ``__version__`` stays in the package namespace.
from ._version import get_versions

__version__ = get_versions()['version']
del get_versions

build/lib/secapr/__main__.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# encoding: utf-8
2+
3+
#author: Tobias Andermann, tobias.andermann@bioenv.gu.se
4+
#__main__.py created by Estelle, based on IgDiscover (https://bitbucket.org/igdiscover/igdiscover)
5+
6+
import os
7+
import sys
8+
from argparse import ArgumentParser
9+
import logging
10+
import warnings
11+
from . import __version__
12+
import importlib
13+
14+
15+
__author__ = "Tobias Andermann"
16+
17+
# List of all subcommands. A module of the given name must exist and define
18+
# add_arguments() and main() functions.
19+
20+
COMMANDS = [
    'quality_check',
    'clean_reads',
    'assemble_reads',
    'find_target_contigs',
    'align_sequences',
    'join_exons',
    'reference_assembly',
    'phase_alleles',
    'add_missing_sequences',
    'locus_selection',
    'automate_all',
    'concatenate_alignments',
    'paralogs_to_ref',
    'plot_sequence_yield',
]
36+
37+
38+
def _first_doc_line(module):
    """Return the first non-blank line of *module*'s docstring, or None.

    Used as the one-line help text of a subcommand. Unlike indexing the
    split docstring at a fixed position, this does not fail when the
    docstring is missing or consists of a single line.
    """
    for line in (module.__doc__ or '').splitlines():
        stripped = line.strip()
        if stripped:
            return stripped
    return None


def main(arguments=None):
    """Parse the command line and run the selected secapr subcommand.

    Every name in ``COMMANDS`` is imported as a module of the ``secapr``
    package; its ``add_arguments()`` populates a dedicated subparser and its
    ``main()`` is stored as the handler to call.

    Args:
        arguments: Optional list of argument strings. ``None`` makes
            argparse read ``sys.argv``.
    """
    logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
    parser = ArgumentParser(description=__doc__, prog='secapr')
    parser.add_argument('--version', action='version', version='%(prog)s ' + __version__)

    subparsers = parser.add_subparsers()
    for command_name in COMMANDS:
        module = importlib.import_module('.' + command_name, 'secapr')
        subparser = subparsers.add_parser(
            command_name,
            help=_first_doc_line(module),
            description=module.__doc__,
        )
        # The handler is looked up again after parsing via ``args.func``.
        subparser.set_defaults(func=module.main)
        module.add_arguments(subparser)

    args = parser.parse_args(arguments)
    if not hasattr(args, 'func'):
        # No subcommand was given, so no subparser default set ``func``.
        parser.error('Please provide the name of a subcommand to run')
    else:
        args.func(args)
56+
57+
58+
# Entry point when this file is executed as a script or via ``python -m``.
if __name__ == '__main__':
    main()

build/lib/secapr/_version.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
2+
# This file was generated by 'versioneer.py' (0.17) from
3+
# revision-control system data, or from the parent directory name of an
4+
# unpacked source archive. Distribution tarballs contain a pre-generated copy
5+
# of this file.
6+
7+
import json
8+
9+
# Version metadata embedded as a JSON document; the trailing marker comment
# delimits the end of the JSON payload.
version_json = '''
{
 "date": "2022-03-19T18:33:01+0100",
 "dirty": true,
 "error": null,
 "full-revisionid": "5682d7ba9a0f8537df67c895e35ace5d29ae5540",
 "version": "2.2.3+9.g5682d7b.dirty"
}
'''  # END VERSION_JSON


def get_versions():
    """Return the embedded version metadata parsed into a dictionary."""
    parsed = json.loads(version_json)
    return parsed
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# encoding: utf-8
2+
#author: Tobias Hofmann, tobias.andermann@bioenv.gu.se
3+
"""
4+
This script will add dummy sequences (runs of 'n') for missing taxa in each alignment, making sure that all alignments in the input folder contain the same taxa (as required for e.g. *BEAST)
5+
"""
6+
7+
import os
8+
import sys
9+
import glob
10+
import shutil
11+
import configparser
12+
import pickle
13+
from .utils import CompletePath
14+
15+
def add_arguments(parser):
    """Register the options of this subcommand on *parser*.

    Adds the mandatory ``--input`` and ``--output`` options; both use the
    ``CompletePath`` action imported from ``.utils``.
    """
    directory_options = (
        ('--input', 'The directory containing fasta alignments'),
        ('--output', 'The output directory where results will be safed'),
    )
    for flag, description in directory_options:
        parser.add_argument(
            flag,
            required=True,
            action=CompletePath,
            default=None,
            help=description,
        )
30+
31+
32+
def read_fasta(fasta):
    """Yield ``(header, sequence)`` pairs from an iterable of FASTA lines.

    The header is the complete ``>`` line with trailing whitespace removed
    (the ``>`` itself is kept). The sequence is the concatenation of all
    following lines up to the next header. Lines that appear before the
    first header are discarded.
    """
    header = None
    chunks = []
    for raw_line in fasta:
        stripped = raw_line.rstrip()
        if not stripped.startswith(">"):
            chunks.append(stripped)
            continue
        # A new header closes the record collected so far, if there is one.
        if header:
            yield (header, ''.join(chunks))
        header = stripped
        chunks = []
    if header:
        yield (header, ''.join(chunks))
42+
43+
44+
def _list_alignment_files(work_dir):
    """Return the sorted names of all ``.fasta``/``.fa`` files in *work_dir*."""
    return sorted(
        entry for entry in os.listdir(work_dir)
        if entry.endswith((".fasta", ".fa"))
    )


def _copy_sequence_origin(input_dir, output_dir):
    """Best-effort copy of ``.secapr_files/sequence_origin.pickle``.

    Does nothing when the input folder has no such file. Any other failure
    is reported on stderr instead of being silently ignored, and never
    aborts the run.
    """
    pickle_in = os.path.join(input_dir, '.secapr_files/sequence_origin.pickle')
    if not os.path.isfile(pickle_in):
        return
    pickle_path = os.path.join(output_dir, '.secapr_files/sequence_origin.pickle')
    try:
        # NOTE(security): unpickling executes code embedded in the file, so
        # the input folder must come from a trusted source.
        with open(pickle_in, 'rb') as handle:
            sequence_origin = pickle.load(handle)
        # The hidden folder does not exist in a freshly created output
        # directory; without this the write below always fails.
        os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
        with open(pickle_path, 'wb') as handle:
            pickle.dump(sequence_origin, handle, protocol=pickle.HIGHEST_PROTOCOL)
    except Exception as err:
        print(
            'WARNING: could not copy %s to %s: %s' % (pickle_in, pickle_path, err),
            file=sys.stderr,
        )


def main(args):
    """Pad all alignments in ``args.input`` so they contain the same taxa.

    Every ``.fasta``/``.fa`` file in the input folder is rewritten to
    ``args.output`` under the same file name. Headers that occur in any
    input file but are absent from a given file are added to it with a
    sequence of ``n`` characters. Records are written sorted by header, one
    line for the header and one for the sequence.

    Args:
        args: Parsed command-line namespace providing ``input`` and
            ``output`` directory paths.
    """
    work_dir = args.input
    out_dir = args.output
    os.makedirs(out_dir, exist_ok=True)

    alignment_files = _list_alignment_files(work_dir)

    # First pass: collect the union of all headers across the alignments.
    all_headers = set()
    for file_name in alignment_files:
        with open(os.path.join(work_dir, file_name)) as handle:
            all_headers.update(name for name, _seq in read_fasta(handle))

    # Second pass: add a filler record for every header a file is missing.
    for file_name in alignment_files:
        with open(os.path.join(work_dir, file_name)) as handle:
            present_seq = list(read_fasta(handle))
        present_names = {name for name, _seq in present_seq}
        # The filler length follows the last record of the file; the
        # records of an alignment are presumably all of equal length.
        length_alignment = len(present_seq[-1][1]) if present_seq else 0
        fake_string = "n" * length_alignment
        simulated_seq = [
            (mistax, fake_string) for mistax in all_headers - present_names
        ]
        with open(os.path.join(out_dir, file_name), 'w') as out_fasta:
            for seqname, sequence in sorted(simulated_seq + present_seq):
                out_fasta.write(seqname + "\n")
                out_fasta.write(sequence + "\n")

    _copy_sequence_origin(args.input, args.output)

0 commit comments

Comments
 (0)