Skip to content

Commit

Permalink
Merge pull request #298 from loosolab/dev
Browse files Browse the repository at this point in the history
Release 0.17.1
  • Loading branch information
hschult authored Dec 13, 2024
2 parents 50d0b55 + 738c441 commit 538a865
Show file tree
Hide file tree
Showing 8 changed files with 60 additions and 18 deletions.
5 changes: 5 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.17.1 (2024-12-13)
- fixed a bug caused by usage of deprecated cython data types that arose after the update to numpy>=2
- fixed a bug with SubMerge when input region files had more columns than expected
- SubMerge received a new parameter to adjust the order of contigs used for sorting

## 0.17.0 (2023-07-15)
- Fixed bug when reading motifs in "long" format. An error is now written if the pfm/jaspar motif does not follow 4-row format.
- Added transfac as possible input through biopython parser.
Expand Down
7 changes: 7 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[build-system]
requires = [
"setuptools", # needed pre-install
"numpy>=2.0", # needed pre-install
"cython"
]
build-backend = "setuptools.build_meta"
2 changes: 1 addition & 1 deletion tobias/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.17.0"
__version__ = "0.17.1"
4 changes: 4 additions & 0 deletions tobias/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -572,6 +572,10 @@ def add_submerge_arguments(parser):
#Optional arguments
optional = parser.add_argument_group('Optional arguments')
optional.add_argument( "--output", default='./merged_TFBS_subset.tsv', help="Path for output file. If file name ends with .bed, no header column will be added. If file name ends with .xlsx, file will be converted into an excel file. Default: ./merged_TFBS_subset.tsv", type=os.path.abspath, dest="output")
optional.add_argument( "--order",
default=['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM'],
help="Path to file containing the order of chormosomes to sort by. Entries should either be the first column of a table / one contig per line. Default: human chromosomes",
type=os.path.abspath, dest="order")
optional.add_argument("--TFs", help="Path to the file containing the list of TFs to subset. File has to contain one column with the TFBS names in the same format used in the BINDetect output files/directories.", type=os.path.abspath, dest="tf", default=None)
optional = add_logger_args(optional)

Expand Down
34 changes: 30 additions & 4 deletions tobias/tools/submerge.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,21 @@ def run_submerge(args):

regions_columns = len(regions_df.columns)

# convert df back to bedtool
regions_obj = BedTool.from_dataframe(regions_df)
# convert back to BedTool
regions_obj = BedTool.from_dataframe(regions_df)

elif regions_columns > 6:

logger.warning("Regions file contains more than 6 columns. Only the first 6 columns will be used.")

# convert to dataframe
regions_df = regions_obj.to_dataframe()

# remove columns
regions_df = regions_df.iloc[:, :6]

# convert back to BedTool
regions_obj = BedTool.from_dataframe(regions_df)

# Read TFs
if args.tf is not None:
Expand Down Expand Up @@ -86,10 +99,23 @@ def run_submerge(args):

logger.debug('Sorting')

# get order of chromosomes
if type(args.order) == list:
contig_order = args.order
else:
order_df = pd.read_csv(args.order, sep="\t")
# get first column
contig_order = order_df.iloc[:, 0].tolist()

# add order column
df['contig_order'] = df['query chr'].apply(lambda x: contig_order.index(x) if x in contig_order else len(contig_order))

# Sort the dataframe
df.sort_values(by=["query chr", "query start", "TFBS_name", "TFBS_chr", "TFBS_start"],
key=lambda x: x.str.replace("chr", "").astype(int) if x.name in ["query chr", "TFBS_chr"] else x,
df.sort_values(by=["contig_order", "query start", "TFBS_name", "TFBS_start"],
inplace=True)

# Drop the order column
df.drop(columns=['contig_order'], inplace=True)

if args.output.endswith(".xlsx"):
df.to_excel(args.output, index=False)
Expand Down
18 changes: 9 additions & 9 deletions tobias/utils/sequences.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ class NucleotideMatrix(SequenceMatrix):

@cython.boundscheck(False)
@cython.wraparound(False)
def add_sequence(self, np.ndarray[np.int_t, ndim=1] sequence, double amount = 1.0):
def add_sequence(self, np.ndarray[np.int64_t, ndim=1] sequence, double amount = 1.0):
""" Adds sequence to SequenceMatrix the number of times specified in amount """

cdef int Sm, m
Expand Down Expand Up @@ -90,7 +90,7 @@ class NucleotideMatrix(SequenceMatrix):

@cython.boundscheck(False)
@cython.wraparound(False)
def add_background(self, np.ndarray[np.int_t, ndim=1] sequence, double amount = 1.0):
def add_background(self, np.ndarray[np.int64_t, ndim=1] sequence, double amount = 1.0):
""" Adds sequence to count of background nucleotides """

cdef int Sm, m
Expand Down Expand Up @@ -120,7 +120,7 @@ class NucleotideMatrix(SequenceMatrix):
@cython.boundscheck(False) #dont check boundaries
@cython.cdivision(True) #no check for zero division
@cython.wraparound(False) #dont deal with negative indices
def score_sequence(self, np.ndarray[np.int_t, ndim=1] sequence):
def score_sequence(self, np.ndarray[np.int64_t, ndim=1] sequence):
""" Score nucleotide sequence against motif """

cdef np.ndarray[np.float64_t, ndim=2] pssm = self.pssm
Expand Down Expand Up @@ -175,7 +175,7 @@ class DiNucleotideMatrix(SequenceMatrix):

@cython.boundscheck(False)
@cython.wraparound(False) #dont deal with negative indices
def add_sequence(self, np.ndarray[np.int_t, ndim=1] sequence, double amount = 1.0):
def add_sequence(self, np.ndarray[np.int64_t, ndim=1] sequence, double amount = 1.0):

cdef np.ndarray[np.float64_t, ndim=4] bias_counts = self.counts
cdef int L = self.length
Expand All @@ -200,7 +200,7 @@ class DiNucleotideMatrix(SequenceMatrix):

@cython.boundscheck(False)
@cython.wraparound(False)
def add_background(self, np.ndarray[np.int_t, ndim=1] sequence, double amount = 1.0):
def add_background(self, np.ndarray[np.int64_t, ndim=1] sequence, double amount = 1.0):
""" Adds sequence to count of background nucleotides """

cdef np.ndarray[np.float64_t, ndim=4] bg_counts = self.bg_counts
Expand Down Expand Up @@ -260,7 +260,7 @@ class DiNucleotideMatrix(SequenceMatrix):
@cython.boundscheck(False)
@cython.cdivision(True) #no check for zero division
@cython.wraparound(False) #dont deal with negative indices
def score_sequence(self, np.ndarray[np.int_t, ndim=1] sequence):
def score_sequence(self, np.ndarray[np.int64_t, ndim=1] sequence):
#Score nucleotide sequence against dinucleotide motif

cdef np.ndarray[np.float64_t, ndim=2] bias_PWM = self.bias_pwm_log
Expand Down Expand Up @@ -346,7 +346,7 @@ def nuc_to_num(str sequence):
""" Convert DNA sequence string to internal number format """

cdef int length = len(sequence)
cdef np.ndarray[np.int_t, ndim=1] num_sequence = np.zeros(length, dtype=int)
cdef np.ndarray[np.int64_t, ndim=1] num_sequence = np.zeros(length, dtype=int)
cdef int i, num
cdef str nuc

Expand Down Expand Up @@ -387,8 +387,8 @@ class GenomicSequence:

cdef str fasta = fasta_obj.fetch(self.region.chrom, self.region.start, self.region.end)
cdef int length = self.length
cdef np.ndarray[np.int_t, ndim=1] sequence = self.sequence
cdef np.ndarray[np.int_t, ndim=1] revcomp_sequence = self.revcomp
cdef np.ndarray[np.int64_t, ndim=1] sequence = self.sequence
cdef np.ndarray[np.int64_t, ndim=1] revcomp_sequence = self.revcomp
cdef int i, num, comp
cdef str nuc

Expand Down
4 changes: 2 additions & 2 deletions tobias/utils/signals.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,14 @@ class SignalList(list):
pass

#--------------------------------------------------------------------------------------------------#
def shuffle_array(np.ndarray[np.float64_t, ndim=1] arr, int no_rand, np.ndarray[np.int_t, ndim=1] shift_options):
def shuffle_array(np.ndarray[np.float64_t, ndim=1] arr, int no_rand, np.ndarray[np.int64_t, ndim=1] shift_options):
""" Shuffles array of values within the boundaries given in shift """

cdef int max_shift = max([abs(np.min(shift_options)), abs(np.max(shift_options))])
cdef np.ndarray[np.float64_t, ndim=1] ext_arr = np.concatenate((np.zeros(max_shift), arr, np.zeros(max_shift))) #pad with max shift to allow for shuffling outside borders
cdef int ext_arr_len = len(ext_arr)

cdef np.ndarray[np.int_t, ndim=1] nonzero_index = np.nonzero(ext_arr)[0]
cdef np.ndarray[np.int64_t, ndim=1] nonzero_index = np.nonzero(ext_arr)[0]
cdef int no_shift = len(nonzero_index)

cdef np.ndarray[np.int64_t, ndim=2] rand_rel_positions = np.random.choice(shift_options, size=(no_shift, no_rand)) #positions of shuffled reads
Expand Down
4 changes: 2 additions & 2 deletions tobias_env.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ channels:
- conda-forge

dependencies:
- python
- python>=3.12
- pysam
- pybigwig
- samtools
Expand All @@ -14,7 +14,7 @@ dependencies:
- bedtools
- cython
- matplotlib>=2
- numpy
- numpy>=2
- scipy
- pypdf2
- scikit-learn
Expand Down

0 comments on commit 538a865

Please sign in to comment.