Merge pull request #298 from loosolab/dev

Release 0.17.1
loosolab · Dec 13, 2024 · 538a865 · 538a865
2 parents 50d0b55 + 738c441
commit 538a865
Show file tree

Hide file tree

Showing 8 changed files with 60 additions and 18 deletions.
diff --git a/CHANGES b/CHANGES
@@ -1,3 +1,8 @@
+## 0.17.1 (2024-12-13)
+- fixed a bug caused by usage of deprecated cython data types that arose after the update to numpy>=2
+- fixed a bug with SubMerge when input region files had more columns than expected
+- SubMerge received a new parameter to adjust the order of contigs for sorting
+
 ## 0.17.0 (2023-07-15)
 - Fixed bug when reading motifs in "long" format. An error is now written if the pfm/jaspar motif does not follow 4-row format.
 - Added transfac as possible input through biopython parser.

diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,7 @@
+[build-system]
+requires = [
+    "setuptools",  # needed pre-install
+    "numpy>=2.0",  # needed pre-install
+    "cython"
+]
+build-backend = "setuptools.build_meta"
diff --git a/tobias/__init__.py b/tobias/__init__.py
@@ -1 +1 @@
-__version__ = "0.17.0"
+__version__ = "0.17.1"
diff --git a/tobias/parsers.py b/tobias/parsers.py
@@ -572,6 +572,10 @@ def add_submerge_arguments(parser):
 	#Optional arguments
 	optional = parser.add_argument_group('Optional arguments')
 	optional.add_argument( "--output", default='./merged_TFBS_subset.tsv', help="Path for output file. If file name ends with .bed, no header column will be added. If file name ends with .xlsx, file will be converted into an excel file. Default: ./merged_TFBS_subset.tsv", type=os.path.abspath, dest="output")
+	optional.add_argument( "--order", 
+					   default=['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM'],
+					   help="Path to file containing the order of chormosomes to sort by. Entries should either be the first column of a table / one contig per line. Default: human chromosomes",
+					   type=os.path.abspath, dest="order")
 	optional.add_argument("--TFs", help="Path to the file containing the list of TFs to subset. File has to contain one column with the TFBS names in the same format used in the BINDetect output files/directories.", type=os.path.abspath, dest="tf", default=None)
 	optional = add_logger_args(optional)
 

diff --git a/tobias/tools/submerge.py b/tobias/tools/submerge.py
@@ -41,8 +41,21 @@ def run_submerge(args):
 
             regions_columns = len(regions_df.columns)
 
-    # convert df back to bedtool
-    regions_obj = BedTool.from_dataframe(regions_df)
+        # convert back to BedTool
+        regions_obj = BedTool.from_dataframe(regions_df)
+
+    elif regions_columns > 6:
+
+        logger.warning("Regions file contains more than 6 columns. Only the first 6 columns will be used.")
+
+        # convert to dataframe
+        regions_df = regions_obj.to_dataframe()
+
+        # remove columns
+        regions_df = regions_df.iloc[:, :6]
+
+        # convert back to BedTool
+        regions_obj = BedTool.from_dataframe(regions_df)
 
     # Read TFs
     if args.tf is not None:
@@ -86,10 +99,23 @@ def run_submerge(args):
 
     logger.debug('Sorting')
 
+    # get order of chromosomes
+    if type(args.order) == list:
+        contig_order = args.order
+    else:
+        order_df = pd.read_csv(args.order, sep="\t")
+        # get first column
+        contig_order = order_df.iloc[:, 0].tolist()
+
+    # add order column
+    df['contig_order'] = df['query chr'].apply(lambda x: contig_order.index(x) if x in contig_order else len(contig_order))
+
     # Sort the dataframe
-    df.sort_values(by=["query chr", "query start", "TFBS_name", "TFBS_chr", "TFBS_start"],
-                   key=lambda x: x.str.replace("chr", "").astype(int) if x.name in ["query chr", "TFBS_chr"] else x,
+    df.sort_values(by=["contig_order", "query start", "TFBS_name", "TFBS_start"],
                    inplace=True)
+
+    # Drop the order column
+    df.drop(columns=['contig_order'], inplace=True)
 
     if args.output.endswith(".xlsx"):
         df.to_excel(args.output, index=False)

diff --git a/tobias/utils/sequences.pyx b/tobias/utils/sequences.pyx
@@ -60,7 +60,7 @@ class NucleotideMatrix(SequenceMatrix):
 
 	@cython.boundscheck(False)
 	@cython.wraparound(False)
-	def add_sequence(self, np.ndarray[np.int_t, ndim=1] sequence, double amount = 1.0):
+	def add_sequence(self, np.ndarray[np.int64_t, ndim=1] sequence, double amount = 1.0):
 		""" Adds sequence to SequenceMatrix the number of times specified in amount """
 
 		cdef int Sm, m
@@ -90,7 +90,7 @@ class NucleotideMatrix(SequenceMatrix):
 
 	@cython.boundscheck(False)
 	@cython.wraparound(False)		
-	def add_background(self, np.ndarray[np.int_t, ndim=1] sequence, double amount = 1.0):
+	def add_background(self, np.ndarray[np.int64_t, ndim=1] sequence, double amount = 1.0):
 		""" Adds sequence to count of background nucleotides """
 
 		cdef int Sm, m
@@ -120,7 +120,7 @@ class NucleotideMatrix(SequenceMatrix):
 	@cython.boundscheck(False)	#dont check boundaries
 	@cython.cdivision(True)		#no check for zero division
 	@cython.wraparound(False) 	#dont deal with negative indices
-	def score_sequence(self, np.ndarray[np.int_t, ndim=1] sequence):
+	def score_sequence(self, np.ndarray[np.int64_t, ndim=1] sequence):
 		""" Score nucleotide sequence against motif """ 
 
 		cdef np.ndarray[np.float64_t, ndim=2] pssm = self.pssm
@@ -175,7 +175,7 @@ class DiNucleotideMatrix(SequenceMatrix):
 
 	@cython.boundscheck(False)
 	@cython.wraparound(False) 	#dont deal with negative indices
-	def add_sequence(self, np.ndarray[np.int_t, ndim=1] sequence, double amount = 1.0):
+	def add_sequence(self, np.ndarray[np.int64_t, ndim=1] sequence, double amount = 1.0):
 
 		cdef np.ndarray[np.float64_t, ndim=4] bias_counts = self.counts
 		cdef int L = self.length
@@ -200,7 +200,7 @@ class DiNucleotideMatrix(SequenceMatrix):
 
 	@cython.boundscheck(False)
 	@cython.wraparound(False)		
-	def add_background(self, np.ndarray[np.int_t, ndim=1] sequence, double amount = 1.0):
+	def add_background(self, np.ndarray[np.int64_t, ndim=1] sequence, double amount = 1.0):
 		""" Adds sequence to count of background nucleotides """
 
 		cdef np.ndarray[np.float64_t, ndim=4] bg_counts = self.bg_counts
@@ -260,7 +260,7 @@ class DiNucleotideMatrix(SequenceMatrix):
 	@cython.boundscheck(False)
 	@cython.cdivision(True)		#no check for zero division
 	@cython.wraparound(False) 	#dont deal with negative indices
-	def score_sequence(self, np.ndarray[np.int_t, ndim=1] sequence):
+	def score_sequence(self, np.ndarray[np.int64_t, ndim=1] sequence):
 		#Score nucleotide sequence against dinucleotide motif
 
 		cdef np.ndarray[np.float64_t, ndim=2] bias_PWM = self.bias_pwm_log
@@ -346,7 +346,7 @@ def nuc_to_num(str sequence):
 	""" Convert DNA sequence string to internal number format """
 
 	cdef int length = len(sequence)
-	cdef np.ndarray[np.int_t, ndim=1] num_sequence = np.zeros(length, dtype=int)
+	cdef np.ndarray[np.int64_t, ndim=1] num_sequence = np.zeros(length, dtype=int)
 	cdef int i, num
 	cdef str nuc
 
@@ -387,8 +387,8 @@ class GenomicSequence:
 
 		cdef str fasta = fasta_obj.fetch(self.region.chrom, self.region.start, self.region.end)
 		cdef int length = self.length
-		cdef np.ndarray[np.int_t, ndim=1] sequence = self.sequence
-		cdef np.ndarray[np.int_t, ndim=1] revcomp_sequence = self.revcomp
+		cdef np.ndarray[np.int64_t, ndim=1] sequence = self.sequence
+		cdef np.ndarray[np.int64_t, ndim=1] revcomp_sequence = self.revcomp
 		cdef int i, num, comp
 		cdef str nuc
 

diff --git a/tobias/utils/signals.pyx b/tobias/utils/signals.pyx
@@ -64,14 +64,14 @@ class SignalList(list):
 		pass
 
 #--------------------------------------------------------------------------------------------------#
-def shuffle_array(np.ndarray[np.float64_t, ndim=1] arr, int no_rand, np.ndarray[np.int_t, ndim=1] shift_options):
+def shuffle_array(np.ndarray[np.float64_t, ndim=1] arr, int no_rand, np.ndarray[np.int64_t, ndim=1] shift_options):
 	""" Shuffles array of values within the boundaries given in shift """
 
 	cdef int max_shift = max([abs(np.min(shift_options)), abs(np.max(shift_options))])
 	cdef np.ndarray[np.float64_t, ndim=1] ext_arr = np.concatenate((np.zeros(max_shift), arr, np.zeros(max_shift)))	 #pad with max shift to allow for shuffling outside borders
 	cdef int ext_arr_len = len(ext_arr)
 
-	cdef np.ndarray[np.int_t, ndim=1] nonzero_index = np.nonzero(ext_arr)[0]
+	cdef np.ndarray[np.int64_t, ndim=1] nonzero_index = np.nonzero(ext_arr)[0]
 	cdef int no_shift = len(nonzero_index)
 
 	cdef np.ndarray[np.int64_t, ndim=2] rand_rel_positions = np.random.choice(shift_options, size=(no_shift, no_rand)) 		#positions of shuffled reads

diff --git a/tobias_env.yaml b/tobias_env.yaml
@@ -5,7 +5,7 @@ channels:
   - conda-forge
 
 dependencies:
-  - python
+  - python>=3.12
   - pysam
   - pybigwig
   - samtools
@@ -14,7 +14,7 @@ dependencies:
   - bedtools
   - cython
   - matplotlib>=2
-  - numpy
+  - numpy>=2
   - scipy
   - pypdf2
   - scikit-learn