added tiledb writing function

avantikalal · avantikalal · commit baacaca88ad7 · 2025-02-11T00:10:44.000Z
diff --git a/src/grelu/data/preprocess.py b/src/grelu/data/preprocess.py
@@ -10,6 +10,7 @@
 import numpy as np
 import pandas as pd
 from anndata import AnnData
+from tqdm import tqdm
 
 from grelu.data.utils import get_chromosomes
 from grelu.utils import get_aggfunc
@@ -723,3 +724,126 @@ def make_insertion_bigwig(
     os.remove(bedgraph_file)
 
     return bw_file
+
+
+def write_tiledb(
+    output_path: str,
+    bw_files: Union[str, List[str]],
+    chroms: Optional[Union[str, List[str]]],
+    genome: str,
+    tasks: Optional[Union[List[str], pd.DataFrame]] = None,
+    num_threads=1,
+):
+    """
+    Write BigWig files and genome sequences to TileDB.
+
+    Args:
+        output_path: Directory where the output TileDB files should be stored.
+        bw_files: List of paths to BigWig files.
+        chroms: A list of chromosomes to read from the bigWig files. If not provided,
+            all chromosomes will be read.
+        genome: Name of the genome corresponding to the BigWig files
+        tasks: A list of task names or a pandas dataframe containing task information.
+            If a dataframe is supplied, the row indices should be the task names.
+        num_threads: Number of threads. Defaults to 1.
+    """
+    import multiprocessing
+    from multiprocessing import Pool
+
+    import pyBigWig
+    import tiledb
+    from genomicarrays import buildutils_tiledb_array as uta
+    from natsort import natsorted
+
+    from grelu.data.tdb_utils import _extract_chrom_cov, _extract_chrom_sequence
+    from grelu.data.utils import _create_task_data, get_chromosomes
+    from grelu.utils import make_list
+
+    if not os.path.isdir(output_path):
+        raise ValueError("'output_path' must be a directory.")
+
+    bw_files = make_list(bw_files)
+    tasks = tasks or [os.path.splitext(os.path.basename(f))[0] for f in bw_files]
+
+    # Check and format the chromosomes
+    lengths = []
+    chroms = get_chromosomes(chroms)
+    chroms = natsorted(chroms)
+
+    # Get chromosome lengths
+    for chrom in chroms:
+        length = None
+        for bw_file in bw_files:
+            with pyBigWig.open(bw_file) as f:
+                if chrom not in f.chroms():
+                    raise ValueError(f"Chromosome {chrom} not found in file {bw_file}")
+                else:
+                    if length is None:
+                        length = f.chroms(chrom)
+                    else:
+                        if length != f.chroms(chrom):
+                            raise ValueError(
+                                f"Chromosome {chrom} does not have the same length in all bigWig files"
+                            )
+        lengths.append(length)
+
+    # Create chromosome dataframe
+    chroms = pd.DataFrame({"chrom": chroms, "start": 0, "end": lengths})
+    chroms["uri"] = [f"{output_path}/{x}" for x in chroms.chrom]
+
+    # Write chromosome dataframe
+    _chroms_uri = f"{output_path}/chroms"
+    tiledb.from_pandas(_chroms_uri, chroms)
+
+    # Create task dataframe
+    tasks = tasks or [os.path.splitext(os.path.basename(f))[0] for f in bw_files]
+    if isinstance(tasks, List):
+        tasks = _create_task_data(tasks)
+
+    tasks["task_idx"] = range(1, len(tasks) + 1)
+    tasks["bigwig_path"] = bw_files
+
+    # Write task dataframe
+    _task_uri = f"{output_path}/tasks"
+    tiledb.from_pandas(_task_uri, tasks)
+
+    # Create empty array for each chromosome
+    for row in chroms.itertuples():
+        uta.create_tiledb_array(
+            row.uri,
+            matrix_attr_name="data",
+            matrix_dim_dtype=np.int8,
+            x_dim_length=1 + len(tasks),
+            y_dim_length=row.end,
+            is_sparse=False,
+        )
+
+    # Write sequences
+    print("Writing genome sequence")
+    chrom_options = [(genome, chroms.iloc[i]) for i in range(len(chroms))]
+    if num_threads > 1:
+        try:
+            multiprocessing.set_start_method("spawn", force=True)
+        except RuntimeError:
+            pass
+
+        with Pool(num_threads) as p:
+            p.map(_extract_chrom_sequence, chrom_options)
+    else:
+        for opt in tqdm(chrom_options):
+            _extract_chrom_sequence(opt)
+
+    # Writing the coverage
+    print("Writing coverage from BigWig files")
+    for i in range(len(chroms)):
+        print(chroms.chrom.iloc[i])
+        bw_options = [
+            (chroms.iloc[i], row.bigwig_path, row.task_idx)
+            for row in tasks.itertuples()
+        ]
+        if num_threads > 1:
+            with Pool(num_threads) as p:
+                p.map(_extract_chrom_cov, bw_options)
+        else:
+            for opt in tqdm(bw_options):
+                _extract_chrom_cov(opt)