Changes from all commits
Commits
70 commits
bbbbda9
docs: fix version format to be vX.Y.Z
d-laub Mar 10, 2025
2719433
feat: initial prototype for splicing.
d-laub Mar 11, 2025
9804e0b
feat(wip): testing spliced return values
d-laub Mar 11, 2025
ea039af
Merge branch 'main' into dlaub/splice
d-laub Apr 2, 2025
13229c9
feat!: move indices and transformation to torch dataset/dataloader AP…
d-laub Apr 4, 2025
f169d4b
test: update for breaking changes in API.
d-laub Apr 4, 2025
13dfad9
feat: add members to conveniently inspect dataset splicing info.
d-laub Apr 6, 2025
3df050a
fix: spliced i2d_map
d-laub Apr 8, 2025
53716e2
fix: __getitem__ type annotations for StrIdx
d-laub Apr 8, 2025
db33714
Merge branch 'main' into dlaub/splice
d-laub Apr 18, 2025
db849e0
fix: update spliced_bed in with_settings for splice_info
d-laub Apr 18, 2025
16cf149
fix: parsing splice info and returning single item instead of list
d-laub Apr 18, 2025
353750b
chore: wip for fixing cat_length
d-laub Apr 20, 2025
17050b9
chore: fix cat_helper for splicing
d-laub Apr 21, 2025
bd5525c
chore: wip on svar support
d-laub Apr 21, 2025
d773a25
feat: SVAR support passes all tests
d-laub Apr 23, 2025
c7b606b
fix: add spanning dels to test and fix hap ilens for this case
d-laub Apr 23, 2025
cb8129a
Merge branch 'dlaub/svar' into dlaub/splice
d-laub Apr 24, 2025
892ced2
fix: variant index -> variant info mapping
d-laub Apr 24, 2025
f054895
build: update dependencies
d-laub Apr 25, 2025
3f06258
chore: wip on svar support
d-laub Apr 21, 2025
049e9a8
feat: SVAR support passes all tests
d-laub Apr 23, 2025
ba2d1c6
fix: add spanning dels to test and fix hap ilens for this case
d-laub Apr 23, 2025
e1e6f78
fix: continue migrating to seqpro Ragged, enable logger at module lev…
d-laub Apr 29, 2025
2650716
bump: version 0.12.0 → 0.13.0
github-actions[bot] Apr 30, 2025
222daef
build: change gh workflows to run on stable
d-laub Apr 30, 2025
d820b7a
build: change tag format
d-laub Apr 30, 2025
6630714
build: bump dependencies
d-laub Apr 30, 2025
6b8fa5d
docs: update requirements
d-laub Apr 30, 2025
7537162
docs: annotate doc requirements
d-laub Apr 30, 2025
d36be00
build: bump rust extension, python ABI compatibility
d-laub Apr 30, 2025
6fa4244
chore: merge conflicts
d-laub Apr 30, 2025
e9eeb0a
Merge branch 'main' into dlaub/splice
d-laub Apr 30, 2025
248d959
style: ignore type error on view using str argument
d-laub Apr 30, 2025
9c6988f
Merge branch 'main' into dlaub/splice
d-laub Apr 30, 2025
1d71a3d
Merge branch 'main' into dlaub/splice
d-laub Apr 30, 2025
5665fcb
Merge branch 'main' into dlaub/splice
d-laub May 9, 2025
d0aa8b9
ci: update lockfile
d-laub May 9, 2025
14e90b4
test: remove return indices option
d-laub May 9, 2025
86947f6
test: more precise types
d-laub May 9, 2025
6aee077
fix: map contig names appropriately for bounds checking on ds regions…
d-laub May 9, 2025
bf26df7
Merge branch 'main' into dlaub/splice
d-laub May 10, 2025
90b0172
style: ruff formatting
d-laub May 10, 2025
bbfbfd5
ci: update lockfile
d-laub May 10, 2025
e924936
ci: update publish workflow
d-laub May 10, 2025
47ca825
ci: update publish workflow
d-laub May 10, 2025
b2a295c
bump: version 0.14.2 → 0.14.3
github-actions[bot] May 10, 2025
9923cdd
ci: update publish workflow name
d-laub May 10, 2025
0a566b5
ci: update publish workflow
d-laub May 10, 2025
b480940
ci: update publish workflow
d-laub May 10, 2025
c2b1e3e
ci: update workflows
d-laub May 10, 2025
d21485d
ci: update workflows
d-laub May 10, 2025
14f6725
docs: test if py3.11 fixes pgenlib installation
d-laub May 11, 2025
fe7a2c9
fix: data corruption when rc_helper is parallelized
d-laub May 12, 2025
14a83a3
bump: version 0.14.3 → 0.14.4
github-actions[bot] May 12, 2025
68c0c56
test: add tests for reverse complemented data
d-laub May 12, 2025
e231ff9
Merge branch 'main' into dlaub/splice
d-laub May 12, 2025
3c258b0
Merge branch 'main' into dlaub/splice
d-laub May 19, 2025
d8aa12d
Merge branch 'main' into dlaub/splice
d-laub May 21, 2025
85a5b3c
fix: virtual indexing for splice indexer
d-laub May 22, 2025
b55f181
Merge branch 'main' into dlaub/splice
d-laub May 25, 2025
4f2ce16
fix: exons are already in reverse order for negative stranded genes
d-laub May 26, 2025
854b5a1
Merge branch 'main' into dlaub/splice
d-laub May 27, 2025
f01ab0c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 27, 2025
ad8e486
fix: make sure exonic filter gets applied. style: adhere to pre-commit
d-laub May 27, 2025
1f85b60
Merge branch 'main' into dlaub/splice
d-laub May 27, 2025
857a86a
Merge branch 'main' into dlaub/splice
d-laub May 27, 2025
ae4c677
Merge branch 'main' into dlaub/splice
d-laub May 27, 2025
54b5c81
Merge branch 'main' into dlaub/splice
d-laub May 27, 2025
84505e4
chore: sync lockfile
d-laub May 27, 2025
2 changes: 1 addition & 1 deletion .github/workflows/bump.yaml
@@ -22,6 +22,6 @@ jobs:
uses: softprops/action-gh-release@v2
with:
body_path: "body.md"
tag_name: ${{ env.REVISION }}
tag_name: v${{ env.REVISION }}
env:
GITHUB_TOKEN: ${{ secrets.COMMITIZEN }}
9 changes: 9 additions & 0 deletions CHANGELOG.md → docs/source/changelog.md
@@ -852,6 +852,15 @@

## v0.9.0 (2025-03-06)

This is a breaking change for GVL. Users should view the ["What's a `gvl.Dataset`?"](https://genvarloader.readthedocs.io/en/latest/dataset.html) page in the documentation for details, but major breaks include:

- removed the `length` argument from `gvl.write()`. Regions/BED files are now used as-is. If you want uniform length regions centered on inputs/peaks as before, preprocess your BED file with `gvl.with_length`.
- changed `Dataset.output_length` from a property to a dynamic setting with behavior described in the "What's a gvl.Dataset?" page.
- changed track output shape to have a track axis.
- Datasets are now deterministic by default.

As a result of these changes, GVL seamlessly supports ragged length output and also paves the way for on-the-fly splicing. Since many changes were made, I wouldn't be surprised if a few bugs crop up despite my best efforts -- please leave issues if so!
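
As an illustration of the `gvl.with_length` preprocessing mentioned above, here is a minimal editorial sketch (not part of this PR). It assumes `gvl.with_length` accepts a polars DataFrame of BED-like regions plus a target length, and the `gvl.write` parameter names are placeholders rather than the verified API:

```python
# Editorial sketch: recreate the old fixed-length behavior by expanding
# regions *before* calling gvl.write(). The with_length/write signatures
# below are assumptions inferred from the changelog, not the verified API.
import genvarloader as gvl
import polars as pl

peaks = pl.DataFrame(
    {
        "chrom": ["chr1", "chr1"],
        "chromStart": [1_000, 5_000],
        "chromEnd": [1_200, 5_300],
    }
)

# Uniform 2,048 bp windows centered on the input peaks.
regions = gvl.with_length(peaks, 2_048)

# gvl.write() no longer takes a `length` argument; regions are used as-is.
gvl.write(
    path="peaks.gvl",          # hypothetical output store
    bed=regions,
    variants="cohort.vcf.gz",  # hypothetical variant source
)
```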

### Feat

- option to return ragged data from gvl.Dataset. output_length is set dynamically. fix: hap reconstruction matches bcftools. change default for Dataset.deterministic from False to True. change track output from a list of arrays to having a track dimension i.e. from shape (b [p] l) to (b t [p] l). docs: add dataset.md, faq.md and overhaul geuvadis.ipynb to be simpler and reflect changes in API.
21 changes: 21 additions & 0 deletions docs/source/changelog.md.j2
@@ -0,0 +1,21 @@
# Changelog

{% for entry in tree %}

## {{ entry.version }}{% if entry.date %} ({{ entry.date }}){% endif %}

{% for change_key, changes in entry.changes.items() %}

{% if change_key %}
### {{ change_key }}
{% endif %}

{% for change in changes %}
{% if change.scope %}
- **{{ change.scope }}**: {{ change.message }}
{% elif change.message %}
- {{ change.message }}
{% endif %}
{% endfor %}
{% endfor %}
{% endfor %}
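
To preview what this template produces, one could render it directly with jinja2 against a toy tree. The following editorial sketch assumes an entry structure with `version`, `date`, and `changes` keys that mirrors the fields the template iterates over, not the exact object commitizen passes in:

```python
# Editorial sketch: render the changelog template against a hand-built tree.
# The dict structure is an assumption that mirrors the template's field access.
from pathlib import Path

from jinja2 import Template

template = Template(Path("docs/source/changelog.md.j2").read_text())
tree = [
    {
        "version": "v0.15.0",  # toy version, not a real release
        "date": "2025-05-27",
        "changes": {
            "Feat": [{"scope": None, "message": "on-the-fly splicing for datasets"}],
            "Fix": [{"scope": "dataset", "message": "apply the exonic variant filter"}],
        },
    }
]
print(template.render(tree=tree))
```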
1 change: 1 addition & 0 deletions docs/source/index.md
@@ -6,6 +6,7 @@ write
geuvadis
faq
api
changelog
```

# GenVarLoader
267 changes: 267 additions & 0 deletions docs/source/splicing.ipynb

Large diffs are not rendered by default.

4,084 changes: 1,359 additions & 2,725 deletions pixi.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pixi.toml
@@ -58,6 +58,8 @@ ipywidgets = "*"
sphinx-book-theme = "*"
sphinx-autobuild = "*"
sphinx-autodoc-typehints = "*"
seaborn = "*"
fast-histogram = "*"

[feature.pytorch-cpu.dependencies]
pytorch-cpu = ">=2,<3"
@@ -97,6 +99,7 @@ gen = "python tests/data/generate_ground_truth.py"
test = { cmd = "pytest tests && cargo test --release", depends-on = ["gen"] }

[feature.docs.tasks]
install-e = "uv pip install -e /cellar/users/dlaub/projects/ML4GLand/SeqPro -e /cellar/users/dlaub/projects/genoray -e ."
i-kernel = "ipython kernel install --user --name 'gvl-docs' --display-name 'GVL Docs'"
i-kernel-gpu = "ipython kernel install --user --name 'gvl-docs-gpu' --display-name 'GVL Docs GPU'"
doc = "cd docs && make clean && make html"
5 changes: 5 additions & 0 deletions pyproject.toml
@@ -66,6 +66,7 @@ reportUninitializedInstanceVariable = false
[tool.maturin]
python-source = "python"
features = ["pyo3/extension-module"]
# compatibility = "manylinux_2_28"

[tool.pytest.ini_options]
filterwarnings = [
@@ -83,6 +84,10 @@ legacy_tag_formats = ['v$version']
version_scheme = "semver2"
version_provider = "pep621"
update_changelog_on_bump = true
changelog_file = 'docs/source/changelog.md'
changelog_incremental = true
changelog_start_rev = "v0.9.1"
template = "docs/source/changelog.md.j2"
major_version_zero = true
allowed_prefixes = ["Merge", "Revert", "Pull request", "fixup!", "squash!", "[pre-commit.ci]"]

110 changes: 105 additions & 5 deletions python/genvarloader/_dataset/_genotypes.py
@@ -100,7 +100,7 @@ def get_diffs_sparse(
return diffs


@nb.njit(parallel=True, nogil=True, cache=True)
# @nb.njit(parallel=True, nogil=True, cache=True)
def reconstruct_haplotypes_from_sparse(
out: NDArray[np.uint8],
out_offsets: NDArray[np.integer],
@@ -117,9 +117,9 @@
ref_offsets: NDArray[np.integer],
pad_char: int,
keep: NDArray[np.bool_] | None = None,
keep_offsets: NDArray[np.int64] | None = None,
annot_v_idxs: NDArray[np.int32] | None = None,
annot_ref_pos: NDArray[np.int32] | None = None,
keep_offsets: NDArray[np.integer] | None = None,
annot_v_idxs: NDArray[np.integer] | None = None,
annot_ref_pos: NDArray[np.integer] | None = None,
):
"""Reconstruct haplotypes from reference sequence and variants.

@@ -211,7 +211,7 @@
)


@nb.njit(nogil=True, cache=True)
# @nb.njit(nogil=True, cache=True)
def reconstruct_haplotype_from_sparse(
offset_idx: int,
geno_v_idxs: NDArray[np.integer],
@@ -407,3 +407,103 @@ def reconstruct_haplotype_from_sparse(
annot_v_idxs[out_end_idx:] = -1
if annot_ref_pos is not None:
annot_ref_pos[out_end_idx:] = np.iinfo(np.int32).max


@nb.njit(parallel=True, nogil=True, cache=True)
def choose_exonic_variants(
starts: NDArray[np.integer],
ends: NDArray[np.integer],
geno_offset_idxs: NDArray[np.integer],
geno_v_idxs: NDArray[np.integer],
geno_offsets: NDArray[np.integer],
v_starts: NDArray[np.integer],
ilens: NDArray[np.integer],
) -> tuple[NDArray[np.bool_], NDArray[np.integer]]:
"""Mark variants to keep for each haplotype.

Parameters
----------
starts : NDArray[np.integer]
Shape = (n_regions) Start of each query region.
ends : NDArray[np.integer]
Shape = (n_regions) End of each query region.
geno_offset_idxs : NDArray[np.integer]
Shape = (n_regions, ploidy) Index of each (region, haplotype) into geno_offsets.
geno_v_idxs : NDArray[np.integer]
Sparse genotypes i.e. variant indices for ALT genotypes.
geno_offsets : NDArray[np.integer]
Offsets into geno_v_idxs, either as a 1D offsets array or as (start, end) pairs.
v_starts : NDArray[np.integer]
Start positions of variants.
ilens : NDArray[np.integer]
Indel lengths of variants.

Returns
-------
keep : NDArray[np.bool_]
Mask over each (region, haplotype)'s sparse genotypes marking variants to keep.
keep_offsets : NDArray[np.int64]
Shape = (n_regions * ploidy + 1) Offsets into keep.
"""
n_regions, ploidy = geno_offset_idxs.shape

lengths = np.empty((n_regions, ploidy), np.int64)
for query in nb.prange(n_regions):
for hap in range(ploidy):
o_idx = geno_offset_idxs[query, hap]
if geno_offsets.ndim == 1:
o_s, o_e = geno_offsets[o_idx], geno_offsets[o_idx + 1]
else:
o_s, o_e = geno_offsets[o_idx]
lengths[query, hap] = o_e - o_s
keep_offsets = np.empty(n_regions * ploidy + 1, np.int64)
keep_offsets[0] = 0
keep_offsets[1:] = lengths.cumsum()

n_variants = keep_offsets[-1]
keep = np.empty(n_variants, np.bool_)

for query in nb.prange(n_regions):
ref_start: int = starts[query]
ref_end: int = ends[query]
for hap in nb.prange(ploidy):
o_idx = geno_offset_idxs[query, hap]
o_s, o_e = geno_offsets[o_idx], geno_offsets[o_idx + 1]
qh_genos = geno_v_idxs[o_s:o_e]

k_idx = query * ploidy + hap
k_s, k_e = keep_offsets[k_idx], keep_offsets[k_idx + 1]
qh_keep = keep[k_s:k_e]

_choose_exonic_variants(
query_start=ref_start,
query_end=ref_end,
variant_idxs=qh_genos,
positions=v_starts,
sizes=ilens,
keep=qh_keep,
)

return keep, keep_offsets


@nb.njit(nogil=True, cache=True)
def _choose_exonic_variants(
query_start: int,
query_end: int,
variant_idxs: NDArray[np.integer], # (v)
positions: NDArray[np.integer], # (total variants)
sizes: NDArray[np.integer], # (total variants)
keep: NDArray[np.bool_], # (v)
):
"""Create a mask for variants that are fully contained within the query interval, which is
assumed to correspond to the exon boundaries."""
# no variants
if len(variant_idxs) == 0:
return

for v in range(len(variant_idxs)):
v_idx: int = variant_idxs[v]
v_pos = positions[v_idx]
# +1 for atomized
v_ref_end = v_pos - min(0, sizes[v_idx]) + 1

if v_pos >= query_start and v_ref_end <= query_end:
keep[v] = True
else:
keep[v] = False
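
For readers following `_choose_exonic_variants`, here is a small editorial NumPy sketch (outside Numba, with made-up positions and indel lengths) of the containment rule: a variant is kept only when its REF span, ending at `pos - min(0, ilen) + 1`, lies fully inside the query/exon interval.

```python
# Editorial sketch: the containment rule from _choose_exonic_variants in plain
# NumPy. Positions and indel lengths below are made up for illustration.
import numpy as np

def exonic_mask(query_start: int, query_end: int, v_starts, ilens):
    # Deletions (negative ilen) extend the REF span; +1 matches the atomized
    # variant convention used above.
    v_ref_ends = v_starts - np.minimum(0, ilens) + 1
    return (v_starts >= query_start) & (v_ref_ends <= query_end)

v_starts = np.array([50, 150, 195, 120], dtype=np.int32)  # variant starts
ilens = np.array([0, 0, -10, 2], dtype=np.int32)          # SNP, SNP, 10 bp del, 2 bp ins
# With a query/exon interval of 100-200: the upstream SNP and the deletion
# spanning past the exon end are excluded; the in-exon SNP and insertion are kept.
print(exonic_mask(100, 200, v_starts, ilens))  # [False  True False  True]
```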