open2c · sergpolly · Apr 1, 2022 · Jan 13, 2022 · Jan 13, 2022 · Mar 31, 2022
diff --git a/cooltools/api/coverage.py b/cooltools/api/coverage.py
@@ -111,13 +111,16 @@ def coverage(
 
     n_bins = clr.info["nbins"]
     covs = chunks.pipe(_get_chunk_coverage).reduce(np.add, np.zeros((2, n_bins)))
+    covs = covs[0].astype(int), covs[1].astype(int)
 
     if store:
         with clr.open("r+") as grp:
             for store_name, cov_arr in zip(store_names, covs):
                 if store_name in grp["bins"]:
                     del grp["bins"][store_name]
                 h5opts = dict(compression="gzip", compression_opts=6)
-                grp["bins"].create_dataset(store_name, data=cov_arr, **h5opts)
-
+                grp["bins"].create_dataset(
+                    store_name, data=cov_arr, **h5opts, dtype=int
+                )
+            grp.attrs.create("cis", np.sum(covs[0]), dtype=int)
     return covs
diff --git a/cooltools/api/sample.py b/cooltools/api/sample.py
@@ -3,6 +3,7 @@
 
 import cooler
 import cooler.tools
+from .coverage import coverage
 
 
 def sample_pixels_approx(pixels, frac):
@@ -47,6 +48,7 @@ def sample(
     clr,
     out_clr_path,
     count=None,
+    cis_count=None,
     frac=None,
     exact=False,
     map_func=map,
@@ -63,13 +65,17 @@ def sample(
     out_clr_path : str
         A path/URI to the output.
 
-    count : float
+    count : int
         The target number of contacts in the sample.
-        Mutually exclusive with `frac`.
+        Mutually exclusive with `cis_count` and `frac`.
+
+    cis_count : int
+        The target number of cis contacts in the sample.
+        Mutually exclusive with `count` and `frac`.
 
     frac : float
         The target sample size as a fraction of contacts in the original
-        dataset. Mutually exclusive with `count`.
+        dataset. Mutually exclusive with `count` and `cis_count`.
 
     exact : bool
         If True, the resulting sample size will exactly match the target value.
@@ -87,12 +93,18 @@ def sample(
     if issubclass(type(clr), str):
         clr = cooler.Cooler(clr)
 
-    if count is not None and frac is None:
+    if frac is not None and count is None and cis_count is None:
+        pass
+    elif frac is None and count is not None and cis_count is None:
         frac = count / clr.info["sum"]
-    elif count is None and frac is not None:
-        count = np.round(frac * clr.info["sum"])
+    elif frac is None and count is None and cis_count is not None:
+        cis_total = clr.info.get("cis", np.sum(coverage(clr)[0], dtype=int))
+        frac = cis_count / cis_total
     else:
-        raise ValueError("Either frac or tot_count must be specified!")
+        raise ValueError(
+            "Please specify exactly one argument among `count`, `cis_count`"
+            " and `frac`"
+        )
 
     if frac >= 1.0:
         raise ValueError(
@@ -101,6 +113,7 @@ def sample(
         )
 
     if exact:
+        count = np.round(frac * clr.info["sum"]).astype(int)
         pixels = sample_pixels_exact(clr.pixels()[:], count)
         cooler.create_cooler(out_clr_path, clr.bins()[:], pixels, ordered=True)
 

diff --git a/cooltools/cli/sample.py b/cooltools/cli/sample.py
@@ -15,7 +15,16 @@
     "--count",
     help="The target number of contacts in the sample. "
     "The resulting sample size will not match it precisely. "
-    "Mutually exclusive with --frac",
+    "Mutually exclusive with --frac and --cis-count",
+    type=int,
+    default=None,
+    show_default=False,
+)
+@click.option(
+    "--cis-count",
+    help="The target number of cis contacts in the sample. "
+    "The resulting sample size will not match it precisely. "
+    "Mutually exclusive with --count and --frac",
     type=int,
     default=None,
     show_default=False,
@@ -24,7 +33,7 @@
     "-f",
     "--frac",
     help="The target sample size as a fraction of contacts in the original dataset. "
-    "Mutually exclusive with --count",
+    "Mutually exclusive with --count and --cis-count",
     type=float,
     default=None,
     show_default=False,
@@ -50,7 +59,7 @@
     default=int(1e7),
     show_default=True,
 )
-def random_sample(in_path, out_path, count, frac, exact, nproc, chunksize):
+def random_sample(in_path, out_path, count, cis_count, frac, exact, nproc, chunksize):
     """
     Pick a random sample of contacts from a Hi-C map, w/o replacement.
 
@@ -73,6 +82,7 @@ def random_sample(in_path, out_path, count, frac, exact, nproc, chunksize):
             in_path,
             out_path,
             count=count,
+            cis_count=cis_count,
             frac=frac,
             exact=exact,
             chunksize=chunksize,

diff --git a/tests/data/CN.mm9.10000kb.cool b/tests/data/CN.mm9.10000kb.cool
diff --git a/tests/test_coverage.py b/tests/test_coverage.py
@@ -10,10 +10,12 @@
 def test_coverage_symmetric_upper(request):
     # perform test:
     clr = cooler.Cooler(op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool"))
-    cov = cooltools.api.coverage.coverage(clr, ignore_diags=2, chunksize=int(1e7))
+    cis_cov, tot_cov = cooltools.api.coverage.coverage(
+        clr, ignore_diags=2, chunksize=int(1e7)
+    )
 
     # Test that minimal coverage is larger than 0.5
-    assert cov[cov > 0].min() >= 1
+    assert tot_cov[tot_cov > 0].min() >= 1
 
     # Test that dense matrix marginal is the same:
     mtx = clr.matrix(balance=False, as_pixels=False)[:]
@@ -22,7 +24,5 @@ def test_coverage_symmetric_upper(request):
     np.fill_diagonal(mtx[:, 1:], 0)
     cov_dense = np.sum(mtx, axis=1)
     testing.assert_allclose(
-        actual=cov[1],
-        desired=cov_dense,
-        equal_nan=True,
+        actual=tot_cov, desired=cov_dense, equal_nan=True,
     )
diff --git a/tests/test_sample.py b/tests/test_sample.py
@@ -0,0 +1,62 @@
+import os.path as op
+import cooler
+
+import cooltools
+import cooltools.api
+from numpy import testing
+
+
+def test_sample(request):
+    # perform test:
+    clr = cooler.Cooler(op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool"))
+
+    cooltools.api.sample.sample(
+        clr,
+        op.join(request.fspath.dirname, "data/CN.mm9.1000kb.test_sampled.cool"),
+        frac=0.5,
+    )
+    clr_result = cooler.Cooler(
+        op.join(request.fspath.dirname, "data/CN.mm9.1000kb.test_sampled.cool")
+    )
+    # Test that deviation from expected total is very small
+    testing.assert_allclose(clr_result.info["sum"], clr.info["sum"] / 2, rtol=1e-3)
+
+    cooltools.api.sample.sample(
+        clr,
+        op.join(request.fspath.dirname, "data/CN.mm9.1000kb.test_sampled.cool"),
+        count=200000000,
+    )
+    clr_result = cooler.Cooler(
+        op.join(request.fspath.dirname, "data/CN.mm9.1000kb.test_sampled.cool")
+    )
+    # Test that deviation from expected total is very small
+    testing.assert_allclose(clr_result.info["sum"], 200000000, rtol=1e-3)
+
+
+def test_sample_exact(request):
+    # Exact sampling is very slow! So commented out
+    clr = cooler.Cooler(op.join(request.fspath.dirname, "data/CN.mm9.10000kb.cool"))
+
+    cooltools.api.sample.sample(
+        clr,
+        op.join(request.fspath.dirname, "data/CN.mm9.10000kb.test_sampled.cool"),
+        frac=0.5,
+        exact=True,
+    )
+    clr_result = cooler.Cooler(
+        op.join(request.fspath.dirname, "data/CN.mm9.10000kb.test_sampled.cool")
+    )
+    # Test that result matches expectation exactly
+    testing.assert_equal(clr_result.info["sum"], round(clr.info["sum"] * 0.5))
+
+    cooltools.api.sample.sample(
+        clr,
+        op.join(request.fspath.dirname, "data/CN.mm9.10000kb.test_sampled.cool"),
+        count=200000000,
+        exact=True,
+    )
+    clr_result = cooler.Cooler(
+        op.join(request.fspath.dirname, "data/CN.mm9.10000kb.test_sampled.cool")
+    )
+    # Test that result matches expectation exactly
+    testing.assert_equal(clr_result.info["sum"], 200000000)