Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions cooltools/api/coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,13 +111,16 @@ def coverage(

n_bins = clr.info["nbins"]
covs = chunks.pipe(_get_chunk_coverage).reduce(np.add, np.zeros((2, n_bins)))
covs = covs[0].astype(int), covs[1].astype(int)

if store:
with clr.open("r+") as grp:
for store_name, cov_arr in zip(store_names, covs):
if store_name in grp["bins"]:
del grp["bins"][store_name]
h5opts = dict(compression="gzip", compression_opts=6)
grp["bins"].create_dataset(store_name, data=cov_arr, **h5opts)

grp["bins"].create_dataset(
store_name, data=cov_arr, **h5opts, dtype=int
)
grp.attrs.create("cis", np.sum(covs[0]), dtype=int)
return covs
27 changes: 20 additions & 7 deletions cooltools/api/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import cooler
import cooler.tools
from .coverage import coverage


def sample_pixels_approx(pixels, frac):
Expand Down Expand Up @@ -47,6 +48,7 @@ def sample(
clr,
out_clr_path,
count=None,
cis_count=None,
frac=None,
exact=False,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

consider adding ignore_diags here ? or ultimately dist_min and dist_max ?
if it does not end up oin this PR - leave an issue, maybe ? for the future

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would ignore diags only apply to cis_count? Or any count?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd say yes ... Also just make sure what ignore_diags means exactly - ignore diags to set the count target, of course - but what should happen with the ignored_diags in the subsamples matrix ? Should they be zero ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I dunno - maybe leave ignore_diags as an issue to discuss a bit more - but proceed with this one as is ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes meaning ... which of the options?
I'd say the calculation of the subsampling fraction should not take ignore_diags diags into account, but subsampling would be applied to all data including those diags

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK! Merge then?

map_func=map,
Expand All @@ -63,13 +65,17 @@ def sample(
out_clr_path : str
A path/URI to the output.

count : float
count : int
The target number of contacts in the sample.
Mutually exclusive with `frac`.
Mutually exclusive with `cis_count` and `frac`.

cis_count : int
The target number of cis contacts in the sample.
Mutually exclusive with `count` and `frac`.

frac : float
The target sample size as a fraction of contacts in the original
dataset. Mutually exclusive with `count`.
dataset. Mutually exclusive with `count` and `cis_count`.

exact : bool
If True, the resulting sample size will exactly match the target value.
Expand All @@ -87,12 +93,18 @@ def sample(
if issubclass(type(clr), str):
clr = cooler.Cooler(clr)

if count is not None and frac is None:
if frac is not None and count is None and cis_count is None:
pass
elif frac is None and count is not None and cis_count is None:
frac = count / clr.info["sum"]
elif count is None and frac is not None:
count = np.round(frac * clr.info["sum"])
elif frac is None and count is None and cis_count is not None:
cis_total = clr.info.get("cis", np.sum(coverage(clr)[0], dtype=int))
frac = cis_count / cis_total
else:
raise ValueError("Either frac or tot_count must be specified!")
raise ValueError(
"Please specify exactly one argument among `count`, `cis_count`"
" and `frac`"
)

if frac >= 1.0:
raise ValueError(
Expand All @@ -101,6 +113,7 @@ def sample(
)

if exact:
count = np.round(frac * clr.info["sum"]).astype(int)
pixels = sample_pixels_exact(clr.pixels()[:], count)
cooler.create_cooler(out_clr_path, clr.bins()[:], pixels, ordered=True)

Expand Down
16 changes: 13 additions & 3 deletions cooltools/cli/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,16 @@
"--count",
help="The target number of contacts in the sample. "
"The resulting sample size will not match it precisely. "
"Mutually exclusive with --frac",
"Mutually exclusive with --frac and --cis-count",
type=int,
default=None,
show_default=False,
)
@click.option(
"--cis-count",
help="The target number of cis contacts in the sample. "
"The resulting sample size will not match it precisely. "
"Mutually exclusive with --count and --frac",
type=int,
default=None,
show_default=False,
Expand All @@ -24,7 +33,7 @@
"-f",
"--frac",
help="The target sample size as a fraction of contacts in the original dataset. "
"Mutually exclusive with --count",
"Mutually exclusive with --count and --cis-count",
type=float,
default=None,
show_default=False,
Expand All @@ -50,7 +59,7 @@
default=int(1e7),
show_default=True,
)
def random_sample(in_path, out_path, count, frac, exact, nproc, chunksize):
def random_sample(in_path, out_path, count, cis_count, frac, exact, nproc, chunksize):
"""
Pick a random sample of contacts from a Hi-C map, w/o replacement.

Expand All @@ -73,6 +82,7 @@ def random_sample(in_path, out_path, count, frac, exact, nproc, chunksize):
in_path,
out_path,
count=count,
cis_count=cis_count,
frac=frac,
exact=exact,
chunksize=chunksize,
Expand Down
Binary file added tests/data/CN.mm9.10000kb.cool
Binary file not shown.
10 changes: 5 additions & 5 deletions tests/test_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@
def test_coverage_symmetric_upper(request):
# perform test:
clr = cooler.Cooler(op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool"))
cov = cooltools.api.coverage.coverage(clr, ignore_diags=2, chunksize=int(1e7))
cis_cov, tot_cov = cooltools.api.coverage.coverage(
clr, ignore_diags=2, chunksize=int(1e7)
)

# Test that minimal coverage is larger than 0.5
assert cov[cov > 0].min() >= 1
assert tot_cov[tot_cov > 0].min() >= 1

# Test that dense matrix marginal is the same:
mtx = clr.matrix(balance=False, as_pixels=False)[:]
Expand All @@ -22,7 +24,5 @@ def test_coverage_symmetric_upper(request):
np.fill_diagonal(mtx[:, 1:], 0)
cov_dense = np.sum(mtx, axis=1)
testing.assert_allclose(
actual=cov[1],
desired=cov_dense,
equal_nan=True,
actual=tot_cov, desired=cov_dense, equal_nan=True,
)
62 changes: 62 additions & 0 deletions tests/test_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import os.path as op
import cooler

import cooltools
import cooltools.api
from numpy import testing


def test_sample(request):
# perform test:
clr = cooler.Cooler(op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool"))

cooltools.api.sample.sample(
clr,
op.join(request.fspath.dirname, "data/CN.mm9.1000kb.test_sampled.cool"),
frac=0.5,
)
clr_result = cooler.Cooler(
op.join(request.fspath.dirname, "data/CN.mm9.1000kb.test_sampled.cool")
)
# Test that deviation from expected total is very small
testing.assert_allclose(clr_result.info["sum"], clr.info["sum"] / 2, rtol=1e-3)

cooltools.api.sample.sample(
clr,
op.join(request.fspath.dirname, "data/CN.mm9.1000kb.test_sampled.cool"),
count=200000000,
)
clr_result = cooler.Cooler(
op.join(request.fspath.dirname, "data/CN.mm9.1000kb.test_sampled.cool")
)
# Test that deviation from expected total is very small
testing.assert_allclose(clr_result.info["sum"], 200000000, rtol=1e-3)


def test_sample_exact(request):
# Exact sampling is very slow! So commented out
clr = cooler.Cooler(op.join(request.fspath.dirname, "data/CN.mm9.10000kb.cool"))

cooltools.api.sample.sample(
clr,
op.join(request.fspath.dirname, "data/CN.mm9.10000kb.test_sampled.cool"),
frac=0.5,
exact=True,
)
clr_result = cooler.Cooler(
op.join(request.fspath.dirname, "data/CN.mm9.10000kb.test_sampled.cool")
)
# Test that result matches expectation exactly
testing.assert_equal(clr_result.info["sum"], round(clr.info["sum"] * 0.5))

cooltools.api.sample.sample(
clr,
op.join(request.fspath.dirname, "data/CN.mm9.10000kb.test_sampled.cool"),
count=200000000,
exact=True,
)
clr_result = cooler.Cooler(
op.join(request.fspath.dirname, "data/CN.mm9.10000kb.test_sampled.cool")
)
# Test that result matches expectation exactly
testing.assert_equal(clr_result.info["sum"], 200000000)