Open
Description
@nspope sent me this, which is a good core for a new tutorial on cross coalescence rates. Probably worth waiting until some of the core functionality (e.g. rates_from_ecdf
) is implemented in tskit.
import stdpopsim
import numpy as np
import matplotlib.pyplot as plt
def rates_from_ecdf(weights, atoms, quantiles):
"""
Estimate rates from weighted time-to-event data
"""
assert weights.size == atoms.size
assert quantiles.size > 2
assert quantiles[0] == 0.0 and quantiles[-1] == 1.0
# sort and filter inputs
time_order = np.argsort(atoms)
weights = weights[time_order]
atoms = atoms[time_order]
nonzero = weights > 0
atoms = atoms[nonzero]
weights = weights[nonzero]
# find interior quantiles
weights = np.append(0, weights)
atoms = np.append(0, atoms)
ecdf = np.cumsum(weights)
indices = np.searchsorted(ecdf, ecdf[-1] * quantiles[1:-1], side='left')
lower, upper = atoms[indices - 1], atoms[indices]
ecdfl, ecdfu = ecdf[indices - 1] / ecdf[-1], ecdf[indices] / ecdf[-1]
# interpolate ECDF
assert np.all(ecdfu - ecdfl > 0)
slope = (upper - lower) / (ecdfu - ecdfl)
breaks = np.append(0, lower + slope * (quantiles[1:-1] - ecdfl))
# calculate coalescence rates within intervals
# this uses a Kaplan-Meier-type censored estimator: https://github.com/tskit-dev/tskit/pull/2119
coalrate = np.full(quantiles.size - 1, np.nan)
propcoal = np.diff(quantiles[:-1]) / (1 - quantiles[:-2])
coalrate[:-1] = -np.log(1 - propcoal) / np.diff(breaks)
last = indices[-1]
coalrate[-1] = np.sum(weights[last:]) / np.dot(atoms[last:] - breaks[-1], weights[last:])
return coalrate, breaks
# --- on data
import stdpopsim
engine = stdpopsim.get_engine("msprime")
homsap = stdpopsim.get_species("HomSap")
demogr = homsap.get_demographic_model("OutOfAfrica_4J17")
contig = homsap.get_contig("chr17", genetic_map='HapMapII_GRCh38', right=1e7)
sample = {"CEU" : 100, "YRI" : 100, "JPT" : 100, "CHB" : 100}
ts = engine.simulate(demogr, contig, sample, seed=1024).trim()
pop_idx = {p.metadata['name'] : i for i, p in enumerate(ts.populations())}
sample_sets = [
np.flatnonzero(ts.nodes_population[:ts.num_samples] == i)
for i in range(ts.num_populations)
]
quantiles = np.linspace(0.0, 1.0, 25)
indexes = [(pop_idx["CEU"], pop_idx["CHB"])] # CEU vs CHB
node_coal_weight = ts.pair_coalescence_counts(sample_sets, indexes=indexes).flatten()
rates, breaks = rates_from_ecdf(node_coal_weight, ts.nodes_time, quantiles)
# sanity check: mean of cross-coalescence time distribution should equal branch-mode divergence
print(
"Mean of cross-coalescence time distr: ",
2 * node_coal_weight @ ts.nodes_time / np.sum(node_coal_weight),
"Branch-mode divergence: ",
ts.divergence(sample_sets, indexes=indexes, mode='branch'),
)
# sanity check: compare estimated rates to expectation under model
theory_rates = demogr.model.debug().coalescence_rate_trajectory(breaks, {"CEU":1, "CHB":1})[0]
plt.clf()
plt.step(breaks, rates, color='firebrick', label='estimated')
plt.step(breaks, theory_rates, color='black', label='theoretical')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Generations in past')
plt.ylabel('Cross-coalescence rate')
plt.legend()
plt.savefig('rate-check.png')
# as an aside: rates are a 1-to-1 transformation of the pair coalescence time distribution. We can plot this distribution using e.g. a weighted density estimate or histogram.
hist_density, hist_breaks = np.histogram(
np.log10(ts.nodes_time[ts.num_samples:]),
weights=node_coal_weight[ts.num_samples:],
bins=25,
density=True,
)
plt.clf()
plt.fill_between(hist_breaks[:-1], hist_density, step='pre', alpha=0.5)
plt.xlabel('log10 generations in past')
plt.ylabel('Density')
plt.savefig('rate-hist.png')
# this also gives a very easy way to get coalescence rates if we have a fixed timegrid
# (but, rates will be noisier in bins with few nodes, and have to watch for nan)
hist_density, hist_breaks = np.histogram(
ts.nodes_time, # note natural time scale
weights=node_coal_weight,
bins=np.logspace(2, 5, 25),
density=False,
)
hist_density /= hist_density.sum()
hist_rates = -np.log(1 - hist_density / (1 - np.cumsum(hist_density))) / np.diff(hist_breaks)
hist_theory_rates = demogr.model.debug().coalescence_rate_trajectory(hist_breaks, {"CEU":1, "CHB":1})[0]
hist_theory_rates = hist_theory_rates[1:] # remove [0, first break]
plt.clf()
plt.step(hist_breaks[:-1], hist_rates, color='firebrick', label='estimated')
plt.step(hist_breaks[:-1], hist_theory_rates, color='black', label='theoretical')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Generations in past')
plt.ylabel('Cross-coalescence rate')
plt.legend()
plt.savefig('rate-check-2.png')
Metadata
Metadata
Assignees
Labels
No labels