Skip to content

Cross coalescence rates: advanced analysis tutorial. #277

Open
@hyanwong

Description

@hyanwong

@nspope sent me this, which is a good core for a new tutorial on cross coalescence rates. Probably worth waiting until some of the core functionality (e.g. rates_from_ecdf) is implemented in tskit.

import stdpopsim
import numpy as np
import matplotlib.pyplot as plt

def rates_from_ecdf(weights, atoms, quantiles):
    """
    Estimate rates from weighted time-to-event data
    """
    assert weights.size == atoms.size
    assert quantiles.size > 2
    assert quantiles[0] == 0.0 and quantiles[-1] == 1.0

    # sort and filter inputs
    time_order = np.argsort(atoms)
    weights = weights[time_order]
    atoms = atoms[time_order]
    nonzero = weights > 0
    atoms = atoms[nonzero]
    weights = weights[nonzero]

    # find interior quantiles
    weights = np.append(0, weights)
    atoms = np.append(0, atoms)
    ecdf = np.cumsum(weights)
    indices = np.searchsorted(ecdf, ecdf[-1] * quantiles[1:-1], side='left')
    lower, upper = atoms[indices - 1], atoms[indices]
    ecdfl, ecdfu = ecdf[indices - 1] / ecdf[-1], ecdf[indices] / ecdf[-1]

    # interpolate ECDF
    assert np.all(ecdfu - ecdfl > 0)
    slope = (upper - lower) / (ecdfu - ecdfl)
    breaks = np.append(0, lower + slope * (quantiles[1:-1] - ecdfl))

    # calculate coalescence rates within intervals
    # this uses a Kaplan-Meier-type censored estimator: https://github.com/tskit-dev/tskit/pull/2119
    coalrate = np.full(quantiles.size - 1, np.nan)
    propcoal = np.diff(quantiles[:-1]) / (1 - quantiles[:-2])
    coalrate[:-1] = -np.log(1 - propcoal) / np.diff(breaks)
    last = indices[-1]
    coalrate[-1] = np.sum(weights[last:]) / np.dot(atoms[last:] - breaks[-1], weights[last:]) 

    return coalrate, breaks


# --- on data

import stdpopsim
engine = stdpopsim.get_engine("msprime")
homsap = stdpopsim.get_species("HomSap")
demogr = homsap.get_demographic_model("OutOfAfrica_4J17")
contig = homsap.get_contig("chr17", genetic_map='HapMapII_GRCh38', right=1e7)
sample = {"CEU" : 100, "YRI" : 100, "JPT" : 100, "CHB" : 100}
ts = engine.simulate(demogr, contig, sample, seed=1024).trim()

pop_idx = {p.metadata['name'] : i for i, p in enumerate(ts.populations())}
sample_sets = [
    np.flatnonzero(ts.nodes_population[:ts.num_samples] == i) 
    for i in range(ts.num_populations)
]
quantiles = np.linspace(0.0, 1.0, 25)
indexes = [(pop_idx["CEU"], pop_idx["CHB"])] # CEU vs CHB
node_coal_weight = ts.pair_coalescence_counts(sample_sets, indexes=indexes).flatten()
rates, breaks = rates_from_ecdf(node_coal_weight, ts.nodes_time, quantiles)


# sanity check: mean of cross-coalescence time distribution should equal branch-mode divergence
print(
    "Mean of cross-coalescence time distr: ",
    2 * node_coal_weight @ ts.nodes_time / np.sum(node_coal_weight),
    "Branch-mode divergence: ",
    ts.divergence(sample_sets, indexes=indexes, mode='branch'),
)


# sanity check: compare estimated rates to expectation under model
theory_rates = demogr.model.debug().coalescence_rate_trajectory(breaks, {"CEU":1, "CHB":1})[0]
plt.clf()
plt.step(breaks, rates, color='firebrick', label='estimated')
plt.step(breaks, theory_rates, color='black', label='theoretical')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Generations in past')
plt.ylabel('Cross-coalescence rate')
plt.legend()
plt.savefig('rate-check.png')


# as an aside: rates are a 1-to-1 transformation of the pair coalescence time distribution. We can plot this distribution using e.g. a weighted density estimate or histogram.
hist_density, hist_breaks = np.histogram(
    np.log10(ts.nodes_time[ts.num_samples:]), 
    weights=node_coal_weight[ts.num_samples:],
    bins=25,
    density=True,
)
plt.clf()
plt.fill_between(hist_breaks[:-1], hist_density, step='pre', alpha=0.5)
plt.xlabel('log10 generations in past')
plt.ylabel('Density')
plt.savefig('rate-hist.png')

# this also gives a very easy way to get coalescence rates if we have a fixed timegrid
# (but, rates will be noisier in bins with few nodes, and have to watch for nan)
hist_density, hist_breaks = np.histogram(
    ts.nodes_time, # note natural time scale
    weights=node_coal_weight,
    bins=np.logspace(2, 5, 25),
    density=False,
)
hist_density /= hist_density.sum()
hist_rates = -np.log(1 - hist_density / (1 - np.cumsum(hist_density))) / np.diff(hist_breaks)
hist_theory_rates = demogr.model.debug().coalescence_rate_trajectory(hist_breaks, {"CEU":1, "CHB":1})[0]
hist_theory_rates = hist_theory_rates[1:] # remove [0, first break]

plt.clf()
plt.step(hist_breaks[:-1], hist_rates, color='firebrick', label='estimated')
plt.step(hist_breaks[:-1], hist_theory_rates, color='black', label='theoretical')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Generations in past')
plt.ylabel('Cross-coalescence rate')
plt.legend()
plt.savefig('rate-check-2.png')

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions