Improve speed of tree likelihood

- avoid copying tipdata into partials by using a series of if/else statements
- use init to initialize VB, HMC and LBFGS
4ment committed May 26, 2022
1 parent 25c24bb commit 59784fb
Showing 4 changed files with 193 additions and 51 deletions.
174 changes: 135 additions & 39 deletions phylostan/generate_script.py
@@ -40,9 +40,9 @@ def get_delta_rates():

def get_rates_from_deltas():
"""
Create substrates from deltas and rate.
Used by MRF
"""
Create substrates from deltas and rate.
Used by MRF
"""
code_str = '''
{
// no rate at root and rate of first child has a different prior
@@ -62,12 +62,12 @@ def get_rates_from_deltas():

def autocorrelated_prior(heterochronous):
"""
Thorne et al 1998
mu_i = log(r_a)
sigma_i = nu*t_i
Thorne et al 1998
mu_i = log(r_a)
sigma_i = nu*t_i
log(r_i) ~ N(mu_i, sigma_i^2)
"""
log(r_i) ~ N(mu_i, sigma_i^2)
"""
code_str = '''
real logn_autocorrelated_log(real[] rates, vector heights, int[,] map, real nu{0}){{
int S = rows(heights) + 1;
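For reference, the Thorne et al 1998 density described in the docstring above, as a minimal NumPy sketch. This is not part of the commit; rates, parent_rates and branch_times are hypothetical per-branch arrays (each node's rate, its ancestor's rate, and the branch time).

import numpy as np
from scipy.stats import norm

def logn_autocorrelated_logpdf(rates, parent_rates, branch_times, nu):
    mu = np.log(parent_rates)  # mu_i = log(r_a)
    sigma = nu * branch_times  # sigma_i = nu*t_i, as in the docstring
    # log(r_i) ~ N(mu_i, sigma_i^2); subtracting log(r_i) is the Jacobian
    # of the log transform, giving the density of r_i itself
    return np.sum(norm.logpdf(np.log(rates), loc=mu, scale=sigma) - np.log(rates))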
@@ -101,14 +101,14 @@ def autocorrelated_prior(heterochronous):

def acln_prior(heterochronous):
"""
Kishino et al 2001 autocorrelated lognormal model
Kishino et al 2001 autocorrelated lognormal model
sigma_i = (nu*t_i)^1/2
E[r_i|r_a] = r_a = e^{mu_i + sigma_i^2/2}
mu_i = log(r_a) - nu*t_i/2
sigma_i = (nu*t_i)^1/2
E[r_i|r_a] = r_a = e^{mu_i + sigma_i^2/2}
mu_i = log(r_a) - nu*t_i/2
r_i ~ LN(mu_i, sigma_i)
"""
r_i ~ LN(mu_i, sigma_i)
"""
code_str = '''
real acln_log(real[] rates, vector heights, int[,] map, real nu{0}){{
int S = rows(heights) + 1;
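Similarly, a hedged NumPy sketch of the Kishino et al 2001 lognormal density above (illustrative only; same hypothetical array arguments as the previous sketch):

import numpy as np
from scipy.stats import lognorm

def acln_logpdf(rates, parent_rates, branch_times, nu):
    sigma = np.sqrt(nu * branch_times)  # sigma_i = (nu*t_i)^1/2
    mu = np.log(parent_rates) - nu * branch_times / 2.0  # keeps E[r_i|r_a] = r_a
    # scipy's lognorm takes the log-scale sigma as s and exp(mu) as scale
    return np.sum(lognorm.logpdf(rates, s=sigma, scale=np.exp(mu)))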
@@ -142,16 +142,16 @@ def acln_prior(heterochronous):

def acg_prior(heterochronous):
"""
Aris-Brosou and Yang 2002 autocorrelated gamma model
Aris-Brosou and Yang 2002 autocorrelated gamma model
E[r_i|r_a] = r_a = shape_i/rate_i
Var[r_i|r_a] = nu*t_i = shape_i/rate_i^2
E[r_i|r_a] = r_a = shape_i/rate_i
Var[r_i|r_a] = nu*t_i = shape_i/rate_i^2
shape_i = r_a^2/(nu*t_i)
rate_i = r_a/(nu*t_i)
shape_i = r_a^2/(nu*t_i)
rate_i = r_a/(nu*t_i)
r_i ~ Gamma(shape_i, rate_i)
"""
r_i ~ Gamma(shape_i, rate_i)
"""
code_str = '''
real acg_log(real[] rates, vector heights, int[,] map, real nu{0}){{
int S = rows(heights) + 1;
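The gamma parameterization above translates the same way; a sketch under the same hypothetical argument convention:

import numpy as np
from scipy.stats import gamma

def acg_logpdf(rates, parent_rates, branch_times, nu):
    var = nu * branch_times
    shape = parent_rates**2 / var  # shape_i = r_a^2/(nu*t_i)
    rate = parent_rates / var      # rate_i = r_a/(nu*t_i)
    # scipy's gamma uses shape a and scale = 1/rate; mean = r_a, variance = nu*t_i
    return np.sum(gamma.logpdf(rates, a=shape, scale=1.0 / rate))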
@@ -185,11 +185,11 @@ def acg_prior(heterochronous):

def ace_prior():
"""
Aris-Brosou and Yang 2002 autocorrelated exponential model
E[r_i] = r_a
Aris-Brosou and Yang 2002 autocorrelated exponential model
E[r_i] = r_a
r_i ~ Exp(1/r_a)
"""
r_i ~ Exp(1/r_a)
"""
code_str = '''
real ace_log(real[] rates, int[,] map){
int nodeCount = size(rates) + 1;
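And the exponential case, for completeness (same hypothetical argument convention as the sketches above):

import numpy as np
from scipy.stats import expon

def ace_logpdf(rates, parent_rates):
    # r_i ~ Exp(1/r_a), i.e. scale r_a, so E[r_i] = r_a
    return np.sum(expon.logpdf(rates, scale=parent_rates))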
Expand All @@ -211,9 +211,9 @@ def ace_prior():

def aoup_prior(heterochronous):
    '''
    Aris-Brosou and Yang 2002 Ornstein-Uhlenbeck process
    nu is sigma^2
    '''
str = '''
real aoup_log(real[] rates, vector heights, int[,] map, real beta, real nu{0}){{
int S = rows(heights) + 1;
@@ -475,7 +475,7 @@ def transform_heights(heterochronous=False):
for( i in 2:nodeCount ){{
// internal node: transform
if(map[i,1] > S){{
-                heights[map[i,1]] = {1}(heights[map[i,2]]{2})*p[j];
+                heights[map[i,1]] = {1}(heights[map[i,2]]{2})*p[map[i,1]-S];
j += 1;
}}
else{{
@@ -726,6 +726,81 @@ def P_matrix_function():
"""


def likelihood_new(mixture, clock=True):
model_calculate_logP = """
for( n in 1:(S-1) ) {
p1 = peel[n,1];
p2 = peel[n,2];
p3 = peel[n,3] - S;
        if(peel[n,1] <= S && peel[n,2] <= S){ // both children are tips: read from tipdata
for( i in 1:L ) {
partials[p3,i] = (pmats[p1]*tipdata[p1,i]) .* (pmats[p2]*tipdata[p2,i]);
}
}
        else if(peel[n,1] <= S){ // only the first child is a tip
for( i in 1:L ) {
partials[p3,i] = (pmats[p1]*tipdata[p1,i]) .* (pmats[p2]*partials[p2-S,i]);
}
}
        else if(peel[n,2] <= S){ // only the second child is a tip
for( i in 1:L ) {
partials[p3,i] = (pmats[p1]*partials[p1-S,i]) .* (pmats[p2]*tipdata[p2,i]);
}
}
        else{ // both children are internal nodes: read from partials
for( i in 1:L ) {
partials[p3,i] = (pmats[p1]*partials[p1-S,i]) .* (pmats[p2]*partials[p2-S,i]);
}
}
}
    p3 = peel[S-1,3] - S; // root node index into partials
for( i in 1:L ) {
target += log(sum(partials[p3,i] .* freqs))*weights[i];
}
"""
model_calculate_mixture_logP = """
    for(c in 1:C){ // one pass per mixture (rate) category
for( n in 1:(S-1) ) {
p1 = peel[n,1];
p2 = peel[n,2];
p3 = peel[n,3] - S;
if(peel[n,1] <= S && peel[n,2] <= S){
for( i in 1:L ) {
partials[c,p3,i] = (pmats[p1+(c-1)*bcount]*tipdata[p1,i]) .* (pmats[p2+(c-1)*bcount]*tipdata[p2,i]);
}
}
else if(peel[n,1] <= S){
for( i in 1:L ) {
partials[c,p3,i] = (pmats[p1+(c-1)*bcount]*tipdata[p1,i]) .* (pmats[p2+(c-1)*bcount]*partials[c,p2-S,i]);
}
}
else if(peel[n,2] <= S){
for( i in 1:L ) {
partials[c,p3,i] = (pmats[p1+(c-1)*bcount]*partials[c,p1-S,i]) .* (pmats[p2+(c-1)*bcount]*tipdata[p2,i]);
}
}
else{
for( i in 1:L ) {
partials[c,p3,i] = (pmats[p1+(c-1)*bcount]*partials[c,p1-S,i]) .* (pmats[p2+(c-1)*bcount]*partials[c,p2-S,i]);
}
}
}
}
    p3 = peel[S-1,3] - S; // root node index into partials
for( i in 1:L ) {
for(c in 1:C){
probs[c] = ps[c] * sum(partials[c,p3,i] .* freqs);
}
target += log(sum(probs))*weights[i];
}
"""
model = '// calculate tree likelihood\n'
if not mixture:
return model + model_calculate_logP
else:
return model + model_calculate_mixture_logP


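The dispatch above is the speed-up named in the commit message: by testing whether each child of a peel step is a tip (index <= S) or an internal node (index > S), the loop reads tip patterns directly from tipdata instead of first copying all S tips into partials, so partials only needs S-1 internal-node slots. A NumPy analogue of the non-mixture block, as an illustrative sketch rather than the generated Stan:

import numpy as np

def tree_log_likelihood(peel, pmats, tipdata, freqs, weights):
    S = len(tipdata)                    # number of tips
    L = tipdata.shape[1]                # number of site patterns
    partials = np.empty((S - 1, L, 4))  # internal nodes only, no tip copies

    def partial(node):  # 1-based node index, as in the Stan code
        # tips are read straight from tipdata; internal nodes from partials
        return tipdata[node - 1] if node <= S else partials[node - S - 1]

    for p1, p2, p3 in peel:
        # (P v) per site, vectorized over sites as V @ P.T
        partials[p3 - S - 1] = (partial(p1) @ pmats[p1 - 1].T) * (partial(p2) @ pmats[p2 - 1].T)

    root = partial(peel[-1][2])
    return float(np.sum(np.log(root @ freqs) * weights))

With rate categories, the mixture block repeats the same loop per category c with its own slice of pmats and weights the site likelihoods by ps[c].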
def likelihood(mixture, clock=True):
init_tip_partials = """
// copy tip data into node probability vector
@@ -987,16 +1062,27 @@ def get_model(params):
if params.invariant or params.categories > 1:
data_block.append('int C;')
model_block_declarations.append('real probs[C];')
-        model_block_declarations.append(
-            'vector[4] partials[C,2*S,L]; // partial probabilities for the S tips and S-1 internal nodes'
-        )
+        if params.clock is not None:
+            model_block_declarations.append(
+                'vector[4] partials[C,S-1,L]; // partial probabilities for the S-1 internal nodes'
+            )
+        else:
+            model_block_declarations.append(
+                'vector[4] partials[C,2*S,L]; // partial probabilities for the S tips and S-1 internal nodes'
+            )
model_block_declarations.append(
'matrix[4,4] pmats[bcount*C]; // finite-time transition matrices for each branch'
)

else:
-        model_block_declarations.append(
-            'vector[4] partials[2*S,L]; // partial probabilities for the S tips and S-1 internal nodes'
-        )
+        if params.clock is not None:
+            model_block_declarations.append(
+                'vector[4] partials[S-1,L]; // partial probabilities for the S-1 internal nodes'
+            )
+        else:
+            model_block_declarations.append(
+                'vector[4] partials[2*S,L]; // partial probabilities for the S tips and S-1 internal nodes'
+            )
model_block_declarations.append(
'matrix[4,4] pmats[bcount]; // finite-time transition matrices for each branch'
)
@@ -1293,10 +1379,20 @@ def get_model(params):
else:
raise ValueError('Supports JC69, HKY and GTR only.')

-    # Tree likelihood
-    model_block.append(
-        likelihood(params.categories > 1 or params.invariant, params.clock is not None)
-    )
+    # Tree likelihood
+    if params.clock is not None:
+        model_block_declarations.append(
+            """int p1;
+    int p2;
+    int p3;"""
+        )
+        model_block.append(likelihood_new(params.categories > 1 or params.invariant))
+    else:
+        model_block.append(
+            likelihood(
+                params.categories > 1 or params.invariant, params.clock is not None
+            )
+        )

if params.clock is not None:
model_block.append(jacobian(params.heterochronous))
23 changes: 20 additions & 3 deletions phylostan/phylostan.py
@@ -6,7 +6,7 @@
import sys

import dendropy
-import numpy
+import numpy as np
import pystan
from dendropy import DnaCharacterMatrix, Tree

@@ -79,6 +79,11 @@ def create_run_parser(subprasers):
help="""Comma-separated (csv) file containing sequence dates with header
'name,date'""",
)
parser.add_argument(
'--heights_init',
action="store_true",
help="""initialize node heights using input tree file""",
)

parser.add_argument(
'-a',
@@ -530,7 +535,7 @@ def run(arg):
if arg.clock is not None:
if arg.coalescent == 'skygrid':
data['G'] = arg.grid - 1
-            data['grid'] = numpy.linspace(0, arg.cutoff, arg.grid)[1:]
+            data['grid'] = np.linspace(0, arg.cutoff, arg.grid)[1:]
elif arg.coalescent == 'skyride':
# number of coalescent intervals
data['I'] = sequence_count - 1
@@ -571,7 +576,19 @@ def run(arg):
if len(row) > 0:
line = line.split(':')
inits[line[0].strip()] = list(map(float, line[1].split(',')))
-        stan_args['init'] = (inits,)
+        stan_args['init'] = inits
elif arg.heights_init or arg.rate is not None:
inits = {}
if arg.heights_init:
ratios, root_height = utils.ratios_root_height_from_branch_lengths(tree)
# ratios_unres = np.log(ratios / (1.0 - ratios))
# root_height_unres = np.log(root_height - data['lower_root'])
inits['props'] = ratios.tolist() # ratios_unres.tolist()
inits['height'] = root_height.item() - data['lower_root']
inits['rate'] = arg.rate
elif arg.rate is not None:
inits['rate'] = arg.rate
stan_args['init'] = inits

if arg.algorithm == 'LBFGS':
fit = sm.optimizing(**stan_args)
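A hedged sketch of the flow the second commit bullet enables: a single inits dict (illustrative values here; keys must match parameter names in the generated Stan model) is attached to stan_args and so reaches whichever algorithm run() selects, since PyStan 2's optimizing, vb and sampling methods all take an init argument. The names below reuse those from run() above.

inits = {'rate': 1e-3, 'height': 10.0, 'props': [0.5] * 8}  # e.g. 8 ratios for 10 taxa
stan_args['init'] = inits
if arg.algorithm == 'LBFGS':
    fit = sm.optimizing(**stan_args)   # MAP / penalized likelihood
elif arg.algorithm == 'VB':
    fit = sm.vb(**stan_args)           # variational Bayes
else:
    fit = sm.sampling(**stan_args)     # NUTS/HMC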
41 changes: 35 additions & 6 deletions phylostan/utils.py
@@ -1,6 +1,35 @@
import csv

-import numpy
+import numpy as np


def ratios_root_height_from_branch_lengths(tree, eps=1.0e-6):
bounds = get_lowers(tree)
heights = heights_from_branch_lengths(tree, eps)
tip_count = len(tree.taxon_namespace)
ratios = [None] * (tip_count - 2)
for node in tree.preorder_node_iter(
lambda n: n != tree.seed_node and n.is_internal()
):
ratios[node.index - 1 - tip_count] = (
heights[node.index - 1] - bounds[node.index - 1]
) / (heights[node.parent_node.index - 1] - bounds[node.index - 1])
return np.array(ratios), np.array(heights[-1:])


def heights_from_branch_lengths(tree, eps=1.0e-6):
heights = [None] * (2 * len(tree.taxon_namespace) - 1)
for node in tree.postorder_node_iter():
if node.is_leaf():
heights[node.index - 1] = node.date
else:
heights[node.index - 1] = max(
[
heights[c.index - 1] + max(eps, c.edge_length)
for c in node.child_node_iter()
]
)
return heights


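Hypothetical usage of the two helpers above, assuming the tree's nodes already carry the .index and .date attributes they read (phylostan assigns these during setup; 'data.tree' is a stand-in path):

import dendropy

tree = dendropy.Tree.get(path='data.tree', schema='newick')
# ... node.date and node.index are set (see setup_dates below) ...
ratios, root_height = ratios_root_height_from_branch_lengths(tree)
# run() additionally subtracts the root's lower bound from the height init
inits = {'props': ratios.tolist(), 'height': root_height.item()}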
def setup_dates(tree, dates=None, heterochronous=False):
@@ -90,7 +119,7 @@ def get_lowers(tree):


def get_dna_leaves_partials(alignment):
-    tipdata = numpy.zeros((len(alignment), alignment.sequence_size, 4), dtype=numpy.int)
+    tipdata = np.zeros((len(alignment), alignment.sequence_size, 4), dtype=np.int)
dna_map = {
'a': [1, 0, 0, 0],
'c': [0, 1, 0, 0],
@@ -128,7 +157,7 @@ def get_dna_leaves_partials_compressed(alignment):
if keep[i]:
weights.append(patterns[indexes[i]])

-    tipdata = numpy.zeros((len(alignment), len(patterns.keys()), 4), dtype=numpy.int)
+    tipdata = np.zeros((len(alignment), len(patterns.keys()), 4), dtype=np.int)

dna_map = {
'a': [1, 0, 0, 0],
@@ -253,8 +282,8 @@ def convert_samples_to_nexus(tree, input, output, rate=None):


def descriptive_stats(d, alpha):
-    median, low, high = numpy.quantile(d, (0.5, alpha / 2.0, 1.0 - alpha / 2.0))
-    return numpy.mean(d), median, low, high
+    median, low, high = np.quantile(d, (0.5, alpha / 2.0, 1.0 - alpha / 2.0))
+    return np.mean(d), median, low, high


def parse_log(inputfile, alpha=0.05, tree=None):
@@ -349,7 +378,7 @@ def parse_log(inputfile, alpha=0.05, tree=None):
sum_distance += n.rate * edge_length
sum_time += edge_length
d.append(sum_distance / sum_time)
-            variances.append(numpy.var(rates))
+            variances.append(np.var(rates))
mean, median, low, high = descriptive_stats(d, alpha)
print(
'Mean rate mean: {} {}% CI: ({},{})'.format(
6 changes: 3 additions & 3 deletions setup.cfg
@@ -1,10 +1,10 @@
[metadata]
name = phylostan
-version = 1.0.4
-url = https://github.com/4ment/phyostan
+version = 1.0.5
+url = https://github.com/4ment/phylostan
author = Mathieu Fourment
author_email = mathieu.fourment@uts.edu.au
-keywords = phylogenetics, variational, Bayes, Stan
+keywords = phylogenetics, variational, HMC, Bayes, Stan
description = Phylogenetic inference with Stan
long_description = file: README.md
long_description_content_type = text/markdown