Skip to content

Commit

Permalink
Create calc_h2.py
Browse files Browse the repository at this point in the history
  • Loading branch information
omerwe committed Dec 16, 2014
1 parent fe6739c commit 652d0de
Showing 1 changed file with 222 additions and 0 deletions.
222 changes: 222 additions & 0 deletions calc_h2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
import numpy as np
from optparse import OptionParser
import scipy.linalg as la
import scipy.stats as stats
import scipy.linalg.blas as blas
import csv
import sklearn.linear_model
import time
import sys
import VertexCut as vc
from pysnptools.pysnptools.snpreader.bed import Bed
from fastlmm.pyplink import plink
import pysnptools.pysnptools.util.util as pyutil
np.set_printoptions(precision=3, linewidth=200)
import leapUtils

parser = OptionParser()
parser.add_option('--bfilesim', metavar='bfilesim', default=None, help='Binary plink file')
parser.add_option('--extractSim', metavar='extractSim', default=None, help='SNPs subset to use')
parser.add_option('--prev', metavar='prev', type=float, default=None, help='Trait prevalence')
parser.add_option('--numRemovePCs', metavar='numRemovePCs', type=int, default=10, help='Number of principal components to fit')
parser.add_option('--pheno', metavar='pheno', default=None, help='Phenotype file in Plink format')
parser.add_option('--eigen', metavar='eigen', default=None, help='eigen file')
parser.add_option('--related', metavar='related', default=None, help='relatedness file')

parser.add_option('--lowtail', metavar='lowtail', type=int, default=0, help='Assume that both tails of the liabilities distribution are oversampled (0 or 1 - default 0)')
parser.add_option('--h2coeff', metavar='h2coeff', type=float, default=0.85, help='Heritability coefficient (set to 1.0 for synthetic data)')
parser.add_option('--relCutoff', metavar='relCutoff', type=float, default=0.05, help='relatedness cutoff (set to negative value to override relatedness check)')
parser.add_option('--missingPhenotype', metavar='missingPhenotype', type=float, default=-9, help='identifier for missing values (default: -9)')
(options, args) = parser.parse_args()

if (options.bfilesim is None): raise Exception('--bfilesim must be supplied')
if (options.prev is None): raise Exception('--prev must be supplied')
if (options.pheno is None): raise Exception('--pheno must be supplied')


def calcLiabThreholds(U, s, keepArr, phe):

#Run logistic regression
G = U[:, :options.numRemovePCs] * np.sqrt(s[:options.numRemovePCs])
Logreg = sklearn.linear_model.LogisticRegression(penalty='l2', C=500000, fit_intercept=True)
Logreg.fit(G[keepArr, :options.numRemovePCs], phe[keepArr])

#Compute individual thresholds
Pi = Logreg.predict_proba(G[:, :options.numRemovePCs])[:,1]

#Compute thresholds and save to files
P = np.sum(phe==1) / float(phe.shape[0])
K = options.prev
Ki = K*(1-P) / (P*(1-K)) * Pi / (1 + K*(1-P) / (P*(1-K))*Pi - Pi)
thresholds = stats.norm(0,1).isf(Ki)
thresholds[Ki>=1.] = -999999999
thresholds[Ki<=0.] = 999999999

return Pi, thresholds


def calcH2Continuous_twotails(XXT, phe, keepArr):

print 'computing h2 for a two-tails ascertained study...'

XXT = XXT[np.ix_(keepArr, keepArr)]
phe = phe[keepArr]

t1 = stats.norm(0,1).ppf(options.prev)
t2 = stats.norm(0,1).isf(options.prev)
phit1 = stats.norm(0,1).pdf(t1)
phit2 = stats.norm(0,1).pdf(t2)

K1 = options.prev
K2 = options.prev

xCoeff = ((phit2*t2 - phit1*t1 + K1 + K2)**2 * (K1+K2)**2 - (phit2-phit1)**4) / (K1 + K2)**4
intersect = ((phit2-phit1) / (K1+K2))**2

pheMean = 0
pheVar = 1

x = (xCoeff * options.h2coeff) * XXT
y = np.outer((phe-pheMean)/np.sqrt(pheVar), (phe-pheMean)/np.sqrt(pheVar))
y -= intersect

y = y[np.triu_indices(y.shape[0], 1)]
x = x[np.triu_indices(x.shape[0], 1)]

slope, intercept, rValue, pValue, stdErr = stats.linregress(x,y)
return slope



def calcH2Continuous(XXT, phe, keepArr):
t = stats.norm(0,1).isf(options.prev)
phit = stats.norm(0,1).pdf(t)

K1 = 1 - options.prev
K2 = 1 - K1
P = np.sum(phe<t) / float(phe.shape[0])
P2 = 1.0
P1 = K2*P2*P / (K1*(1-P))
R = P2 / P1

XXT = XXT[np.ix_(keepArr, keepArr)]
phe = phe[keepArr]

xCoeff = (((R-1)*phit*t + K1 + R*K2)**2 * (K1+R*K2)**2 - ((R-1)*phit)**4) / (K1 + R*K2)**4
x = (xCoeff * options.h2coeff) * XXT
pheMean = 0
pheVar = 1
y = np.outer((phe-pheMean) / np.sqrt(pheVar), (phe-pheMean)/np.sqrt(pheVar))
y -= ((R-1)*phit / (K1+R*K2))**2

y = y[np.triu_indices(y.shape[0], 1)]
x = x[np.triu_indices(x.shape[0], 1)]

slope, intercept, rValue, pValue, stdErr = stats.linregress(x,y)
return slope



def calcH2Binary(XXT, phe, probs, thresholds, keepArr):
K = options.prev
P = np.sum(phe>0) / float(phe.shape[0])

XXT = XXT[np.ix_(keepArr, keepArr)]
phe = phe[keepArr]

if (thresholds is None):
t = stats.norm(0,1).isf(K)
phit = stats.norm(0,1).pdf(t)
xCoeff = P*(1-P) / (K**2 * (1-K)**2) * phit**2 * options.h2coeff
y = np.outer((phe-P) / np.sqrt(P*(1-P)), (phe-P) / np.sqrt(P*(1-P)))
x = xCoeff * XXT

else:
probs = probs[keepArr]
thresholds = thresholds[keepArr]
Ki = K*(1-P) / (P*(1-K)) * probs / (1 + K*(1-P) / (P*(1-K))*probs - probs)
phit = stats.norm(0,1).pdf(thresholds)
probsInvOuter = np.outer(probs*(1-probs), probs*(1-probs))
y = np.outer(phe-probs, phe-probs) / np.sqrt(probsInvOuter)
sumProbs = np.tile(np.column_stack(probs).T, (1,probs.shape[0])) + np.tile(probs, (probs.shape[0], 1))
Atag0 = np.outer(phit, phit) * (1 - (sumProbs)*(P-K)/(P*(1-K)) + np.outer(probs, probs)*(((P-K)/(P*(1-K)))**2)) / np.sqrt(probsInvOuter)
B0 = np.outer(Ki + (1-Ki)*(K*(1-P))/(P*(1-K)), Ki + (1-Ki)*(K*(1-P))/(P*(1-K)))
x = (Atag0 / B0 * options.h2coeff) * XXT

y = y[np.triu_indices(y.shape[0], 1)]
x = x[np.triu_indices(x.shape[0], 1)]

slope, intercept, rValue, pValue, stdErr = stats.linregress(x,y)
return slope

######## main ######

#Determine if this is a case-control study
pheOrig = np.loadtxt(options.pheno, usecols=[2])
pheOrig = pheOrig[pheOrig != options.missingPhenotype]
pheUnique = np.unique(pheOrig)
if (pheUnique.shape[0] < 2): raise Exception('Less than two different phenotypes observed')
isCaseControl = (pheUnique.shape[0] == 2)

#Read eigen file
if (options.eigen is not None): eigen = np.load(options.eigen)

#Read bfilesim and pheno file for heritability computation
loadSNPs = (options.eigen is None)
bed, phe = leapUtils.loadData(options.bfilesim, options.extractSim, options.pheno, options.missingPhenotype, loadSNPs=loadSNPs, standardize=True)

# # # #Standardize SNPs according to frq file
# # # pheThreshold = (np.mean(pheUnique) if isCaseControl else stats.norm(0,1).isf(options.prev))
# # # empMean = np.mean(bed.val, axis=0) / 2.0
# # # bed.val[:, empMean>0.5] = 2 - bed.val[:, empMean>0.5]
# # # controls = (phe<pheThreshold)
# # # mafs = bed.val[controls, :].mean(axis=0)/2.0
# # # #mafs = np.loadtxt(options.bfilesim+'.frq', usecols=[1,2]).mean(axis=1)
# # # bed.val -= 2*mafs
# # # bed.val /= np.sqrt(2*mafs*(1-mafs))


#Compute kinship matrix
if (options.eigen is None): XXT = leapUtils.symmetrize(blas.dsyrk(1.0, bed.val, lower=1)) / bed.val.shape[1]
else: XXT = eigen['XXT'] / bed.sid.shape[0]

#Compute relatedness
if (options.relCutoff <= 0): keepArr = np.ones(bed.iid.shape[0], dtype=bool)
else:
bed2 = bed
if (options.related is None):
if (options.extractSim is not None): bed2, _ = leapUtils.loadData(options.bfilesim, None, options.pheno, options.missingPhenotype, loadSNPs=True)
keepArr = leapUtils.findRelated(bed2, options.relCutoff)
else:
keepArr = leapUtils.loadRelatedFile(bed, options.related)

#Remove top PCs from kinship matrix
if (options.numRemovePCs > 0):
if (options.eigen is None): s,U = leapUtils.eigenDecompose(XXT)
else: s, U = eigen['s']**2 / bed.sid.shape[0], eigen['U']

XXT -= (U[:, :options.numRemovePCs]*s[:options.numRemovePCs]).dot(U[:, :options.numRemovePCs].T)

if isCaseControl:
print 'Computing h2 for a binary phenotype'
pheMean = phe.mean()
phe[phe <= pheMean] = 0
phe[phe > pheMean] = 1
if (options.numRemovePCs > 0):
probs, thresholds = calcLiabThreholds(U, s, keepArr, phe)
h2 = calcH2Binary(XXT, phe, probs, thresholds, keepArr)
else: h2 = calcH2Binary(XXT, phe, None, None, keepArr)
else:
print 'Computing h2 for a continuous phenotype'
if (options.lowtail == 0): h2 = calcH2Continuous(XXT, phe, keepArr)
else: h2 = calcH2Continuous_twotails(XXT, phe, keepArr)

if (h2 <= 0): raise Exception("Negative heritability found. Exitting...")
if (np.isnan(h2)): raise Exception("Invalid heritability estimate. Please double-check your input for any errors.")

print 'h2: %0.6f'%h2





0 comments on commit 652d0de

Please sign in to comment.