forked from vsmolyakov/ml_algo_in_depth
-
Notifications
You must be signed in to change notification settings - Fork 0
/
kde.py
74 lines (58 loc) · 2.9 KB
/
kde.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(14)
class KDE():
def __init__(self):
#Histogram and Gaussian Kernel Estimator used to
#analyze RNA-seq data for flux estimation of a T7 promoter
self.G = 1e9 #length of genome in base pairs (bp)
self.C = 1e3 #number of unique molecules
self.L = 100 #length of a read, bp
self.N = 1e6 #number of reads, L bp long
self.M = 1e4 #number of unique read sequences, bp
self.LN = 1000 #total length of assembled / mapped RNA-seq reads
self.FDR = 0.05 #false discovery rate
#uniform sampling (poisson model)
self.lmbda = (self.N * self.L) / self.G #expected number of bases covered
self.C_est = self.M/(1-np.exp(-self.lmbda)) #library size estimate
self.C_cvrg = self.G - self.G * np.exp(-self.lmbda) #base coverage
self.N_gaps = self.N * np.exp(-self.lmbda) #number of gaps (uncovered bases)
#gamma prior sampling (negative binomial model)
#X = "number of failures before rth success"
self.k = 0.5 # dispersion parameter (fit to data)
self.p = self.lmbda/(self.lmbda + 1/self.k) # success probability
self.r = 1/self.k # number of successes
#RNAP binding data (RNA-seq)
self.data = np.random.negative_binomial(self.r, self.p, size=self.LN)
def histogram(self):
self.bin_delta = 1 #smoothing parameter
self.bin_range = np.arange(1, np.max(self.data), self.bin_delta)
self.bin_counts, _ = np.histogram(self.data, bins=self.bin_range)
#histogram density estimation
#P = integral_R p(x) dx, where X is in R^3
#p(x) = K/(NxV), where K=number of points in region R
#N=total number of points, V=volume of region R
rnap_density_est = self.bin_counts/(sum(self.bin_counts) * self.bin_delta)
return rnap_density_est
def kernel(self):
#Gaussian kernel density estimator with smoothing parameter h
#sum N Guassians centered at each data point, parameterized by common std dev h
x_dim = 1 #dimension of x
h = 10 #standard deviation
rnap_density_support = np.arange(np.max(self.data))
rnap_density_est = 0
for i in range(np.sum(self.bin_counts)):
rnap_density_est += (1/(2*np.pi*h**2)**(x_dim/2.0))*np.exp(-(rnap_density_support - self.data[i])**2 / (2.0*h**2))
#end for
rnap_density_est = rnap_density_est / np.sum(rnap_density_est)
return rnap_density_est
if __name__ == "__main__":
kde = KDE()
est1 = kde.histogram()
est2 = kde.kernel()
plt.figure()
plt.plot(est1, '-b', label='histogram')
plt.plot(est2, '--r', label='gaussian kernel')
plt.title("RNA-seq density estimate based on negative binomial model")
plt.xlabel("read length, [base pairs]"); plt.ylabel("density"); plt.legend()
plt.show()