/*
* Filename: /Users/bao/code/allhic/model.go
* Path: /Users/bao/code/allhic
* Created Date: Friday, July 6th 2018, 10:47:29 pm
* Author: bao
*
* Copyright (c) 2018 Haibao Tang
*/

package allhic

import (
    "bufio"
    "fmt"
    "math"
    "os"
)

// LinkDensityModel is a power-law model Y = A * X ^ B that stores the coefficients.
// This density then needs to be multiplied by C - X to make it a probability
// distribution, where C is the chromosome length.
type LinkDensityModel struct {
    A, B        float64
    binStarts   []int
    binNorms    []int
    nLinks      []int
    linkDensity []float64
}
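
// As a worked example of the comment above: for a link of span X on a
// chromosome of length C, the unnormalized probability of observing that
// link is roughly
//
//  P(X) ∝ (C - X) * A * X^B
//
// since there are about C - X positions at which a link of span X can start.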

// ********* Calculation of link distribution model ************

// NewLinkDensityModel makes an empty link distribution ready to be filled in
func NewLinkDensityModel() *LinkDensityModel {
    return &LinkDensityModel{
        binStarts:   []int{},
        binNorms:    make([]int, nBins),
        nLinks:      make([]int, nBins),
        linkDensity: make([]float64, nBins),
    }
}
// writeDistribution writes the link size distribution to file
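// Each row describes one bin with six tab-separated columns: bin index,
// bin start, bin size, number of links, bin norm (assayable sequence
// length), and link density.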
func (r *LinkDensityModel) writeDistribution(outfile string) {
    f, _ := os.Create(outfile)
    w := bufio.NewWriter(f)
    _, _ = fmt.Fprintf(w, DistributionHeader)
    for i := 0; i < nBins; i++ {
        _, _ = fmt.Fprintf(w, "%d\t%d\t%d\t%d\t%d\t%.4g\n",
            i, r.binStarts[i], r.BinSize(i), r.nLinks[i], r.binNorms[i], r.linkDensity[i])
    }
    _ = w.Flush()
    log.Noticef("Link size distribution written to `%s`", outfile)
    _ = f.Close()
}

// linkBin takes a link distance and converts it to a bin ID
func (r *LinkDensityModel) linkBin(dist int) int {
    if dist < MinLinkDist {
        return -1
    }
    distOverMin := dist / MinLinkDist
    log2i := uintLog2(uint(distOverMin))
    log2f := uintLog2Frac(float64(dist) / float64(int(MinLinkDist)<<log2i))
    return int(16*log2i + log2f)
}
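
// A sketch of the binning scheme above: distances go into logarithmic bins
// with 16 sub-bins per doubling of distance, i.e. roughly
//
//  bin = floor(16 * log2(dist / MinLinkDist))
//
// assuming uintLog2 returns floor(log2(x)) and uintLog2Frac returns the
// fractional part of log2 scaled to 16 sub-bins. Distances shorter than
// MinLinkDist are rejected with a bin ID of -1.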

// BinSize returns the size of each bin
func (r *LinkDensityModel) BinSize(i int) int {
    return r.binStarts[i+1] - r.binStarts[i]
}

// makeNorms computes the normalization size for each bin
func (r *LinkDensityModel) makeNorms(contigSizes []int) {
    for _, size := range contigSizes {
        for j := 0; j < nBins; j++ {
            z := size - r.binStarts[j]
            if z < 0 {
                break
            }
            r.binNorms[j] += z
        }
    }
}
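
// For bin j, the norm is the total amount of intra-contig sequence that could
// host a link of at least binStarts[j]: a contig of length L contributes
// L - binStarts[j] candidate start positions, or nothing if it is shorter
// than the bin start.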

// makeBins makes the geometric bins into which links are later counted
// This heavily borrows the method from LACHESIS
// https://github.com/shendurelab/LACHESIS/blob/master/src/LinkSizeExtracter.cc
func (r *LinkDensityModel) makeBins() {
    // Step 1: make geometrically sized bins
    for i := 0; 16*i <= nBins; i++ {
        jpower := 1.0
        for j := 0; j < 16 && 16*i+j <= nBins; j++ {
            binStart := MinLinkDist << uint(i)
            r.binStarts = append(r.binStarts, int(float64(binStart)*jpower))
            jpower *= GeometricBinSize
        }
    }
}
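
// Under this scheme, bin k starts at approximately MinLinkDist * 2^(k/16),
// assuming GeometricBinSize is the 16th root of 2, so that after 16 geometric
// steps the bin start has exactly doubled and the next power-of-two shift
// takes over.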

// countBinDensities counts the links in each bin and divides the counts by the
// bin norm and bin size
func (r *LinkDensityModel) countBinDensities(contigs []*ContigInfo) {
    maxLinkDist := math.MinInt32
    for _, contig := range contigs {
        for _, link := range contig.links {
            if link > maxLinkDist {
                maxLinkDist = link
            }
        }
    }

    // Step 2: calculate assayable sequence length
    // Find the length of assayable intra-contig sequence in each bin
    intraContigLinkRange := math.Log2(float64(maxLinkDist) / float64(MinLinkDist))
    nIntraContigBins := int(math.Ceil(intraContigLinkRange * 16))
    if nIntraContigBins > len(r.nLinks) {
        nIntraContigBins = len(r.nLinks)
    }

    // Step 3: loop through all links and tabulate the counts
    for _, contig := range contigs {
        for _, link := range contig.links {
            bin := r.linkBin(link)
            if bin == -1 || bin >= len(r.nLinks) {
                continue
            }
            r.nLinks[bin]++
        }
    }

    // Step 4: normalize to calculate link density
    for i := 0; i < nIntraContigBins; i++ {
        r.linkDensity[i] = float64(r.nLinks[i]) / float64(r.binNorms[i]) / float64(r.BinSize(i))
    }

    // Step 5: in LACHESIS, we assume the distribution approximates 1/x
    // for large x. This is not accurate. We should instead infer a power-law
    // distribution.
    topBin := nIntraContigBins - 1
    nTopLinks := 0
    nTopLinksNeeded := len(contigs[0].links) / 100
    for ; nTopLinks < nTopLinksNeeded; topBin-- {
        nTopLinks += r.nLinks[topBin]
    }

    Xs := make([]int, 0)
    Ys := make([]float64, 0)
    for i := 0; i < topBin; i++ {
        if r.nLinks[i] == 0 { // This would produce NaN in the regression
            continue
        }
        Xs = append(Xs, r.binStarts[i])
        Ys = append(Ys, r.linkDensity[i])
    }
    r.fitPowerLaw(Xs, Ys)

    // Overwrite the values of the last few bins, as well as any bin with
    // missing values
    for i := 0; i < nBins; i++ {
        if r.linkDensity[i] == 0 || i >= topBin {
            r.linkDensity[i] = r.transformPowerLaw(r.binStarts[i])
        }
    }
}
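
// Note on Steps 4-5: densities are only estimated directly for the bins that
// are well sampled. The largest-distance bins, counted back from the top until
// they jointly hold about 1% as many links as the first contig, plus any bin
// with no observed links, are instead filled in from the fitted power law to
// avoid noisy or undefined estimates.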

// fitPowerLaw fits a power-law distribution
// See reference: http://mathworld.wolfram.com/LeastSquaresFittingPowerLaw.html
// Assumes the form Y = A * X^B and stores the coefficients (A, B) on the model
func (r *LinkDensityModel) fitPowerLaw(Xs []int, Ys []float64) {
    SumLogXLogY, SumLogXLogX, SumLogX, SumLogY := 0.0, 0.0, 0.0, 0.0
    n := len(Xs)
    for i := 0; i < n; i++ {
        logXs, logYs := math.Log(float64(Xs[i])), math.Log(Ys[i])
        SumLogXLogY += logXs * logYs
        SumLogXLogX += logXs * logXs
        SumLogX += logXs
        SumLogY += logYs
    }

    B := (float64(n)*SumLogXLogY - SumLogX*SumLogY) / (float64(n)*SumLogXLogX - SumLogX*SumLogX)
    A := math.Exp((SumLogY - B*SumLogX) / float64(n))
    r.A, r.B = A, B
    log.Noticef("Power law Y = %.3g * X ^ %.4f", A, B)
}
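
// The fit above is ordinary least squares in log-log space: taking logs of
// Y = A * X^B gives log Y = log A + B*log X, so with lx = log X and ly = log Y,
//
//  B = (n*Σ(lx*ly) - Σlx*Σly) / (n*Σ(lx*lx) - (Σlx)^2)
//  A = exp((Σly - B*Σlx) / n)
//
// which is exactly what the sums accumulated in the loop compute.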

// transformPowerLaw interpolates the probability value given a link size
func (r *LinkDensityModel) transformPowerLaw(X int) float64 {
    return r.A * math.Pow(float64(X), r.B)
}

// tranformLogProb calculates the log-probability given a link size
func (r *LinkDensityModel) tranformLogProb(X int) float64 {
    // The following two versions have subtle differences; the first one is more
    // accurate, but in reality the difference appears to be negligible
    // return math.Log(float64(r.Seqsize-X)) + r.model.B*math.Log(float64(X))
    return r.B * math.Log(float64(X))
}
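
// buildLinkDensityModel is a minimal usage sketch (not part of the original
// API) showing one plausible order for the calls above: bins must exist before
// norms and link counts can be accumulated, and the power-law fit happens
// inside countBinDensities before the model is queried or written out.
func buildLinkDensityModel(contigs []*ContigInfo, contigSizes []int, outfile string) *LinkDensityModel {
    m := NewLinkDensityModel()
    m.makeBins()                 // geometric bin boundaries
    m.makeNorms(contigSizes)     // assayable sequence length per bin
    m.countBinDensities(contigs) // link counts, densities and power-law fit
    m.writeDistribution(outfile) // dump the per-bin table for inspection
    return m
}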