-
Notifications
You must be signed in to change notification settings - Fork 104
/
Copy pathfind-peaks.py
131 lines (107 loc) · 4.82 KB
/
find-peaks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"""
Find peak-regions given a file of values (likely -log10(p)).
%prog [options] data.file
e.g.
%prog --skip 1000 --seed 5 --threshold 3 my.data.txt
will seed on values in the 2nd column that are larger than `5` and
extend as long as it continues to find values greater than `3` within `1000`
basepairs in either direction--where the locations is determined by the 1st
column.
if the -g options is used. The columns are: "chromosome" "position" "value"
otherwise, they are: "position" "value".
The file must be be sorted by columns 1, 2 with `-g` and column 1 without `-g`
If --keep-cols is specified the final output column includes values from each
specified column *and* the value column (3rd column). for any rows above
the threshold.
"""
import argparse
from itertools import groupby
from operator import itemgetter
from toolshed import reader
def gen_regions(fh, skip, seed, threshold, group, keep_cols, report_cutoff):
if group == False:
def fhgen(): # insert None so that they all get grouped the same...
for row in fh:
row.insert(0, None)
yield row
fhiter = fhgen()
else:
fhiter = fh
# turn the indexes into a function that returns their values.
if keep_cols:
# also keep the p-value...
keep_cols.append(2)
col_getter = itemgetter(*keep_cols)
else:
keep_cols, col_getter = None, None
for key, grouped in groupby(fhiter, itemgetter(0)):
for region in find_region(grouped, skip, seed, threshold, col_getter,
report_cutoff):
yield key, region
def get_and_clear_region(region, col_getter, cutoff):
start, end = region[0][0], region[-1][0]
# r looks like: (67390903, ['chr10', '673903', '3.831', 'mm9-10-67390903'])
rows = (r[1] for r in region if float(r[1][2]) > cutoff)
extra = "|".join([",".join(col_getter(r)) for r in rows] if col_getter else [])
l = len(region)
region[:] = []
return start, end, l, extra
def find_region(aiter, skip, seed, threshold, col_getter, report_cutoff):
current_region = []
seeded = False
for row in aiter:
chrom, pos, val = row[:3]
pos = int(pos)
val = float(val)
# first check if we are too far away to continue the region.
if seeded and pos - current_region[-1][0] > skip:
yield get_and_clear_region(current_region, col_getter,
report_cutoff)
assert current_region == []
seeded = False
elif current_region != [] and pos - current_region[-1][0] > skip:
current_region = []
assert seeded == False
# if it's greater than the seed, everything's easy.
if val >= seed:
current_region.append((pos, row))
seeded = True
elif val >= threshold:
current_region.append((pos, row))
else:
# nothing, it's not a large value
pass
if current_region and seeded:
yield get_and_clear_region(current_region, col_getter, report_cutoff)
def main():
p = argparse.ArgumentParser(__doc__)
p.add_argument("-g", dest="group", help="group by the first column (usually"
" chromosome or probe) if this [optional]", default=False,
action="store_true")
p.add_argument("--skip", dest="skip", help="Maximum number of intervening "
"basepairs to skip before seeing a value. If this number is "
"exceeded, the region is ended chromosome or probe "
"[default: %default]", type=int, default=50000)
p.add_argument("--min-region-size", dest="min-region", help="minimum "
"length of the region. regions shorter than this are not printed"
"[default: %default] (no minimum)", type=int, default=0)
p.add_argument("--seed", dest="seed", help="A value must be at least this"
" large in order to seed a region. [default: %default]",
type=float, default=5.0)
p.add_argument("--keep-cols", dest="keep", help="comma separated list of"
"columns to add to the output data", default="")
p.add_argument("--threshold", dest="threshold", help="After seeding, a value"
"of at least this number can extend a region [default: "
"%default]", type=float, default=3.0)
p.add_argument("regions")
args = p.parse_args()
f = reader(args.regions, header=False, sep="\t")
keep = [int(k) for k in args.keep.strip().split(",") if k]
report_cutoff = args.seed
for key, region in gen_regions(f, args.skip, args.seed, args.threshold,
args.group, keep, report_cutoff):
print key + "\t" + "\t".join(map(str, region))
if __name__ == "__main__":
import doctest
if doctest.testmod(optionflags=doctest.ELLIPSIS).failed == 0:
main()