-
Notifications
You must be signed in to change notification settings - Fork 192
/
Copy pathstats.py
235 lines (211 loc) · 13.3 KB
/
stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
from __future__ import absolute_import
import os
import csv
import shutil
import datetime
import numpy as np
from collections import Counter
from . import util, files
def OrthogroupsMatrix(iSpecies, properOGs):
speciesIndexDict = {iSp: iCol for iCol, iSp in enumerate(iSpecies)}
nSpecies = len(iSpecies)
nGroups = len(properOGs)
# (i, j)-th entry of ogMatrix gives the number of genes from i in orthogroup j
ogMatrix = np.zeros((nGroups, nSpecies))
for i_og, og in enumerate(properOGs):
for species, _ in og:
ogMatrix[i_og, speciesIndexDict[species]] += 1
return ogMatrix
def Stats_SpeciesOverlaps(fn, speciesNamesDict, iSpecies, speciesPresence):
""" Number of orthogroups in which each species-pair is present. Called by Stats"""
with open(fn, util.csv_write_mode) as outfile:
writer = csv.writer(outfile, delimiter="\t")
writer.writerow([""] + [speciesNamesDict[index] for index in iSpecies])
for iSp in iSpecies:
overlap = [len([1 for og in speciesPresence if (iSp in og and jSp in og)]) for jSp in iSpecies]
writer.writerow([speciesNamesDict[iSp]] + overlap)
def Stats_SizeTable(writer_sum, writer_sp, properOGs, allGenesCounter, iSpecies, speciesPresence):
""" Overall and per-species histogram tables of orthogroup sizes. Called by Stats"""
bins = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 21, 51, 101, 151, 201, 501, 1001, 9e99]
writer_sum.writerow([])
writer_sp.writerow([])
nGenesPerOG = [len(og) for og in properOGs]
nGenesPerOGPerSpecies = [[len([1 for g in og if g[0] == iSp]) for og in properOGs] for iSp in iSpecies]
counters_GenesPerOGPerSpecies = [Counter(spCount) for spCount in nGenesPerOGPerSpecies]
counters_GenesPerOG = Counter(nGenesPerOG)
nSp = len(iSpecies)
nOGs = len(properOGs)
nGenesTotal = sum([len(og) for og in properOGs])
percentFormat = "%0.1f"
writer_sum.writerow(
["Average number of genes per-species in orthogroup", "Number of orthogroups", "Percentage of orthogroups",
"Number of genes", "Percentage of genes"])
# Per-species tables
table_NO = [["Number of genes per-species in orthogroup"] + ["Number of orthogroups" for _ in
iSpecies]] # number of orthogroups
table_PO = [["Number of genes per-species in orthogroup"] + ["Percentage of orthogroups" for _ in
iSpecies]] # percentage of orthogroups
table_NG = [
["Number of genes per-species in orthogroup"] + ["Number of genes" for _ in iSpecies]] # Number of genes
table_PG = [["Number of genes per-species in orthogroup"] + ["Percentage of genes" for _ in
iSpecies]] # percentage of genes
for start, end in zip(bins, bins[1:]):
binName = "<1" if start == 0 else (
"'%d" % start) if start + 1 == end else "'%d+" % start if end == 9e99 else "%d-%d" % (
start, end - 1)
nOrthogroups = sum([count for size, count in counters_GenesPerOG.items() if start * nSp <= size < end * nSp])
nGenes = sum([size * count for size, count in counters_GenesPerOG.items() if start * nSp <= size < end * nSp])
row_sum = [binName, nOrthogroups, percentFormat % (100. * nOrthogroups / nOGs), nGenes,
percentFormat % (100. * nGenes / nGenesTotal)]
writer_sum.writerow(row_sum)
# Per-species stats
if binName == "<1": binName = "'0"
nOrthogroups_ps = [sum([number for size, number in c.items() if start <= size < end]) for c in
counters_GenesPerOGPerSpecies]
nGenes_ps = [sum([size * number for size, number in c.items() if start <= size < end]) for c in
counters_GenesPerOGPerSpecies]
table_NO.append([binName] + nOrthogroups_ps)
table_PO.append([binName] + [percentFormat % (100. * n / nOGs) for n in nOrthogroups_ps])
table_NG.append([binName] + nGenes_ps)
table_PG.append(
[binName] + [percentFormat % (100. * n / allGenesCounter[iSp]) for iSp, n in zip(iSpecies, nGenes_ps)])
writer_sp.writerow([])
for r in table_NO: writer_sp.writerow(r)
writer_sp.writerow([])
for r in table_PO: writer_sp.writerow(r)
writer_sp.writerow([])
for r in table_NG: writer_sp.writerow(r)
writer_sp.writerow([])
for r in table_PG: writer_sp.writerow(r)
# Species presence
n = list(map(len, speciesPresence))
writer_sum.writerow([])
writer_sum.writerow(["Number of species in orthogroup", "Number of orthogroups"])
for i in range(1, nSp + 1):
writer_sum.writerow([i, n.count(i)])
def Stats(ogs, speciesNamesDict, iSpecies, iResultsVersion, fastaWriter, ids_dict, q_fast_add=False):
""" Top-level method for calculation of stats for the orthogroups"""
allOgs = [[list(map(int, g.split("_"))) for g in og] for og in ogs]
properOGs = [og for og in allOgs if len(og) > 1]
iogs_properOGs = [iog for iog, og in enumerate(allOgs) if len(og) > 1]
allGenes = [g for og in allOgs for g in og]
ogStatsResultsDir = files.FileHandler.GetOGsStatsResultsDirectory()
filename_sp = ogStatsResultsDir + "Statistics_PerSpecies" + (
"" if iResultsVersion == 0 else "_%d" % iResultsVersion) + ".tsv"
filename_sum = ogStatsResultsDir + "Statistics_Overall" + (
"" if iResultsVersion == 0 else "_%d" % iResultsVersion) + ".tsv"
filename_overlap = ogStatsResultsDir + "Orthogroups_SpeciesOverlaps" + (
"" if iResultsVersion == 0 else "_%d" % iResultsVersion) + ".tsv"
filename_single_copy = files.FileHandler.GetOrthogroupResultsFNBase() + "_SingleCopyOrthologues.txt"
percentFormat = "%0.1f"
with open(filename_sp, util.csv_write_mode) as outfile_species, open(filename_sum, util.csv_write_mode) as outfile_sum:
writer_sp = csv.writer(outfile_species, delimiter="\t")
writer_sum = csv.writer(outfile_sum, delimiter="\t")
# header
writer_sp.writerow([""] + [speciesNamesDict[index] for index in iSpecies])
# Number of genes
allGenesCounter = Counter([g[0] for g in allGenes])
nGenes = sum(allGenesCounter.values())
writer_sum.writerow(["Number of species", len(iSpecies)])
writer_sp.writerow(["Number of genes"] + [allGenesCounter[iSp] for iSp in iSpecies])
writer_sum.writerow(["Number of genes", nGenes])
# Number of assigned/unassigned genes
assignedGenesCounter = Counter([g[0] for og in properOGs for g in og])
nAssigned = sum(assignedGenesCounter.values())
writer_sp.writerow(["Number of genes in orthogroups"] + [assignedGenesCounter[iSp] for iSp in iSpecies])
writer_sum.writerow(["Number of genes in orthogroups"] + [nAssigned])
writer_sp.writerow(
["Number of unassigned genes"] + [allGenesCounter[iSp] - assignedGenesCounter[iSp] for iSp in iSpecies])
writer_sum.writerow(["Number of unassigned genes"] + [nGenes - nAssigned])
# Percentages
pAssigned = 100. * nAssigned / nGenes
writer_sp.writerow(["Percentage of genes in orthogroups"] + [
percentFormat % (100. * assignedGenesCounter[iSp] / allGenesCounter[iSp]) for iSp in iSpecies])
writer_sum.writerow(["Percentage of genes in orthogroups", percentFormat % pAssigned])
writer_sp.writerow(["Percentage of unassigned genes"] + [
percentFormat % (100 * (1. - (float(assignedGenesCounter[iSp]) / allGenesCounter[iSp]))) for iSp in
iSpecies])
writer_sum.writerow(
["Percentage of unassigned genes", percentFormat % (100 * (1. - (float(nAssigned) / nGenes)))])
# Number of Orthogroups
speciesPresence = [set([g[0] for g in og]) for og in properOGs]
nOgs = len(properOGs)
writer_sum.writerow(["Number of orthogroups", nOgs])
writer_sp.writerow(
["Number of orthogroups containing species"] + [sum([iSp in og_sp for og_sp in speciesPresence]) for iSp in
iSpecies])
writer_sp.writerow(["Percentage of orthogroups containing species"] + [percentFormat % (
(100. * sum([iSp in og_sp for og_sp in speciesPresence]) / len(properOGs)) if len(properOGs) > 0 else 0.)
for iSp in iSpecies])
# Species specific orthogroups - orthogroups-based
speciesSpecificOGsCounter = Counter([next(iter(og_sp)) for og_sp in speciesPresence if len(og_sp) == 1])
writer_sp.writerow(
["Number of species-specific orthogroups"] + [speciesSpecificOGsCounter[iSp] for iSp in iSpecies])
writer_sum.writerow(["Number of species-specific orthogroups", sum(speciesSpecificOGsCounter.values())])
# Species specific orthogroups - gene-based
iSpeciesSpecificOGs = [i for i, og_sp in enumerate(speciesPresence) if len(og_sp) == 1]
iSpSpecificOGsGeneCounts = [
sum([len(properOGs[iog]) for iog in iSpeciesSpecificOGs if properOGs[iog][0][0] == iSp]) for iSp in
iSpecies]
writer_sp.writerow(["Number of genes in species-specific orthogroups"] + iSpSpecificOGsGeneCounts)
writer_sum.writerow(["Number of genes in species-specific orthogroups", sum(iSpSpecificOGsGeneCounts)])
writer_sp.writerow(["Percentage of genes in species-specific orthogroups"] + [
percentFormat % (100. * n_ss / allGenesCounter[iSp]) for n_ss, iSp in
zip(iSpSpecificOGsGeneCounts, iSpecies)])
writer_sum.writerow(["Percentage of genes in species-specific orthogroups",
percentFormat % (100. * sum(iSpSpecificOGsGeneCounts) / nGenes)])
# 'averages'
l = list(sorted(list(map(len, properOGs))))
writer_sum.writerow(["Mean orthogroup size", "%0.1f" % np.mean(l)])
writer_sum.writerow(["Median orthogroup size", np.median(l)])
L = np.cumsum(l)
j, _ = next((i, x) for i, x in enumerate(L) if x > nAssigned / 2)
writer_sum.writerow(["G50 (assigned genes)", l[j]])
l2 = list(reversed(list(map(len, ogs))))
L2 = np.cumsum(l2)
j2, _ = next((i, x) for i, x in enumerate(L2) if x > nGenes / 2)
G50 = l2[j2]
writer_sum.writerow(["G50 (all genes)", G50])
writer_sum.writerow(["O50 (assigned genes)", len(l) - j])
O50 = len(l2) - j2
writer_sum.writerow(["O50 (all genes)", O50])
# Single-copy orthogroups
ogMatrix = OrthogroupsMatrix(iSpecies, properOGs) # use iogs_properOGs for indexing
nSpecies = len(iSpecies)
nPresent = (ogMatrix > np.zeros((1, nSpecies))).sum(1)
nCompleteOGs = list(nPresent).count(nSpecies)
singleCopyOGs = (ogMatrix == np.ones((1, nSpecies))).all(1).nonzero()[0]
nSingleCopy = len(singleCopyOGs)
writer_sum.writerow(["Number of orthogroups with all species present", nCompleteOGs])
writer_sum.writerow(["Number of single-copy orthogroups", nSingleCopy])
with open(filename_single_copy, 'w') as outfile_singlecopy:
outfile_singlecopy.write("\n".join(["N0.HOG%07d" % iogs_properOGs[i_] for i_ in singleCopyOGs]))
# Link single-copy orthologues
g_fmt = files.FileHandler.GetResultsSeqsDir_SingleCopy() + "N0.HOG%07d.fa"
for i in singleCopyOGs:
out_fn = g_fmt % iogs_properOGs[i]
og = ["%d_%d" % tuple(g) for g in properOGs[i]]
fastaWriter.WriteSeqsToFasta_withNewAccessions(og, out_fn, ids_dict)
# Results filenames
writer_sum.writerow(["Date", str(datetime.datetime.now()).split()[0]])
writer_sum.writerow(
["Orthogroups file", "Orthogroups" + ("" if iResultsVersion == 0 else "_%d" % iResultsVersion) + ".tsv"])
writer_sum.writerow(["Unassigned genes file", "Orthogroups" + (
"" if iResultsVersion == 0 else "_%d" % iResultsVersion) + "_UnassignedGenes.tsv"])
writer_sum.writerow(["Per-species statistics", os.path.split(filename_sp)[1]])
writer_sum.writerow(["Overall statistics", os.path.split(filename_sum)[1]])
writer_sum.writerow(["Orthogroups shared between species", os.path.split(filename_overlap)[1]])
# Sizes
Stats_SizeTable(writer_sum, writer_sp, properOGs, allGenesCounter, iSpecies, speciesPresence)
Stats_SpeciesOverlaps(filename_overlap, speciesNamesDict, iSpecies, speciesPresence)
summaryText = """OrthoFinder assigned %d genes (%0.1f%% of total) to %d orthogroups. Fifty percent of genes were in orthogroups with %d or more genes (G50 was %d) and were contained in the largest %d orthogroups (O50 was %d). There were %d orthogroups with all species present and %d of these consisted entirely of single-copy genes.""" % (
nAssigned, pAssigned, nOgs, G50, G50, O50, O50, nCompleteOGs, nSingleCopy)
if q_fast_add:
print("The majority of genes have been assigned to existing orthogroups, however, the remaining clade-specific genes not seen in the core species were also analysed with the following results:")
print(summaryText)
def add_unassigned_genes(ogs, all_seq_ids):
"""Extend OGs with unassigned genes as singletons"""
all_assigned = set([g for og in ogs for g in og])
unassigned = set(all_seq_ids).difference(all_assigned)
ogs.extend([{g,} for g in unassigned])
return ogs