-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathpreprocessing.py
More file actions
98 lines (78 loc) · 3.43 KB
/
preprocessing.py
File metadata and controls
98 lines (78 loc) · 3.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import pandas as pd
import numpy as np
from pydeseq2 import preprocessing
def findLowCounts(adata, filter_type, minimum_reads, verbose=True):
    """
    Label variables with low counts in either or all samples.

    The result is written to ``adata.var['low_count']`` as a boolean column
    (True = the variable did not pass the filter). No variables are removed
    from ``adata``.

    Parameters:
        adata (AnnData): AnnData object containing the counts to be filtered.
        filter_type (str): specify the filter type. Possible values are:
            - 'all': a variable passes only if every entry of its column in
              ``adata.X`` is >= ``minimum_reads``.
            - 'sum': a variable passes if the sum of its column is
              >= ``minimum_reads``.
        minimum_reads (int): minimum number of reads.
        verbose (bool): print the number of flagged variables. Default is True.

    Returns:
        None

    Raises:
        ValueError: If ``filter_type`` is neither 'all' nor 'sum'.
    """
    # Build a positional keep-mask (one entry per variable). Working with
    # positions instead of looking variable names up afterwards keeps the
    # flags correct when var names are duplicated and avoids copying adata.
    if filter_type == 'all':
        keep = np.asarray((adata.X >= minimum_reads).all(axis=0)).ravel()
    elif filter_type == 'sum':
        keep = (adata.to_df().sum(axis=0) >= minimum_reads).to_numpy()
    else:
        raise ValueError(f'filter_type "{filter_type}" not recognized. Use "all", or "sum".')

    low_count = ~keep

    if verbose:
        n_removed = int(np.count_nonzero(low_count))
        # print the number of flagged variables
        print(
            f"{n_removed} variables with less than {minimum_reads} reads (filter_type: '{filter_type}')"
        )

    adata.var['low_count'] = low_count
def addPseudoCount(adata, behavior, value, inplace=True):
    """
    Apply a pseudocount strategy to the counts stored in an AnnData object.

    Args:
        adata (AnnData): AnnData object whose counts (``adata.to_df()``) are processed.
        behavior (str): Pseudocount strategy. Possible values are:
            - 'default' or 'zeros_only': replace every zero entry with ``value``.
            - 'all_values': add ``value`` to every entry.
            - 'filter_out': replace every zero entry with NaN.
        value (float): The pseudocount value.
        inplace (bool): If True, write the result back to ``adata.X`` and
            return nothing; if False, leave ``adata`` untouched and return
            the result.

    Returns:
        DataFrame or None: The processed counts when ``inplace`` is False,
        otherwise None.

    Raises:
        ValueError: If the pseudocount behavior is not recognized or not implemented.
    """
    # Strategies considered for zero counts:
    #   1. drop zeros, 2. add a pseudocount, 3. impute zeros (hard).
    # Source:
    # https://github.com/mhorlbeck/ScreenProcessing/blob/0ee5192ecc17348665bd1387ddfa9037efb7964f/process_experiments.py#L485
    raw = adata.to_df()

    if behavior in ('default', 'zeros_only'):
        adjusted = raw.replace(0, value)
    elif behavior == 'all_values':
        adjusted = raw + value
    elif behavior == 'filter_out':
        adjusted = raw.replace(0, np.nan)
    else:
        raise ValueError(
            'Pseudocount behavior not recognized or not implemented')

    if not inplace:
        return adjusted

    adata.X = adjusted.to_numpy()
def normalizeSeqDepth(adata):
    """
    Normalize counts by sequencing depth, modifying ``adata`` in place.

    The normalization is delegated to PyDESeq2
    (``preprocessing.deseq2_norm``), called on ``adata.X``.

    After the call:
        - ``adata.obs['size_factors']`` holds the returned size factors,
        - ``adata.layers['seq_depth_norm']`` holds the normalized counts,
        - ``adata.X`` is set to the ``'seq_depth_norm'`` layer.

    Args:
        adata (AnnData): AnnData object containing the counts to be normalized.
    """
    # PyDESeq2 returns the normalized counts together with the size factors
    normalized, factors = preprocessing.deseq2_norm(adata.X)

    # store both results on the AnnData object
    adata.obs['size_factors'] = factors
    adata.layers['seq_depth_norm'] = normalized

    # the normalized layer becomes the active count matrix
    adata.X = adata.layers['seq_depth_norm']