-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathvalidate_data.py
executable file
·61 lines (44 loc) · 1.86 KB
/
validate_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python
import argparse
import logging
import os
import sys
import pandas as pd
# Timestamped message layout: day-month-year followed by a 24-hour clock.
# NOTE(review): nothing visible in this file attaches this formatter to a
# handler -- confirm whether it is meant to be installed on the root handler.
formatter = logging.Formatter(
    fmt="%(asctime)s %(message)s",
    datefmt="%d-%m-%Y %H:%M:%S",
)

# Root logger set-up: default handler configuration, INFO and above emitted.
logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.INFO)
def is_unique(df: pd.DataFrame, column: str) -> None:
    """Abort the program unless every value in ``column`` is distinct.

    Args:
        df (pd.DataFrame): Dataset to inspect.
        column (str): Name of the column whose values must not repeat.

    Raises:
        SystemExit: If ``column`` is absent from ``df`` or holds at least one
            repeated value; the exit message names the column.
    """
    # A missing column is a validation failure as well: report it cleanly
    # instead of letting pandas surface a bare KeyError traceback.
    if column not in df.columns:
        sys.exit(f"The {column} column is missing, please make sure it is present.")

    # .any() states the intent directly instead of counting the duplicates.
    if df[column].duplicated().any():
        sys.exit(f"The {column} is not unique, please make sure there are no duplicates.")
def _read_table(folder: str, filename: str) -> pd.DataFrame:
    """Load one tab-separated table from ``folder``, exiting if it is missing."""
    path = os.path.join(folder, filename)
    if not os.path.isfile(path):
        sys.exit(f"Required file {path} does not exist!")
    return pd.read_csv(path, sep="\t")


def main(args: argparse.Namespace):
    """Validate the metadata tables found in the input folder.

    Reads ``amp_batches.txt``, ``seq_batches.txt`` and ``wells_cells.txt``
    (tab-separated) from ``args.input`` and checks that:

    * ``Well_ID``, ``Amp_batch_ID`` and ``Seq_batch_ID`` are unique in the
      wells, amplification and sequencing tables respectively;
    * ``Cell_barcode`` is unique within each ``Amp_batch_ID`` of the wells
      table;
    * the ``Seq_batch_ID`` values of the amplification table and of the
      sequencing table are the same set.

    Args:
        args (argparse.Namespace): Parsed arguments; ``args.input`` is the
            folder holding the three tables.

    Raises:
        SystemExit: On the first failed check, with a message describing it.
    """
    # argparse yields None when --input is omitted; fail with a clear message
    # rather than a TypeError from the path check below.
    if not args.input:
        sys.exit("No input folder provided, please use --input.")
    # isdir (not exists) so that a regular file is rejected here as well.
    if not os.path.isdir(args.input):
        sys.exit(f"Provided {args.input} does not exist or is not a directory!")

    amp = _read_table(args.input, "amp_batches.txt")
    seq = _read_table(args.input, "seq_batches.txt")
    wells = _read_table(args.input, "wells_cells.txt")

    logging.info("Checking Well_ID, Amp_batch_ID, Seq_batch_ID")
    is_unique(wells, "Well_ID")
    is_unique(amp, "Amp_batch_ID")
    is_unique(seq, "Seq_batch_ID")

    logging.info("Checking if barcodes are non-unique in batches")
    # One pass over the table instead of re-filtering it for every batch;
    # sort=False keeps the batches in order of first appearance.
    for _, batch_wells in wells.groupby("Amp_batch_ID", sort=False):
        is_unique(batch_wells, "Cell_barcode")

    # Both directions of a mismatch still abort, but each one now gets a
    # message that matches what is actually wrong.
    amp_seq_ids = set(amp["Seq_batch_ID"].unique())
    seq_ids = set(seq["Seq_batch_ID"].unique())
    if amp_seq_ids - seq_ids:
        sys.exit("Some amplification batches are linked to undefined Seq_batch_ID")
    if seq_ids - amp_seq_ids:
        sys.exit("Some Seq_batch_ID are not linked to any amplification batch")

    logging.info("Validation: passed")
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and run the validation.
    parser = argparse.ArgumentParser(description="Validate data folder which contains all txt files.")
    # Plain string literal: the version text has no placeholders to format.
    parser.add_argument("--version", "-v", action="version", version="v1.0")
    # --input is mandatory so that an omitted folder is rejected by argparse
    # with a usage message instead of reaching main() as None.
    parser.add_argument("--input", type=str, required=True, help="Input folder")
    args = parser.parse_args()
    main(args)