-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathutil.py
54 lines (46 loc) · 1.47 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
"""
Collection of auxiliary utility functions for graph-dock
"""
import pandas as pd
import random
import os
def preprocess_data(
    sample_size=50000,
    partition_ratios=(0.8, 0.1, 0.1),
    raw_data_path=os.path.join(
        "./", "data", "d4_table_name_smi_energy_hac_lte_25_title.csv"
    ),
    output_path=os.path.join("./", "data", "d4_dock_data_500k.csv"),
):
    """
    Formats data from docking data given by Lyu et al (classical docking) paper
    into format expected by training script.

    A uniform random sample of ``sample_size`` data rows is read from the raw
    CSV, the ``hac`` column is dropped, every sampled row is randomly assigned
    to the "train", "val" or "test" partition, and the result is written to
    ``output_path`` as CSV (including the DataFrame index column).

    Parameters
    ----------
    sample_size : int
        Size of train+val+test sample
    partition_ratios : Sequence[float]
        Ratios for training, validation, testing, respectively. Each
        per-partition count is truncated to an integer; any rows left over
        after truncation are assigned to "train".
    raw_data_path : os.path
        Location of raw data (CSV with a header row and a ``hac`` column)
    output_path : os.path
        Location of output data

    Raises
    ------
    ValueError
        If ``sample_size`` is negative or larger than the number of data rows
        in the raw file, or if ``partition_ratios`` yields more partition
        labels than ``sample_size``.
    """
    # Count data rows (header excluded). The context manager guarantees the
    # file handle is closed once counting is done.
    with open(raw_data_path) as raw_file:
        n = sum(1 for _ in raw_file) - 1

    if not 0 <= sample_size <= n:
        raise ValueError(
            f"sample_size must be between 0 and {n} (data rows available in "
            f"{raw_data_path!r}), got {sample_size}"
        )

    # Pick the 1-based line numbers to skip so that exactly sample_size data
    # rows survive; line 0 (the header) is never a candidate for skipping.
    skip_r = sorted(random.sample(range(1, n + 1), n - sample_size))
    df = pd.read_csv(raw_data_path, skiprows=skip_r)
    df = df.drop(["hac"], axis=1)

    # create partitions
    partitions = (
        ["train"] * int(partition_ratios[0] * sample_size)
        + ["val"] * int(partition_ratios[1] * sample_size)
        + ["test"] * int(partition_ratios[2] * sample_size)
    )

    # deal with int rounding: truncation can leave a few rows unlabelled, which
    # go to "train". Too many labels means the ratios are invalid; fail loudly
    # instead of looping forever trying to reach sample_size.
    shortfall = sample_size - len(partitions)
    if shortfall < 0:
        raise ValueError(
            f"partition_ratios {partition_ratios!r} produce {len(partitions)} "
            f"labels for a sample of {sample_size} rows; ratios must sum to "
            "at most 1"
        )
    partitions.extend(["train"] * shortfall)

    random.shuffle(partitions)
    df["partition"] = partitions
    df.to_csv(output_path)
if __name__ == "__main__":
    # Script entry point: build the sampled, partitioned dataset using the
    # default sample size, ratios and file locations.
    preprocess_data()