diff --git a/graphium/utils/read_file.py b/graphium/utils/read_file.py
new file mode 100644
index 000000000..27d2fb216
--- /dev/null
+++ b/graphium/utils/read_file.py
@@ -0,0 +1,213 @@
+"""
+--------------------------------------------------------------------------------
+Copyright (c) 2023 Valence Labs, Recursion Pharmaceuticals.
+
+Use of this software is subject to the terms and conditions outlined in the LICENSE file.
+Unauthorized modification, distribution, or use is prohibited. Provided 'as is' without
+warranties of any kind.
+
+Valence Labs, Recursion Pharmaceuticals are not liable for any damages arising from its use.
+Refer to the LICENSE file for the full terms and conditions.
+--------------------------------------------------------------------------------
+"""
+
+
+""" Utilities for data parsing"""
+import os
+import warnings
+import numpy as np
+import pandas as pd
+import datamol as dm
+from functools import partial
+from copy import copy
+import fsspec
+
+from loguru import logger
+from rdkit import Chem
+from rdkit.Chem.Descriptors import ExactMolWt
+
+from graphium.utils.tensor import parse_valid_args, arg_in_func
+
+
+def read_file(filepath, as_ext=None, **kwargs):
+    r"""
+    Read a file and parse it into a ``pandas.DataFrame``.
+    Supported formats are:
+        * csv (.csv, .smile, .smiles, .smi, .tsv)
+        * txt (.txt)
+        * xls (.xls, .xlsx, .xlsm, .xls*)
+        * sdf (.sdf)
+        * pkl (.pkl)
+
+    Arguments
+    -----------
+
+        filepath: str
+            The full path and name of the file to read.
+            It also supports the s3 url path.
+        as_ext: str, Optional
+            The file extension used to read the file. If None, the extension is deduced
+            from the extension of the file. Otherwise, no matter the file extension,
+            the file will be read according to the specified ``as_ext``.
+            The value is case-insensitive and a leading dot is ignored.
+            (Default=None)
+        **kwargs: All the optional parameters required for the desired file reader.
+            They are filtered through ``parse_valid_args`` before reaching the reader.
+
+    TODO: unit test to make sure it works well with all extensions
+
+    Returns
+    ---------
+        df: pandas.DataFrame
+            The ``pandas.DataFrame`` containing the parsed data
+
+    Raises
+    ---------
+        TypeError: If ``as_ext`` is provided but is not a ``str``.
+        ValueError: If the file extension is not supported.
+
+    """
+
+    # Get the file extension
+    if as_ext is None:
+        file_ext = os.path.splitext(filepath)[-1]
+    elif isinstance(as_ext, str):
+        file_ext = as_ext
+    else:
+        raise TypeError("`as_ext` must be a `str`. Provided: {}".format(as_ext))
+    # Normalize so that ".CSV", "CSV" and "csv" all select the same reader
+    file_ext = file_ext.lower().lstrip(".")
+
+    open_mode = "r"
+
+    # Read the file according to the right extension
+    if file_ext in ["csv", "smile", "smiles", "smi", "tsv"]:
+        # NOTE(review): "tsv" uses the default separator of `pd.read_csv`
+        # unless `sep` is given in `kwargs`
+        file_reader = pd.read_csv
+    elif file_ext == "txt":
+        file_reader = pd.read_table
+    elif file_ext[0:3] == "xls":
+        open_mode = "rb"
+        file_reader = partial(pd.read_excel, engine="openpyxl")
+    elif file_ext == "sdf":
+        file_reader = parse_sdf_to_dataframe
+    elif file_ext == "pkl":
+        # NOTE: unpickling can execute arbitrary code, only read trusted files
+        open_mode = "rb"
+        file_reader = pd.read_pickle
+    else:
+        # A bare string cannot be raised in Python 3, use a proper exception
+        raise ValueError('File extension "{}" not supported'.format(file_ext))
+
+    kwargs = parse_valid_args(fn=file_reader, param_dict=kwargs)
+
+    # The SDF and Excel readers receive the path and open the file themselves
+    if file_ext[0:3] not in ["sdf", "xls"]:
+        with file_opener(filepath, open_mode) as file_in:
+            data = file_reader(file_in, **kwargs)
+    else:
+        data = file_reader(filepath, **kwargs)
+    return data
+
+
+def parse_sdf_to_dataframe(sdf_path, as_cxsmiles=True, skiprows=None):
+    r"""
+    Allows to read an SDF file containing molecular informations, convert
+    it to a pandas DataFrame and convert the molecules to SMILES. It also
+    lists a warning of all the molecules that couldn't be read.
+
+    Arguments
+    -----------
+
+        sdf_path: str
+            The full path and name of the sdf file to read
+        as_cxsmiles: bool, optional
+            Whether to use the CXSMILES notation, which preserves atomic coordinates,
+            stereocenters, and much more.
+            See `https://dl.chemaxon.com/marvin-archive/latest/help/formats/cxsmiles-doc.html`
+            (Default = True)
+        skiprows: int, list
+            The rows to skip from dataset. The enumerate index starts from 1 instead of 0.
+            (Default = None)
+
+    Returns
+    ---------
+        df: pandas.DataFrame
+            One row per molecule that was read successfully, built from the
+            molecule properties with the ``SMILES`` string added.
+
+    """
+
+    # read the SDF file
+    # locally or from s3
+    data = dm.read_sdf(sdf_path)
+
+    # Convert the 1-based ``skiprows`` into 0-based indices
+    # NOTE(review): for an ``int``, ``range(0, skiprows - 1)`` followed by ``- 1``
+    # yields the indices -1 .. skiprows - 3, so fewer than ``skiprows`` molecules
+    # are skipped. Kept as-is; confirm the intended meaning before changing it.
+    if skiprows is not None:
+        if isinstance(skiprows, int):
+            skiprows = range(0, skiprows - 1)
+        skiprows = np.array(skiprows) - 1
+
+    # For each molecule in the SDF file, read all the properties and add it to a list of dict.
+    # Also count the number of molecules that cannot be read.
+    data_list = []
+    count_none = 0
+    for idx, mol in enumerate(data):
+        if (skiprows is not None) and (idx in skiprows):
+            continue
+
+        if (mol is not None) and (ExactMolWt(mol) > 0):
+            mol_dict = mol.GetPropsAsDict()
+            if as_cxsmiles:
+                smiles = Chem.rdmolfiles.MolToCXSmiles(mol, canonical=True)
+            else:
+                smiles = dm.to_smiles(mol, canonical=True)
+            mol_dict["SMILES"] = smiles
+            data_list.append(mol_dict)
+        else:
+            count_none += 1
+            logger.info(f"Could not read molecule # {idx}")
+
+    # Display a message or warning after the SDF is done parsing
+    if count_none == 0:
+        logger.info("Successfully read the SDF file without error: {}".format(sdf_path))
+    else:
+        # Adjacent string literals keep the source indentation out of the message
+        warnings.warn(
+            (
+                'Error reading {} molecules from the "{}" file. '
+                "{} molecules read successfully."
+            ).format(count_none, sdf_path, len(data_list))
+        )
+    return pd.DataFrame(data_list)
+
+
+def file_opener(filename, mode="r"):
+    """
+    Open a local or remote file with ``fsspec``.
+
+    Arguments
+    -----------
+
+        filename: str
+            The path or url of the file. It is converted with ``str``, so path-like
+            objects are accepted. Names ending with ".gz" use gzip compression.
+        mode: str
+            The mode used to open the file. When it contains "w", the file is
+            opened through the ``simplecache::`` protocol of ``fsspec``.
+            (Default="r")
+
+    Returns
+    ---------
+        The object returned by ``fsspec.open``, to be used in a ``with`` statement.
+
+    """
+    filename = str(filename)
+    if "w" in mode:
+        filename = "simplecache::" + filename
+    compression = "gzip" if filename.endswith(".gz") else None
+    return fsspec.open(filename, mode=mode, compression=compression)
diff --git a/mkdocs.yml b/mkdocs.yml
index 49da1dc14..0d9f34bfd 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -63,6 +63,7 @@ extra_css:
   - _assets/css/custom-graphium.css
 
 extra_javascript:
+  - https://cdnjs.cloudflare.com/polyfill/v3/polyfill.min.js?features=es6
   - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js
   - _assets/js/google-analytics.js
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index e059223f9..b6a7b171c 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -150,6 +150,32 @@ def test_nan_mad(self):
         np.testing.assert_almost_equal(torch_mad.numpy(), numpy_mad, decimal=4, err_msg=err_msg)
 
 
+def test_file_opener(tmp_path):
+    # Create a temporary file
+    txt_file = tmp_path / "test.txt"
+    txt_file.write_text("Hello, World!")
+
+    # Test opening file in read mode
+    with file_opener(txt_file, "r") as f:
+        assert f.read() == "Hello, World!"
+
+    # Test opening file in write mode
+    with file_opener(txt_file, "w") as f:
+        f.write("New text")
+
+    with file_opener(txt_file, "r") as f:
+        assert f.read() == "New text"
+
+    # Create a temporary gzip file
+    gzip_file = tmp_path / "test.txt.gz"
+    with gzip.open(gzip_file, "wt") as f:
+        f.write("Hello, Gzip!")
+
+    # Test opening gzip file in read mode
+    with file_opener(gzip_file, "r") as f:
+        assert f.read() == "Hello, Gzip!"
+
+
 class test_SafeRun(ut.TestCase):
     def test_safe_run(self):
         # Error is caught