diff --git a/graphium/utils/read_file.py b/graphium/utils/read_file.py deleted file mode 100644 index 27d2fb216..000000000 --- a/graphium/utils/read_file.py +++ /dev/null @@ -1,173 +0,0 @@ -""" --------------------------------------------------------------------------------- -Copyright (c) 2023 Valence Labs, Recursion Pharmaceuticals. - -Use of this software is subject to the terms and conditions outlined in the LICENSE file. -Unauthorized modification, distribution, or use is prohibited. Provided 'as is' without -warranties of any kind. - -Valence Labs, Recursion Pharmaceuticals are not liable for any damages arising from its use. -Refer to the LICENSE file for the full terms and conditions. --------------------------------------------------------------------------------- -""" - - -""" Utiles for data parsing""" -import os -import warnings -import numpy as np -import pandas as pd -import datamol as dm -from functools import partial -from copy import copy -import fsspec - -from loguru import logger -from rdkit import Chem -from rdkit.Chem.Descriptors import ExactMolWt - -from graphium.utils.tensor import parse_valid_args, arg_in_func - - -def read_file(filepath, as_ext=None, **kwargs): - r""" - Allow to read different file format and parse them into a MolecularDataFrame. - Supported formats are: - * csv (.csv, .smile, .smiles, .tsv) - * txt (.txt) - * xls (.xls, .xlsx, .xlsm, .xls*) - * sdf (.sdf) - * pkl (.pkl) - - Arguments - ----------- - - filepath: str - The full path and name of the file to read. - It also supports the s3 url path. - as_ext: str, Optional - The file extension used to read the file. If None, the extension is deduced - from the extension of the file. Otherwise, no matter the file extension, - the file will be read according to the specified ``as_ext``. - (Default=None) - **kwargs: All the optional parameters required for the desired file reader. - - TODO: unit test to make sure it works well with all extensions - - Returns - --------- - df: pandas.DataFrame - The ``pandas.DataFrame`` containing the parsed data - - """ - - # Get the file extension - if as_ext is None: - file_ext = os.path.splitext(filepath)[-1].lower()[1:] - else: - file_ext = as_ext - if not isinstance(file_ext, str): - raise TypeError("`file_type` must be a `str`. Provided: {}".format(file_ext)) - - open_mode = "r" - - # Read the file according to the right extension - if file_ext in ["csv", "smile", "smiles", "smi", "tsv"]: - file_reader = pd.read_csv - elif file_ext == "txt": - file_reader = pd.read_table - elif file_ext[0:3] == "xls": - open_mode = "rb" - file_reader = partial(pd.read_excel, engine="openpyxl") - elif file_ext == "sdf": - file_reader = parse_sdf_to_dataframe - elif file_ext == "pkl": - open_mode = "rb" - file_reader = pd.read_pickle - else: - raise 'File extension "{}" not supported'.format(file_ext) - - kwargs = parse_valid_args(fn=file_reader, param_dict=kwargs) - - if file_ext[0:3] not in ["sdf", "xls"]: - with file_opener(filepath, open_mode) as file_in: - data = file_reader(file_in, **kwargs) - else: - data = file_reader(filepath, **kwargs) - return data - - -def parse_sdf_to_dataframe(sdf_path, as_cxsmiles=True, skiprows=None): - r""" - Allows to read an SDF file containing molecular informations, convert - it to a pandas DataFrame and convert the molecules to SMILES. It also - lists a warning of all the molecules that couldn't be read. - - Arguments - ----------- - - sdf_path: str - The full path and name of the sdf file to read - as_cxsmiles: bool, optional - Whether to use the CXSMILES notation, which preserves atomic coordinates, - stereocenters, and much more. - See `https://dl.chemaxon.com/marvin-archive/latest/help/formats/cxsmiles-doc.html` - (Default = True) - skiprows: int, list - The rows to skip from dataset. The enumerate index starts from 1 insted of 0. - (Default = None) - - """ - - # read the SDF file - # locally or from s3 - data = dm.read_sdf(sdf_path) - - # For each molecule in the SDF file, read all the properties and add it to a list of dict. - # Also count the number of molecules that cannot be read. - data_list = [] - count_none = 0 - if skiprows is not None: - if isinstance(skiprows, int): - skiprows = range(0, skiprows - 1) - skiprows = np.array(skiprows) - 1 - - for idx, mol in enumerate(data): - if (skiprows is not None) and (idx in skiprows): - continue - - if (mol is not None) and (ExactMolWt(mol) > 0): - mol_dict = mol.GetPropsAsDict() - data_list.append(mol_dict) - if as_cxsmiles: - smiles = Chem.rdmolfiles.MolToCXSmiles(mol, canonical=True) - else: - smiles = dm.to_smiles(mol, canonical=True) - data_list[-1]["SMILES"] = smiles - else: - count_none += 1 - logger.info(f"Could not read molecule # {idx}") - - # Display a message or warning after the SDF is done parsing - if count_none == 0: - logger.info("Successfully read the SDF file without error: {}".format(sdf_path)) - else: - warnings.warn( - ( - 'Error reading {} molecules from the "{}" file.\ - {} molecules read successfully.' - ).format(count_none, sdf_path, len(data_list)) - ) - return pd.DataFrame(data_list) - - -def file_opener(filename, mode="r"): - """File reader stream""" - filename = str(filename) - if "w" in mode: - filename = "simplecache::" + filename - if filename.endswith(".gz"): - instream = fsspec.open(filename, mode=mode, compression="gzip") - else: - instream = fsspec.open(filename, mode=mode) - return instream diff --git a/mkdocs.yml b/mkdocs.yml index 0d9f34bfd..49da1dc14 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -63,7 +63,6 @@ extra_css: - _assets/css/custom-graphium.css extra_javascript: - - https://polyfill.io/v3/polyfill.min.js?features=es6 - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js - _assets/js/google-analytics.js diff --git a/tests/test_utils.py b/tests/test_utils.py index b6a7b171c..e059223f9 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -150,32 +150,6 @@ def test_nan_mad(self): np.testing.assert_almost_equal(torch_mad.numpy(), numpy_mad, decimal=4, err_msg=err_msg) -def test_file_opener(tmp_path): - # Create a temporary file - txt_file = tmp_path / "test.txt" - txt_file.write_text("Hello, World!") - - # Test opening file in read mode - with file_opener(txt_file, "r") as f: - assert f.read() == "Hello, World!" - - # Test opening file in write mode - with file_opener(txt_file, "w") as f: - f.write("New text") - - with file_opener(txt_file, "r") as f: - assert f.read() == "New text" - - # Create a temporary gzip file - gzip_file = tmp_path / "test.txt.gz" - with gzip.open(gzip_file, "wt") as f: - f.write("Hello, Gzip!") - - # Test opening gzip file in read mode - with file_opener(gzip_file, "r") as f: - assert f.read() == "Hello, Gzip!" - - class test_SafeRun(ut.TestCase): def test_safe_run(self): # Error is caught