Merge pull request #208 from datamol-io/more_datasets

Add two more datasets + ruff linting
datamol-io · Jul 27, 2023 · 2619c61 · 2619c61
2 parents fc0431b + be52130
commit 2619c61
Show file tree

Hide file tree

Showing 27 changed files with 4,063 additions and 62 deletions.
diff --git a/.github/workflows/code-check.yml b/.github/workflows/code-check.yml
@@ -29,8 +29,8 @@ jobs:
       - name: Lint
         run: black --check .
 
-  python-typing-mypy:
-    name: Python typing check [mypy]
+  python-lint-ruff:
+    name: Python lint [ruff]
     runs-on: ubuntu-latest
     steps:
       - name: Checkout the code
@@ -41,15 +41,9 @@ jobs:
         with:
           python-version: "3.10"
 
-      - name: Install mypy
+      - name: Install ruff
         run: |
-          pip install mypy numpy pandas loguru pytest pillow scipy
+          pip install ruff
 
-      - name: Run code check
-        run: |
-          mypy . || exitCode=$?
-
-          # only fails if exit code >=2
-          if [ $exitCode -ge 2 ]; then
-            exit $exitCode
-          fi
+      - name: Lint
+        run: ruff .
diff --git a/datamol/__init__.py b/datamol/__init__.py
@@ -33,6 +33,8 @@
     "freesolv": "datamol.data",
     "cdk2": "datamol.data",
     "solubility": "datamol.data",
+    "chembl_drugs": "datamol.data",
+    "chembl_samples": "datamol.data",
     # log
     "enable_rdkit_log": "datamol.log",
     "disable_rdkit_log": "datamol.log",
@@ -224,6 +226,8 @@ def __dir__():
     from .data import freesolv
     from .data import cdk2
     from .data import solubility
+    from .data import chembl_drugs
+    from .data import chembl_samples
 
     from .log import enable_rdkit_log
     from .log import disable_rdkit_log

diff --git a/datamol/conformers/_features.py b/datamol/conformers/_features.py
@@ -1,11 +1,9 @@
-from typing import Optional, Union
+from typing import Union
 from typing import List
 from typing import Optional
 
 import numpy as np
 
-from rdkit import Chem
-
 from ..types import Mol
 from ..utils.jobs import JobRunner
 from ..utils import decorators

diff --git a/datamol/data/__init__.py b/datamol/data/__init__.py
@@ -10,14 +10,13 @@
 from typing import List
 from typing import overload
 from typing import Literal
-from typing import cast
 
 import sys
 import io
 
 try:
     import importlib.resources as importlib_resources
-except:
+except ImportError:
     import importlib_resources
 
 import pandas as pd
@@ -170,3 +169,53 @@ def solubility(as_df: bool = True, mol_column: Optional[str] = "mol"):
         return data
 
     return from_df(data, mol_column=mol_column)
+
+
+@overload
+def chembl_drugs(as_df: Literal[True] = True) -> pd.DataFrame:
+    ...
+
+
+@overload
+def chembl_drugs(as_df: Literal[False] = False) -> List[Mol]:
+    ...
+
+
+def chembl_drugs(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]:
+    """Load ~2k molecules from ChEMBL (all drugs), read from `chembl_drugs.csv` into a DataFrame.
+
+    If `as_df` is false, the DataFrame is passed through `from_df` and that result is returned instead. Originally proposed by Patrick Walters at <https://github.com/PatWalters/practical_cheminformatics_posts/tree/b4dae239a8b942dab3a41e637ac55d4491aee96f/molskill>.
+    """
+
+    with open_datamol_data_file("chembl_drugs.csv") as f:
+        data = pd.read_csv(f)
+
+    if not as_df:
+        data = from_df(data)  # annotated above as yielding List[Mol] for as_df=False
+
+    return data
+
+
+@overload
+def chembl_samples(as_df: Literal[True] = True) -> pd.DataFrame:
+    ...
+
+
+@overload
+def chembl_samples(as_df: Literal[False] = False) -> List[Mol]:
+    ...
+
+
+def chembl_samples(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]:
+    """Load ~2k molecules from ChEMBL, read from `chembl_samples.csv` into a DataFrame.
+
+    If `as_df` is false, the DataFrame is passed through `from_df` and that result is returned instead. Originally proposed by Patrick Walters at <https://github.com/PatWalters/practical_cheminformatics_posts/tree/b4dae239a8b942dab3a41e637ac55d4491aee96f/molskill>.
+    """
+
+    with open_datamol_data_file("chembl_samples.csv") as f:
+        data = pd.read_csv(f)
+
+    if not as_df:
+        data = from_df(data)  # annotated above as yielding List[Mol] for as_df=False
+
+    return data
diff --git a/datamol/data/chembl_drugs.csv b/datamol/data/chembl_drugs.csv