Update documentation

chembl · Feb 21, 2024 · bc9d7b4 · bc9d7b4
1 parent d757444
commit bc9d7b4
Show file tree

Hide file tree

Showing 11 changed files with 84 additions and 56 deletions.
diff --git a/docs/source/add_filtering_columns.rst b/docs/source/add_filtering_columns.rst
@@ -0,0 +1,7 @@
+add\_filtering\_columns module
+==============================
+
+.. automodule:: add_filtering_columns
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/arguments.rst b/docs/source/arguments.rst
@@ -0,0 +1,7 @@
+arguments module
+================
+
+.. automodule:: arguments
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/dataset.rst b/docs/source/dataset.rst
@@ -0,0 +1,7 @@
+dataset module
+==============
+
+.. automodule:: dataset
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/modules.rst b/docs/source/modules.rst
@@ -7,12 +7,15 @@ src
    add_chembl_compound_properties
    add_chembl_target_class_annotations
    add_dti_annotations
+   add_filtering_columns
    add_rdkit_compound_descriptors
+   arguments
    clean_dataset
+   dataset
    get_activity_ct_pairs
    get_dataset
    get_drug_mechanism_ct_pairs
    get_stats
    main
+   output
    sanity_checks
-   write_subsets
diff --git a/docs/source/output.rst b/docs/source/output.rst
@@ -0,0 +1,7 @@
+output module
+=============
+
+.. automodule:: output
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/write_subsets.rst b/docs/source/write_subsets.rst
diff --git a/src/add_chembl_target_class_annotations.py b/src/add_chembl_target_class_annotations.py
@@ -222,10 +222,10 @@ def add_chembl_target_class_annotations(
 
     :param dataset: Dataset with compound-target pairs.
         Will be updated to only include target class annotations.
-        dataset.target_classes_level1 will be set to
+        dataset.target_classes_level1 will be set to \
             pandas DataFrame with mapping from target id to level 1 target class
-        dataset.target_classes_level2 will be set to
-            pandas DataFrame with mapping from target id to level 2 target class
+        dataset.target_classes_level2 will be set to \
+            pandas DataFrame with mapping from target id to level 2 target class 
     :type dataset: Dataset
     :param chembl_con: Sqlite3 connection to ChEMBL database.
     :type chembl_con: sqlite3.Connection

diff --git a/src/add_dti_annotations.py b/src/add_dti_annotations.py
@@ -14,7 +14,7 @@ def add_dti_annotations(
 
     The assignment is based on three questions:
 
-    - Is the compound-target pair in the drug_mechanisms table? = 
+    - Is the compound-target pair in the drug_mechanisms table? = \
         Is it a known relevant compound-target interaction?
     - What is the max_phase of the compound? = Is it a drug / clinical compound?
     - Is the target in the drug_mechanisms table? = Is it a therapeutic target?
@@ -47,23 +47,23 @@ def add_dti_annotations(
 
     .. [#] C0_DT groups together all compounds with a max_phase not between 1 and 4. 
     
-    Since ChEMBL32 there are three possible annotations in ChEMBL 
+    Since ChEMBL32 there are three possible annotations in ChEMBL \
     with a max_phase value not between 1 and 4:
 
     - 0.5 = early phase 1 clinical trials  
-    - -1 = clinical phase unknown for drug or clinical candidate drug, 
+    - -1 = clinical phase unknown for drug or clinical candidate drug, \
             i.e., where ChEMBL cannot assign a clinical phase
     - NULL = preclinical compounds with bioactivity data
 
     All three are grouped together into the annotation C0_DT.
 
-    Compound-target pairs that were annotated with NDT, 
-    i.e., compound-target pairs that are not in the drug_mechanisms table 
-    and for which the target was also not in the drug_mechanisms table 
+    Compound-target pairs that were annotated with NDT, \
+    i.e., compound-target pairs that are not in the drug_mechanisms table \
+    and for which the target was also not in the drug_mechanisms table \
     (not a comparator compound), are discarded.
 
     :param dataset: Dataset with all relevant information:
-        - Pandas DataFrame with compound-target pairs
+        - Pandas DataFrame with compound-target pairs \
             based on activities AND drug_mechanism table
         - set of compound-target pairs in the drug_mechanism table
         - set of targets in the drug_mechanism table

diff --git a/src/add_filtering_columns.py b/src/add_filtering_columns.py
@@ -22,27 +22,28 @@ def get_data_subsets(data: pd.DataFrame, min_nof_cpds: int, desc: str) -> tuple[
     """
     Calculate and return the different subsets of interest.
 
+    - data: Pandas DataFrame with compound-target pairs \
+            without filtering columns and without \
+            the annotations for the opposite desc, \
+            e.g. if desc = "BF", the average pchembl value based on \
+            binding data only is dropped
+    - df_enough_cpds: Pandas DataFrame with targets \
+            with at least <min_nof_cpds> compounds with a pchembl value,
+    - df_c_dt_d_dt: As df_enough_cpds but with \
+            at least one compound-target pair labelled as \
+            'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT' (i.e., known interaction),
+    - df_d_dt: As df_enough_cpds but with \
+            at least one compound-target pair labelled as \
+            'D_DT' (i.e., known drug-target interaction)
+
     :param data: Pandas DataFrame with compound-target pairs
     :type data: pd.DataFrame
     :param min_nof_cpds: Minimum number of compounds per target
     :type min_nof_cpds: int
-    :param desc: Types of assays current_df contains information about.
+    :param desc: Types of assays current_df contains information about. \
         Options: "BF" (binding+functional), "B" (binding)
     :type desc: str
-    :return: List of dataset subsets and the string describing them
-        - data: Pandas DataFrame with compound-target pairs
-            without filtering columns and without
-            the annotations for the opposite desc,
-            e.g. if desc = "BF", the average pchembl value based on
-            binding data only is dropped
-        - df_enough_cpds: Pandas DataFrame with targets
-            with at least <min_nof_cpds> compounds with a pchembl value,
-        - df_c_dt_d_dt: As df_enough_cpds but with
-            at least one compound-target pair labelled as
-            'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT' (i.e., known interaction),
-        - df_d_dt: As df_enough_cpds but with
-            at least one compound-target pair labelled as
-            'D_DT' (i.e., known drug-target interaction)
+    :return: List of dataset subsets and the string describing them.
     :rtype: tuple[tuple[pd.DataFrame, str],
            tuple[pd.DataFrame, str],
            tuple[pd.DataFrame, str],
@@ -120,13 +121,14 @@ def add_subset_filtering_columns(
     """
     Add filtering column for binding + functional vs binding
 
-    :param df_combined_subset: Subset with binding+functional (BF) or binding (B) assay-based data
+    :param df_combined_subset: Subset with binding+functional (BF) \
+        or binding (B) assay-based data \
         in df_combined
     :type df_combined_subset: pd.DataFrame
-    :param dataset: Dataset with compound-target pairs.
+    :param dataset: Dataset with compound-target pairs. \
         Will be updated to only include filtering columns.
     :type dataset: Dataset
-    :param desc: Assay description,
+    :param desc: Assay description, \
         either "BF" (binding+functional) or "B" (binding)
     :type desc: str
     :param args: Arguments related to how to calculate the dataset
@@ -180,7 +182,7 @@ def add_filtering_columns(
     """
     Add filtering columns to main dataset and save subsets if required.
 
-    :param dataset: Dataset with compound-target pairs.
+    :param dataset: Dataset with compound-target pairs. \
         Will be updated to only include filtering columns.
     :type dataset: Dataset
     :param args: Arguments related to how to calculate the dataset

diff --git a/src/arguments.py b/src/arguments.py
@@ -13,12 +13,12 @@ class CalculationArgs:
     """
     Collection of arguments related to how to calculate the dataset.
 
-    chembl_version:         Version of ChEMBL for output file names
-    calculate_rdkit:        True if RDKit-based compound properties should be calculated
-    limit_to_literature:    Include only literature sources if True
-    limited_flag:           String version of limit_to_literature used in file names
-    min_nof_cpds_bf:        Minimum number of compounds per target for the BF subset
-    min_nof_cpds_b:         Minimum number of compounds per target for the B subset
+    - chembl_version:         Version of ChEMBL for output file names
+    - calculate_rdkit:        True if RDKit-based compound properties should be calculated
+    - limit_to_literature:    Include only literature sources if True
+    - limited_flag:           String version of limit_to_literature used in file names
+    - min_nof_cpds_bf:        Minimum number of compounds per target for the BF subset
+    - min_nof_cpds_b:         Minimum number of compounds per target for the B subset
     """
 
     chembl_version: str
@@ -34,13 +34,13 @@ class OutputArgs:
     """
     Collection of arguments related to how to output the dataset.
 
-    output_path:        Path to write output files to
-    delimiter:          Delimiter in csv-output
-    write_to_csv:       True if output should be written to csv
-    write_to_excel:     True if output should be written to excel
-    write_full_dataset: True if the full dataset should be written to output
-    write_bf:           True if subsets based on binding+functional data should be written to output
-    write_b:            True if subsets based on binding data only should be written to output
+    - output_path:        Path to write output files to
+    - delimiter:          Delimiter in csv-output
+    - write_to_csv:       True if output should be written to csv
+    - write_to_excel:     True if output should be written to excel
+    - write_full_dataset: True if the full dataset should be written to output
+    - write_bf:           True if subsets based on binding+functional data should be written to output
+    - write_b:            True if subsets based on binding data only should be written to output
     """
 
     output_path: str

diff --git a/src/dataset.py b/src/dataset.py
@@ -11,15 +11,17 @@
 @dataclass
 class Dataset:
     """
-    df_result:                  Pandas DataFrame with the full dataset
-    drug_mechanism_pairs_set:   Set of compound-target pairs in the drug_mechanism table,
+    Calculated compound-target pairs dataset (df_result) and related data.
+    
+    - df_result:                  Pandas DataFrame with the full dataset
+    - drug_mechanism_pairs_set:   Set of compound-target pairs in the drug_mechanism table, \
                                 used for DTI assignments
-    drug_mechanism_targets_set: Set of targets in the drug_mechanism table,
+    - drug_mechanism_targets_set: Set of targets in the drug_mechanism table, \
                                 used for DTI assignments
-    df_sizes_all:               Pandas DataFrame of intermediate sizes of the dataset,
+    - df_sizes_all:               Pandas DataFrame of intermediate sizes of the dataset, \
                                 used for debugging
-    df_sizes_pchembl:           Pandas DataFrame of intermediate sizes of the dataset,
-                                restricted to entries with a pchembl value,
+    - df_sizes_pchembl:           Pandas DataFrame of intermediate sizes of the dataset, \
+                                restricted to entries with a pchembl value, \
                                 used for debugging
     """