diff --git a/docs/source/add_filtering_columns.rst b/docs/source/add_filtering_columns.rst new file mode 100644 index 0000000..ad30059 --- /dev/null +++ b/docs/source/add_filtering_columns.rst @@ -0,0 +1,7 @@ +add\_filtering\_columns module +============================== + +.. automodule:: add_filtering_columns + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/arguments.rst b/docs/source/arguments.rst new file mode 100644 index 0000000..293fa9f --- /dev/null +++ b/docs/source/arguments.rst @@ -0,0 +1,7 @@ +arguments module +================ + +.. automodule:: arguments + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/dataset.rst b/docs/source/dataset.rst new file mode 100644 index 0000000..3046110 --- /dev/null +++ b/docs/source/dataset.rst @@ -0,0 +1,7 @@ +dataset module +============== + +.. automodule:: dataset + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/modules.rst b/docs/source/modules.rst index a20c34e..aa35a83 100644 --- a/docs/source/modules.rst +++ b/docs/source/modules.rst @@ -7,12 +7,15 @@ src add_chembl_compound_properties add_chembl_target_class_annotations add_dti_annotations + add_filtering_columns add_rdkit_compound_descriptors + arguments clean_dataset + dataset get_activity_ct_pairs get_dataset get_drug_mechanism_ct_pairs get_stats main + output sanity_checks - write_subsets diff --git a/docs/source/output.rst b/docs/source/output.rst new file mode 100644 index 0000000..31e8c0d --- /dev/null +++ b/docs/source/output.rst @@ -0,0 +1,7 @@ +output module +============= + +.. automodule:: output + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/write_subsets.rst b/docs/source/write_subsets.rst deleted file mode 100644 index 37ac511..0000000 --- a/docs/source/write_subsets.rst +++ /dev/null @@ -1,7 +0,0 @@ -write\_subsets module -===================== - -.. 
automodule:: write_subsets - :members: - :undoc-members: - :show-inheritance: diff --git a/src/add_chembl_target_class_annotations.py b/src/add_chembl_target_class_annotations.py index bb8d080..8248318 100644 --- a/src/add_chembl_target_class_annotations.py +++ b/src/add_chembl_target_class_annotations.py @@ -222,10 +222,10 @@ def add_chembl_target_class_annotations( :param dataset: Dataset with compound-target pairs. Will be updated to only include target class annotations. - dataset.target_classes_level1 will be set to + dataset.target_classes_level1 will be set to \ pandas DataFrame with mapping from target id to level 1 target class - dataset.target_classes_level2 will be set to - pandas DataFrame with mapping from target id to level 2 target class + dataset.target_classes_level2 will be set to \ + pandas DataFrame with mapping from target id to level 2 target class :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection diff --git a/src/add_dti_annotations.py b/src/add_dti_annotations.py index 94367e9..11eeb0e 100644 --- a/src/add_dti_annotations.py +++ b/src/add_dti_annotations.py @@ -14,7 +14,7 @@ def add_dti_annotations( The assignment is based on three questions: - - Is the compound-target pair in the drug_mechanisms table? = + - Is the compound-target pair in the drug_mechanisms table? = \ Is it a known relevant compound-target interaction? - What is the max_phase of the compound? = Is it a drug / clinical compound? - Is the target in the drug_mechanisms table = Is it a therapeutic target? @@ -47,23 +47,23 @@ def add_dti_annotations( .. [#] C0_DT groups together all compounds with a max_phase not between 1 and 4. 
- Since ChEMBL32 there are three possible annotations in ChEMBL + Since ChEMBL32 there are three possible annotations in ChEMBL \ with a max_phase value not between 1 and 4: - 0.5 = early phase 1 clinical trials - - -1 = clinical phase unknown for drug or clinical candidate drug, + - -1 = clinical phase unknown for drug or clinical candidate drug, \ i.e., where ChEMBL cannot assign a clinical phase - NULL = preclinical compounds with bioactivity data All three are grouped together into the annotation C0_DT. - Compound-target pairs that were annotated with NDT, - i.e., compound-target pairs that are not in the drug_mechanisms table - and for which the target was also not in the drug_mechanisms table + Compound-target pairs that were annotated with NDT, \ + i.e., compound-target pairs that are not in the drug_mechanisms table \ + and for which the target was also not in the drug_mechanisms table \ (not a comparator compound), are discarded. :param dataset: Dataset with all relevant information: - - Pandas DataFrame with compound-target pairs + - Pandas DataFrame with compound-target pairs \ based on activities AND drug_mechanism table - set of compound-target pairs in the drug_mechanism table - set of targets in the drug_mechanism table diff --git a/src/add_filtering_columns.py b/src/add_filtering_columns.py index 27d4076..9833779 100644 --- a/src/add_filtering_columns.py +++ b/src/add_filtering_columns.py @@ -22,27 +22,28 @@ def get_data_subsets(data: pd.DataFrame, min_nof_cpds: int, desc: str) -> tuple[ """ Calculate and return the different subsets of interest. + - data: Pandas DataFrame with compound-target pairs \ + without filtering columns and without \ + the annotations for the opposite desc, \ + e.g. 
if desc = "BF", the average pchembl value based on \ + binding data only is dropped + - df_enough_cpds: Pandas DataFrame with targets \ + with at least min_nof_cpds compounds with a pchembl value, + - df_c_dt_d_dt: As df_enough_cpds but with \ + at least one compound-target pair labelled as \ + 'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT' (i.e., known interaction), + - df_d_dt: As df_enough_cpds but with \ + at least one compound-target pair labelled as \ + 'D_DT' (i.e., known drug-target interaction) + :param data: Pandas DataFrame with compound-target pairs :type data: pd.DataFrame :param min_nof_cpds: Miminum number of compounds per target :type min_nof_cpds: int - :param desc: Types of assays current_df contains information about. + :param desc: Types of assays current_df contains information about. \ Options: "BF" (binding+functional), "B" (binding) :type desc: str - :return: List of dataset subsets and the string describing them - - data: Pandas DataFrame with compound-target pairs - without filtering columns and without - the annotations for the opposite desc, - e.g. if desc = "BF", the average pchembl value based on - binding data only is dropped - - df_enough_cpds: Pandas DataFrame with targets - with at least compounds with a pchembl value, - - df_c_dt_d_dt: As df_enough_cpds but with - at least one compound-target pair labelled as - 'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT' (i.e., known interaction), - - df_d_dt: As df_enough_cpds but with - at least one compound-target pair labelled as - 'D_DT' (i.e., known drug-target interaction) + :return: List of dataset subsets and the string describing them. 
:rtype: tuple[tuple[pd.DataFrame, str], tuple[pd.DataFrame, str], tuple[pd.DataFrame, str], @@ -120,13 +121,14 @@ def add_subset_filtering_columns( """ Add filtering column for binding + functional vs binding - :param df_combined_subset: Subset with binding+functional (BF) or binding (B) assay-based data + :param df_combined_subset: Subset with binding+functional (BF) \ + or binding (B) assay-based data \ in df_combined :type df_combined_subset: pd.DataFrame - :param dataset: Dataset with compound-target pairs. + :param dataset: Dataset with compound-target pairs. \ Will be updated to only include filtering columns. :type dataset: Dataset - :param desc: Assay description, + :param desc: Assay description, \ either "BF" (binding+functional) or "B" (binding) :type desc: str :param args: Arguments related to how to calculate the dataset @@ -180,7 +182,7 @@ def add_filtering_columns( """ Add filtering columns to main dataset and save subsets if required. - :param dataset: Dataset with compound-target pairs. + :param dataset: Dataset with compound-target pairs. \ Will be updated to only include filtering columns. :type dataset: Dataset :param args: Arguments related to how to calculate the dataset diff --git a/src/arguments.py b/src/arguments.py index ea02154..30cf7a4 100644 --- a/src/arguments.py +++ b/src/arguments.py @@ -13,12 +13,12 @@ class CalculationArgs: """ Collection of arguments related to how to calculate the dataset. 
- chembl_version: Version of ChEMBL for output file names - calculate_rdkit: True if RDKit-based compound properties should be calculated - limit_to_literature: Include only literature sources if True - limited_flag: String version of limit_to_literature used in file names - min_nof_cpds_bf: Minimum number of compounds per target for the BF subset - min_nof_cpds_b: Minimum number of compounds per target for the B subset + - chembl_version: Version of ChEMBL for output file names + - calculate_rdkit: True if RDKit-based compound properties should be calculated + - limit_to_literature: Include only literature sources if True + - limited_flag: String version of limit_to_literature used in file names + - min_nof_cpds_bf: Minimum number of compounds per target for the BF subset + - min_nof_cpds_b: Minimum number of compounds per target for the B subset """ chembl_version: str @@ -34,13 +34,13 @@ class OutputArgs: """ Collection of arguments related to how to output the dataset. - output_path: Path to write output files to - delimiter: Delimiter in csv-output - write_to_csv: True if output should be written to csv - write_to_excel: True if output should be written to excel - write_full_dataset: True if the full dataset should be written to output - write_bf: True if subsets based on binding+functional data should be written to output - write_b: True if subsets based on binding data only should be written to output + - output_path: Path to write output files to + - delimiter: Delimiter in csv-output + - write_to_csv: True if output should be written to csv + - write_to_excel: True if output should be written to excel + - write_full_dataset: True if the full dataset should be written to output + - write_bf: True if subsets based on binding+functional data should be written to output + - write_b: True if subsets based on binding data only should be written to output """ output_path: str diff --git a/src/dataset.py b/src/dataset.py index 19fd259..33b180c 100644 --- 
a/src/dataset.py +++ b/src/dataset.py @@ -11,15 +11,17 @@ @dataclass class Dataset: """ - df_result: Pandas DataFrame with the full dataset - drug_mechanism_pairs_set: Set of compound-target pairs in the drug_mechanism table, + Calculated compound-target pairs dataset (df_result) and related data. + + - df_result: Pandas DataFrame with the full dataset + - drug_mechanism_pairs_set: Set of compound-target pairs in the drug_mechanism table, \ used for DTI assignments - drug_mechanism_targets_set: Set of targets in the drug_mechanism table, + - drug_mechanism_targets_set: Set of targets in the drug_mechanism table, \ used for DTI assigments - df_sizes_all: Pandas DataFrame of intermediate sizes of the dataset, + - df_sizes_all: Pandas DataFrame of intermediate sizes of the dataset, \ used for debugging - df_sizes_pchembl: Pandas DataFrame of intermediate sizes of the dataset, - restricted to entries with a pchembl value, + - df_sizes_pchembl: Pandas DataFrame of intermediate sizes of the dataset, \ + restricted to entries with a pchembl value, \ used for debugging """