diff --git a/docs/release-history.rst b/docs/release-history.rst index 3da1202..b776763 100644 --- a/docs/release-history.rst +++ b/docs/release-history.rst @@ -2,6 +2,19 @@ Release History =============== +v0.6.0 (2021-06-25) +------------------- + +Enhancements +^^^^^^^^^^^^ + +* The *genomics* DataFrame Accessor no longer requires that all columns in the DataFrame are backed by a GenotypeArray + +v0.5.2 (2021-06-24) +------------------- + +* Update numpy version requirement + v0.5.1 (2021-06-23) ------------------- diff --git a/pandas_genomics/accessors/dataframe_accessor.py b/pandas_genomics/accessors/dataframe_accessor.py index fbe82af..03f2543 100644 --- a/pandas_genomics/accessors/dataframe_accessor.py +++ b/pandas_genomics/accessors/dataframe_accessor.py @@ -20,15 +20,18 @@ class GenotypeDataframeAccessor: """ def __init__(self, pandas_obj): - if not pandas_obj.dtypes.apply(lambda dt: GenotypeDtype.is_dtype(dt)).all(): - incorrect = pandas_obj.dtypes[ - ~pandas_obj.dtypes.apply(lambda dt: GenotypeDtype.is_dtype(dt)) - ] + if not pandas_obj.dtypes.apply(lambda dt: GenotypeDtype.is_dtype(dt)).any(): raise AttributeError( - f"Incompatible datatypes: all columns must be a GenotypeDtype: {incorrect}" + "Incompatible datatypes: at least one column must be a GenotypeDtype." ) - id_counts = Counter([s.genomics.variant.id for _, s in pandas_obj.iteritems()]) - if len(id_counts) < len(pandas_obj.columns): + id_counts = Counter( + [ + s.genomics.variant.id + for _, s in pandas_obj.iteritems() + if GenotypeDtype.is_dtype(s) + ] + ) + if len(id_counts) < len(pandas_obj.select_dtypes([GenotypeDtype]).columns): duplicates = [(k, v) for k, v in id_counts.items() if v >= 2] raise AttributeError( f"Duplicate Variant IDs. Column names may differ from variant IDs, but variant IDs must be unique.\n\tDuplicates: " @@ -39,37 +42,41 @@ def __init__(self, pandas_obj): ###################### # Variant Properties # ###################### + # These methods generally only return a result for each GenotypeArray column, ignoring other columns + @property def variant_info(self) -> pd.DataFrame: - """Return a DataFrame with variant info indexed by the column name""" + """Return a DataFrame with variant info indexed by the column name (one row per GenotypeArray)""" + genotypes = self._obj.select_dtypes([GenotypeDtype]) return pd.DataFrame.from_dict( { colname: series.genomics.variant_info - for colname, series in self._obj.iteritems() + for colname, series in genotypes.iteritems() }, orient="index", ) - ######################### - # Calculated Properties # - ######################### @property def maf(self): - """Return the minor allele frequency + """Return the minor allele frequency of each variant See :py:attr:`GenotypeArray.maf`""" - return self._obj.apply(lambda col: col.genomics.maf) + genotypes = self._obj.select_dtypes([GenotypeDtype]) + return genotypes.apply(lambda col: col.genomics.maf) @property def hwe_pval(self): """Return the probability that the samples are in HWE See :py:attr:`GenotypeArray.hwe_pval`""" - return self._obj.apply(lambda col: col.genomics.hwe_pval) + genotypes = self._obj.select_dtypes([GenotypeDtype]) + return genotypes.apply(lambda col: col.genomics.hwe_pval) ############ # Encoding # ############ + # These methods generally return encoded values for any GenotypeArray columns without modifying other columns + def encode_additive(self) -> pd.DataFrame: """Additive encoding of genotypes. @@ -80,7 +87,11 @@ def encode_additive(self) -> pd.DataFrame: pd.DataFrame """ return pd.concat( - [s.genomics.encode_additive() for _, s in self._obj.iteritems()], axis=1 + [ + s.genomics.encode_additive() if GenotypeDtype.is_dtype(s) else s + for _, s in self._obj.iteritems() + ], + axis=1, ) def encode_dominant(self) -> pd.DataFrame: @@ -93,7 +104,11 @@ def encode_dominant(self) -> pd.DataFrame: pd.DataFrame """ return pd.concat( - [s.genomics.encode_dominant() for _, s in self._obj.iteritems()], axis=1 + [ + s.genomics.encode_dominant() if GenotypeDtype.is_dtype(s) else s + for _, s in self._obj.iteritems() + ], + axis=1, ) def encode_recessive(self) -> pd.DataFrame: @@ -106,7 +121,11 @@ def encode_recessive(self) -> pd.DataFrame: pd.DataFrame """ return pd.concat( - [s.genomics.encode_recessive() for _, s in self._obj.iteritems()], axis=1 + [ + s.genomics.encode_recessive() if GenotypeDtype.is_dtype(s) else s + for _, s in self._obj.iteritems() + ], + axis=1, ) def encode_codominant(self) -> pd.DataFrame: @@ -119,7 +138,11 @@ def encode_codominant(self) -> pd.DataFrame: pd.DataFrame """ return pd.concat( - [s.genomics.encode_codominant() for _, s in self._obj.iteritems()], axis=1 + [ + s.genomics.encode_codominant() if GenotypeDtype.is_dtype(s) else s + for _, s in self._obj.iteritems() + ], + axis=1, ) def encode_weighted(self, encoding_info: pd.DataFrame) -> pd.DataFrame: @@ -181,6 +204,9 @@ def encode_weighted(self, encoding_info: pd.DataFrame) -> pd.DataFrame: # Process each variant results = [] for _, s in self._obj.iteritems(): + if not GenotypeDtype.is_dtype(s): + results.append(s) + continue info = encoding_info.get(s.array.variant.id, None) if info is None: warnings[ @@ -244,7 +270,7 @@ def generate_weighted_encodings( PLoS genetics 17.6 (2021): e1009534. """ return generate_weighted_encodings( - genotypes=self._obj, + genotypes=self._obj.select_dtypes([GenotypeDtype]), data=data, outcome_variable=outcome_variable, covariates=covariates, @@ -253,19 +279,24 @@ def generate_weighted_encodings( ########### # Filters # ########### + # These methods drop genotypes that fail the filter, ignoring other columns + def filter_variants_maf(self, keep_min_freq: float = 0.01) -> pd.DataFrame: """ Drop variants with a MAF less than the specified value (0.01 by default) """ - return self._obj.loc[:, self._obj.genomics.maf >= keep_min_freq] + genotypes = self._obj.select_dtypes([GenotypeDtype]) + removed = genotypes.loc[:, genotypes.genomics.maf < keep_min_freq].columns + return self._obj.drop(columns=removed) def filter_variants_hwe(self, cutoff: float = 0.05) -> pd.DataFrame: """ Drop variants with a probability of HWE less than the specified value (0.05 by default). Keep np.nan results, which occur for non-diploid variants and insufficient sample sizes """ - return self._obj.loc[ - :, - (self._obj.genomics.hwe_pval >= cutoff) - | (np.isnan(self._obj.genomics.hwe_pval)), - ] + genotypes = self._obj.select_dtypes([GenotypeDtype]) + genotype_hwe_pval = genotypes.genomics.hwe_pval + removed = genotypes.loc[ + :, (genotype_hwe_pval < cutoff) & ~np.isnan(genotype_hwe_pval) + ].columns + return self._obj.drop(columns=removed) diff --git a/pyproject.toml b/pyproject.toml index 94e0e85..8c94cf7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pandas-genomics" -version = "0.5.2" +version = "0.6.0" description = "Pandas ExtensionDtypes and ExtensionArray for working with genomics data" license = "BSD-3-Clause" authors = ["John McGuigan "] diff --git a/tests/genotype_array/conftest.py b/tests/genotype_array/conftest.py index dca24d4..5cba63c 100644 --- a/tests/genotype_array/conftest.py +++ b/tests/genotype_array/conftest.py @@ -222,7 +222,10 @@ def __get_data_for_encoding(): def genotypearray_df(): DATA_DIR = Path(__file__).parent.parent / "data" / "plink" input = DATA_DIR / "plink_test_small" - return io.from_plink(input, max_variants=20, swap_alleles=True) + df = io.from_plink(input, max_variants=20, swap_alleles=True) + df["num"] = [1.0 for n in range(len(df))] + df["bool"] = [True if n % 3 == 0 else False for n in range(len(df))] + return df @pytest.fixture diff --git a/tests/genotype_array/test_GenotypeArrayAccessors.py b/tests/genotype_array/test_GenotypeArrayAccessors.py index 122a776..5b1feb0 100644 --- a/tests/genotype_array/test_GenotypeArrayAccessors.py +++ b/tests/genotype_array/test_GenotypeArrayAccessors.py @@ -1,7 +1,7 @@ """ Test GenotypeArray Accessors """ - +import numpy as np import pandas as pd import pytest from pandas._testing import ( @@ -28,7 +28,8 @@ def test_maf(data): ) for colname in "ABC": df[colname].genomics.variant.id = colname - expected = pd.Series({n: data.maf for n in "ABC"}) + df["D"] = np.ones(len(data)) + expected = pd.Series({"A": data.maf, "B": data.maf, "C": data.maf}) assert_series_equal(df.genomics.maf, expected) @@ -37,23 +38,24 @@ def test_hwe(data): @pytest.mark.parametrize( - "filter_value, num_vars_left", [(None, 15), (0.05, 1), (0.10, 0)] + "filter_value, num_cols_left", [(None, 17), (0.05, 3), (0.10, 2)] ) -def test_filter_maf(genotypearray_df, filter_value, num_vars_left): +def test_filter_maf(genotypearray_df, filter_value, num_cols_left): if filter_value is None: result = genotypearray_df.genomics.filter_variants_maf() else: result = genotypearray_df.genomics.filter_variants_maf(filter_value) - assert len(result.columns) == num_vars_left + assert len(result.columns) == num_cols_left @pytest.mark.parametrize( - "filter_value, num_vars_left", [(None, 1), (0.05, 1), (1e-300, 2)] + "filter_value, num_cols_left", [(None, 1), (0.05, 1), (1e-300, 2)] ) -def test_filter_hwe(ga_inhwe, ga_nothwe, filter_value, num_vars_left): +def test_filter_hwe(ga_inhwe, ga_nothwe, filter_value, num_cols_left): data = pd.DataFrame({"yes": ga_inhwe, "no": ga_nothwe}) + data["num"] = [n for n in range(len(data))] if filter_value is None: result = data.genomics.filter_variants_hwe() else: result = data.genomics.filter_variants_hwe(filter_value) - assert len(result.columns) == num_vars_left + assert len(result.columns) == num_cols_left + 1 diff --git a/tests/genotype_array/test_GenotypeArrayEncoding.py b/tests/genotype_array/test_GenotypeArrayEncoding.py index 39636e1..5f10ec7 100644 --- a/tests/genotype_array/test_GenotypeArrayEncoding.py +++ b/tests/genotype_array/test_GenotypeArrayEncoding.py @@ -56,11 +56,19 @@ def test_encoding_additive(data_for_encoding): expected = pd.Series(result) result_series = pd.Series(data_for_encoding()).genomics.encode_additive() assert_series_equal(result_series, expected) - # Test using DataFrame accessor + # Test using DataFrame accessor with extra col df = pd.DataFrame.from_dict( {n: data_for_encoding() for n in "ABC"}, orient="columns" ) - expected = pd.DataFrame.from_dict({n: result_series for n in "ABC"}) + df["float"] = np.ones(len(df)) + expected = pd.DataFrame.from_dict( + { + "A": result_series, + "B": result_series, + "C": result_series, + "float": df["float"], + } + ) result_df = df.genomics.encode_additive() assert_frame_equal(result_df, expected) @@ -74,11 +82,19 @@ def test_encoding_dominant(data_for_encoding): expected = pd.Series(result) result_series = pd.Series(data_for_encoding()).genomics.encode_dominant() assert_series_equal(result_series, expected) - # Test using DataFrame accessor + # Test using DataFrame accessor with extra col df = pd.DataFrame.from_dict( {n: data_for_encoding() for n in "ABC"}, orient="columns" ) - expected = pd.DataFrame.from_dict({n: result_series for n in "ABC"}) + df["float"] = np.ones(len(df)) + expected = pd.DataFrame.from_dict( + { + "A": result_series, + "B": result_series, + "C": result_series, + "float": df["float"], + } + ) result_df = df.genomics.encode_dominant() assert_frame_equal(result_df, expected) @@ -92,11 +108,19 @@ def test_encoding_recessive(data_for_encoding): expected = pd.Series(result) result_series = pd.Series(data_for_encoding()).genomics.encode_recessive() assert_series_equal(result_series, expected) - # Test using DataFrame accessor + # Test using DataFrame accessor with extra col df = pd.DataFrame.from_dict( {n: data_for_encoding() for n in "ABC"}, orient="columns" ) - expected = pd.DataFrame.from_dict({n: result_series for n in "ABC"}) + df["float"] = np.ones(len(df)) + expected = pd.DataFrame.from_dict( + { + "A": result_series, + "B": result_series, + "C": result_series, + "float": df["float"], + } + ) result_df = df.genomics.encode_recessive() assert_frame_equal(result_df, expected) @@ -114,11 +138,19 @@ def test_encoding_codominant(data_for_encoding): expected = pd.Series(result) result_series = pd.Series(data_for_encoding()).genomics.encode_codominant() assert_series_equal(result_series, expected) - # Test using DataFrame accessor + # Test using DataFrame accessor with extra col df = pd.DataFrame.from_dict( {n: data_for_encoding() for n in "ABC"}, orient="columns" ) - expected = pd.DataFrame.from_dict({n: result_series for n in "ABC"}) + df["float"] = np.ones(len(df)) + expected = pd.DataFrame.from_dict( + { + "A": result_series, + "B": result_series, + "C": result_series, + "float": df["float"], + } + ) result_df = df.genomics.encode_codominant() assert_frame_equal(result_df, expected) @@ -171,6 +203,7 @@ def test_encoding_weighted( "var2": [0.0, 0.3, 1.0, None, None], "var3": [0.0, 0.4, 1.0, None, None], "var4": [0.0, 0.5, 1.0, None, None], + "num": [1.0, 1.0, 1.0, 1.0, 1.0], }, dtype="Float64", ), @@ -191,6 +224,7 @@ def test_encoding_weighted( "var0": [0.0, 0.1, 1.0, None, None], "var1": [0.0, 0.2, 1.0, None, None], "var4": [1.0, 0.5, 0.0, None, None], + "num": [1.0, 1.0, 1.0, 1.0, 1.0], }, dtype="Float64", ), @@ -198,7 +232,9 @@ def test_encoding_weighted( ], ) def test_encoding_weighted_df(encoding_df, encoding_info, expected): - result = encoding_df.genomics.encode_weighted(encoding_info) + df = encoding_df.copy() + df["num"] = pd.Series(np.ones(len(df))).astype("Float64") + result = df.genomics.encode_weighted(encoding_info) assert_frame_equal(expected, result)