Merge pull request #22 from jrm5100/master
DataFrame Accessor doesn't require all-Genotype columns
jrm5100 authored Jun 25, 2021
2 parents 56de679 + 4889960 commit d63e63d
Showing 6 changed files with 130 additions and 45 deletions.
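
In short, after this change the .genomics DataFrame accessor only requires at least one GenotypeArray-backed column: non-genotype columns are skipped by the variant properties, passed through unchanged by the encoders, and kept by the filters. A minimal sketch of the new behavior (the PLINK path and the "scores" column name are illustrative, mirroring the test fixtures in this diff):

    import numpy as np
    from pandas_genomics import io

    # Load genotypes from a PLINK fileset (illustrative path, as in the test fixture)
    df = io.from_plink("plink_test_small", max_variants=20, swap_alleles=True)
    df["scores"] = np.ones(len(df))  # plain float column, not backed by a GenotypeArray

    encoded = df.genomics.encode_additive()           # genotype columns encoded, "scores" passed through
    maf_per_variant = df.genomics.maf                 # one value per GenotypeArray column only
    filtered = df.genomics.filter_variants_maf(0.05)  # drops low-MAF variants, keeps "scores"

Previously the accessor raised AttributeError unless every column was backed by a GenotypeDtype.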
13 changes: 13 additions & 0 deletions docs/release-history.rst
@@ -2,6 +2,19 @@
 Release History
 ===============
 
+v0.6.0 (2021-06-25)
+-------------------
+
+Enhancements
+^^^^^^^^^^^^
+
+* The *genomics* DataFrame Accessor no longer requires that all columns in the DataFrame are backed by a GenotypeArray
+
+v0.5.2 (2021-06-24)
+-------------------
+
+* Update numpy version requirement
+
 v0.5.1 (2021-06-23)
 -------------------
 
83 changes: 57 additions & 26 deletions pandas_genomics/accessors/dataframe_accessor.py
@@ -20,15 +20,18 @@ class GenotypeDataframeAccessor:
     """
 
     def __init__(self, pandas_obj):
-        if not pandas_obj.dtypes.apply(lambda dt: GenotypeDtype.is_dtype(dt)).all():
-            incorrect = pandas_obj.dtypes[
-                ~pandas_obj.dtypes.apply(lambda dt: GenotypeDtype.is_dtype(dt))
-            ]
+        if not pandas_obj.dtypes.apply(lambda dt: GenotypeDtype.is_dtype(dt)).any():
             raise AttributeError(
-                f"Incompatible datatypes: all columns must be a GenotypeDtype: {incorrect}"
+                "Incompatible datatypes: at least one column must be a GenotypeDtype."
             )
-        id_counts = Counter([s.genomics.variant.id for _, s in pandas_obj.iteritems()])
-        if len(id_counts) < len(pandas_obj.columns):
+        id_counts = Counter(
+            [
+                s.genomics.variant.id
+                for _, s in pandas_obj.iteritems()
+                if GenotypeDtype.is_dtype(s)
+            ]
+        )
+        if len(id_counts) < len(pandas_obj.select_dtypes([GenotypeDtype]).columns):
             duplicates = [(k, v) for k, v in id_counts.items() if v >= 2]
             raise AttributeError(
                 f"Duplicate Variant IDs. Column names may differ from variant IDs, but variant IDs must be unique.\n\tDuplicates: "
@@ -39,37 +42,41 @@ def __init__(self, pandas_obj):
     ######################
     # Variant Properties #
     ######################
+    # These methods generally only return a result for each GenotypeArray column, ignoring other columns
 
     @property
     def variant_info(self) -> pd.DataFrame:
-        """Return a DataFrame with variant info indexed by the column name"""
+        """Return a DataFrame with variant info indexed by the column name (one row per GenotypeArray)"""
+        genotypes = self._obj.select_dtypes([GenotypeDtype])
         return pd.DataFrame.from_dict(
             {
                 colname: series.genomics.variant_info
-                for colname, series in self._obj.iteritems()
+                for colname, series in genotypes.iteritems()
             },
             orient="index",
         )
 
     #########################
     # Calculated Properties #
     #########################
     @property
     def maf(self):
-        """Return the minor allele frequency
+        """Return the minor allele frequency of each variant
         See :py:attr:`GenotypeArray.maf`"""
-        return self._obj.apply(lambda col: col.genomics.maf)
+        genotypes = self._obj.select_dtypes([GenotypeDtype])
+        return genotypes.apply(lambda col: col.genomics.maf)
 
     @property
     def hwe_pval(self):
         """Return the probability that the samples are in HWE
         See :py:attr:`GenotypeArray.hwe_pval`"""
-        return self._obj.apply(lambda col: col.genomics.hwe_pval)
+        genotypes = self._obj.select_dtypes([GenotypeDtype])
+        return genotypes.apply(lambda col: col.genomics.hwe_pval)
 
     ############
     # Encoding #
     ############
+    # These methods generally return encoded values for any GenotypeArray columns without modifying other columns
 
     def encode_additive(self) -> pd.DataFrame:
         """Additive encoding of genotypes.
@@ -80,7 +87,11 @@
         pd.DataFrame
         """
         return pd.concat(
-            [s.genomics.encode_additive() for _, s in self._obj.iteritems()], axis=1
+            [
+                s.genomics.encode_additive() if GenotypeDtype.is_dtype(s) else s
+                for _, s in self._obj.iteritems()
+            ],
+            axis=1,
         )
 
     def encode_dominant(self) -> pd.DataFrame:
@@ -93,7 +104,11 @@ def encode_dominant(self) -> pd.DataFrame:
         pd.DataFrame
         """
         return pd.concat(
-            [s.genomics.encode_dominant() for _, s in self._obj.iteritems()], axis=1
+            [
+                s.genomics.encode_dominant() if GenotypeDtype.is_dtype(s) else s
+                for _, s in self._obj.iteritems()
+            ],
+            axis=1,
         )
 
     def encode_recessive(self) -> pd.DataFrame:
@@ -106,7 +121,11 @@ def encode_recessive(self) -> pd.DataFrame:
         pd.DataFrame
         """
         return pd.concat(
-            [s.genomics.encode_recessive() for _, s in self._obj.iteritems()], axis=1
+            [
+                s.genomics.encode_recessive() if GenotypeDtype.is_dtype(s) else s
+                for _, s in self._obj.iteritems()
+            ],
+            axis=1,
        )
 
     def encode_codominant(self) -> pd.DataFrame:
@@ -119,7 +138,11 @@ def encode_codominant(self) -> pd.DataFrame:
         pd.DataFrame
         """
         return pd.concat(
-            [s.genomics.encode_codominant() for _, s in self._obj.iteritems()], axis=1
+            [
+                s.genomics.encode_codominant() if GenotypeDtype.is_dtype(s) else s
+                for _, s in self._obj.iteritems()
+            ],
+            axis=1,
         )
 
     def encode_weighted(self, encoding_info: pd.DataFrame) -> pd.DataFrame:
@@ -181,6 +204,9 @@ def encode_weighted(self, encoding_info: pd.DataFrame) -> pd.DataFrame:
         # Process each variant
         results = []
         for _, s in self._obj.iteritems():
+            if not GenotypeDtype.is_dtype(s):
+                results.append(s)
+                continue
             info = encoding_info.get(s.array.variant.id, None)
             if info is None:
                 warnings[
@@ -244,7 +270,7 @@ def generate_weighted_encodings(
         PLoS genetics 17.6 (2021): e1009534.
         """
         return generate_weighted_encodings(
-            genotypes=self._obj,
+            genotypes=self._obj.select_dtypes([GenotypeDtype]),
             data=data,
             outcome_variable=outcome_variable,
             covariates=covariates,
@@ -253,19 +279,24 @@
     ###########
     # Filters #
     ###########
+    # These methods drop genotypes that fail the filter, ignoring other columns
 
     def filter_variants_maf(self, keep_min_freq: float = 0.01) -> pd.DataFrame:
         """
         Drop variants with a MAF less than the specified value (0.01 by default)
         """
-        return self._obj.loc[:, self._obj.genomics.maf >= keep_min_freq]
+        genotypes = self._obj.select_dtypes([GenotypeDtype])
+        removed = genotypes.loc[:, genotypes.genomics.maf < keep_min_freq].columns
+        return self._obj.drop(columns=removed)
 
     def filter_variants_hwe(self, cutoff: float = 0.05) -> pd.DataFrame:
         """
         Drop variants with a probability of HWE less than the specified value (0.05 by default).
         Keep np.nan results, which occur for non-diploid variants and insufficient sample sizes
         """
-        return self._obj.loc[
-            :,
-            (self._obj.genomics.hwe_pval >= cutoff)
-            | (np.isnan(self._obj.genomics.hwe_pval)),
-        ]
+        genotypes = self._obj.select_dtypes([GenotypeDtype])
+        genotype_hwe_pval = genotypes.genomics.hwe_pval
+        removed = genotypes.loc[
+            :, (genotype_hwe_pval < cutoff) & ~np.isnan(genotype_hwe_pval)
+        ].columns
+        return self._obj.drop(columns=removed)
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pandas-genomics"
-version = "0.5.2"
+version = "0.6.0"
 description = "Pandas ExtensionDtypes and ExtensionArray for working with genomics data"
 license = "BSD-3-Clause"
 authors = ["John McGuigan <jrm5100@psu.edu>"]
5 changes: 4 additions & 1 deletion tests/genotype_array/conftest.py
@@ -222,7 +222,10 @@ def __get_data_for_encoding():
 def genotypearray_df():
     DATA_DIR = Path(__file__).parent.parent / "data" / "plink"
     input = DATA_DIR / "plink_test_small"
-    return io.from_plink(input, max_variants=20, swap_alleles=True)
+    df = io.from_plink(input, max_variants=20, swap_alleles=True)
+    df["num"] = [1.0 for n in range(len(df))]
+    df["bool"] = [True if n % 3 == 0 else False for n in range(len(df))]
+    return df
 
 
 @pytest.fixture
18 changes: 10 additions & 8 deletions tests/genotype_array/test_GenotypeArrayAccessors.py
@@ -1,7 +1,7 @@
 """
 Test GenotypeArray Accessors
 """
-
+import numpy as np
 import pandas as pd
 import pytest
 from pandas._testing import (
@@ -28,7 +28,8 @@ def test_maf(data):
     )
     for colname in "ABC":
         df[colname].genomics.variant.id = colname
-    expected = pd.Series({n: data.maf for n in "ABC"})
+    df["D"] = np.ones(len(data))
+    expected = pd.Series({"A": data.maf, "B": data.maf, "C": data.maf})
     assert_series_equal(df.genomics.maf, expected)
 
 
@@ -37,23 +38,24 @@ def test_hwe(data):
 
 
 @pytest.mark.parametrize(
-    "filter_value, num_vars_left", [(None, 15), (0.05, 1), (0.10, 0)]
+    "filter_value, num_cols_left", [(None, 17), (0.05, 3), (0.10, 2)]
 )
-def test_filter_maf(genotypearray_df, filter_value, num_vars_left):
+def test_filter_maf(genotypearray_df, filter_value, num_cols_left):
     if filter_value is None:
         result = genotypearray_df.genomics.filter_variants_maf()
     else:
         result = genotypearray_df.genomics.filter_variants_maf(filter_value)
-    assert len(result.columns) == num_vars_left
+    assert len(result.columns) == num_cols_left
 
 
 @pytest.mark.parametrize(
-    "filter_value, num_vars_left", [(None, 1), (0.05, 1), (1e-300, 2)]
+    "filter_value, num_cols_left", [(None, 1), (0.05, 1), (1e-300, 2)]
 )
-def test_filter_hwe(ga_inhwe, ga_nothwe, filter_value, num_vars_left):
+def test_filter_hwe(ga_inhwe, ga_nothwe, filter_value, num_cols_left):
     data = pd.DataFrame({"yes": ga_inhwe, "no": ga_nothwe})
+    data["num"] = [n for n in range(len(data))]
     if filter_value is None:
         result = data.genomics.filter_variants_hwe()
     else:
         result = data.genomics.filter_variants_hwe(filter_value)
-    assert len(result.columns) == num_vars_left
+    assert len(result.columns) == num_cols_left + 1
54 changes: 45 additions & 9 deletions tests/genotype_array/test_GenotypeArrayEncoding.py
@@ -56,11 +56,19 @@ def test_encoding_additive(data_for_encoding):
     expected = pd.Series(result)
     result_series = pd.Series(data_for_encoding()).genomics.encode_additive()
     assert_series_equal(result_series, expected)
-    # Test using DataFrame accessor
+    # Test using DataFrame accessor with extra col
     df = pd.DataFrame.from_dict(
         {n: data_for_encoding() for n in "ABC"}, orient="columns"
     )
-    expected = pd.DataFrame.from_dict({n: result_series for n in "ABC"})
+    df["float"] = np.ones(len(df))
+    expected = pd.DataFrame.from_dict(
+        {
+            "A": result_series,
+            "B": result_series,
+            "C": result_series,
+            "float": df["float"],
+        }
+    )
     result_df = df.genomics.encode_additive()
     assert_frame_equal(result_df, expected)
 
@@ -74,11 +82,19 @@ def test_encoding_dominant(data_for_encoding):
     expected = pd.Series(result)
     result_series = pd.Series(data_for_encoding()).genomics.encode_dominant()
     assert_series_equal(result_series, expected)
-    # Test using DataFrame accessor
+    # Test using DataFrame accessor with extra col
     df = pd.DataFrame.from_dict(
         {n: data_for_encoding() for n in "ABC"}, orient="columns"
     )
-    expected = pd.DataFrame.from_dict({n: result_series for n in "ABC"})
+    df["float"] = np.ones(len(df))
+    expected = pd.DataFrame.from_dict(
+        {
+            "A": result_series,
+            "B": result_series,
+            "C": result_series,
+            "float": df["float"],
+        }
+    )
     result_df = df.genomics.encode_dominant()
     assert_frame_equal(result_df, expected)
 
@@ -92,11 +108,19 @@ def test_encoding_recessive(data_for_encoding):
     expected = pd.Series(result)
     result_series = pd.Series(data_for_encoding()).genomics.encode_recessive()
     assert_series_equal(result_series, expected)
-    # Test using DataFrame accessor
+    # Test using DataFrame accessor with extra col
     df = pd.DataFrame.from_dict(
         {n: data_for_encoding() for n in "ABC"}, orient="columns"
     )
-    expected = pd.DataFrame.from_dict({n: result_series for n in "ABC"})
+    df["float"] = np.ones(len(df))
+    expected = pd.DataFrame.from_dict(
+        {
+            "A": result_series,
+            "B": result_series,
+            "C": result_series,
+            "float": df["float"],
+        }
+    )
     result_df = df.genomics.encode_recessive()
     assert_frame_equal(result_df, expected)
 
@@ -114,11 +138,19 @@ def test_encoding_codominant(data_for_encoding):
     expected = pd.Series(result)
     result_series = pd.Series(data_for_encoding()).genomics.encode_codominant()
     assert_series_equal(result_series, expected)
-    # Test using DataFrame accessor
+    # Test using DataFrame accessor with extra col
     df = pd.DataFrame.from_dict(
         {n: data_for_encoding() for n in "ABC"}, orient="columns"
     )
-    expected = pd.DataFrame.from_dict({n: result_series for n in "ABC"})
+    df["float"] = np.ones(len(df))
+    expected = pd.DataFrame.from_dict(
+        {
+            "A": result_series,
+            "B": result_series,
+            "C": result_series,
+            "float": df["float"],
+        }
+    )
     result_df = df.genomics.encode_codominant()
     assert_frame_equal(result_df, expected)
 
@@ -171,6 +203,7 @@ def test_encoding_weighted(
                     "var2": [0.0, 0.3, 1.0, None, None],
                     "var3": [0.0, 0.4, 1.0, None, None],
                     "var4": [0.0, 0.5, 1.0, None, None],
+                    "num": [1.0, 1.0, 1.0, 1.0, 1.0],
                 },
                 dtype="Float64",
             ),
@@ -191,14 +224,17 @@
                     "var0": [0.0, 0.1, 1.0, None, None],
                     "var1": [0.0, 0.2, 1.0, None, None],
                     "var4": [1.0, 0.5, 0.0, None, None],
+                    "num": [1.0, 1.0, 1.0, 1.0, 1.0],
                 },
                 dtype="Float64",
             ),
         ),
     ],
 )
 def test_encoding_weighted_df(encoding_df, encoding_info, expected):
-    result = encoding_df.genomics.encode_weighted(encoding_info)
+    df = encoding_df.copy()
+    df["num"] = pd.Series(np.ones(len(df))).astype("Float64")
+    result = df.genomics.encode_weighted(encoding_info)
     assert_frame_equal(expected, result)
 
 
