Skip to content

Use sample_* variables if present to populate plink family fields when writing #1019

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 18 additions & 13 deletions sgkit/io/plink/plink_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@
BIM_DF_DTYPE = dict([(f[0], f[1]) for f in BIM_FIELDS])
BIM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in BIM_FIELDS])

INT_MISSING = -1
STR_MISSING = "."


class BedReader(object):
def __init__(
Expand Down Expand Up @@ -81,8 +84,8 @@ def __getitem__(self, idx: Tuple[Any, ...]) -> NDArray:
# (missing would then be NaN)
arr = arr.astype(self.dtype)
# Add a ploidy dimension, so allele counts of 0, 1, 2 correspond to 00, 10, 11
call0 = np.where(arr < 0, -1, np.where(arr == 0, 0, 1))
call1 = np.where(arr < 0, -1, np.where(arr == 2, 1, 0))
call0 = np.where(arr < 0, INT_MISSING, np.where(arr == 0, 0, 1))
call1 = np.where(arr < 0, INT_MISSING, np.where(arr == 2, 1, 0))
arr = np.stack([call0, call1], axis=-1)
# Apply final slice to 3D result
return arr[:, :, idx[-1]]
Expand All @@ -104,10 +107,12 @@ def coerce_code(v: dd.Series, codes: List[int]) -> dd.Series:
# Set non-ints and unexpected codes to missing (-1)
v = dd.to_numeric(v, errors="coerce")
v = v.where(v.isin(codes), np.nan)
return v.fillna(-1).astype("int8")
return v.fillna(INT_MISSING).astype("int8")

df["paternal_id"] = df["paternal_id"].where(df["paternal_id"] != "0", None)
df["maternal_id"] = df["maternal_id"].where(df["maternal_id"] != "0", None)
# replace fam "0" with sgkit missing values (".")
df["family_id"] = df["family_id"].where(df["family_id"] != "0", STR_MISSING)
df["paternal_id"] = df["paternal_id"].where(df["paternal_id"] != "0", STR_MISSING)
df["maternal_id"] = df["maternal_id"].where(df["maternal_id"] != "0", STR_MISSING)
df["sex"] = coerce_code(df["sex"], [1, 2])
df["phenotype"] = coerce_code(df["phenotype"], [1, 2])

Expand Down Expand Up @@ -208,16 +213,18 @@ def read_plink(

The following pedigree-specific fields are also included:

- ``sample_family_id``: Family identifier commonly referred to as FID
- ``sample_id``: Within-family identifier for sample
- ``sample_paternal_id``: Within-family identifier for father of sample
- ``sample_maternal_id``: Within-family identifier for mother of sample
- ``sample_family_id``: Family identifier commonly referred to as FID,
"." for missing
- ``sample_member_id`` and ``sample_id``: Within-family identifier for sample
- ``sample_paternal_id``: Within-family identifier for father of sample,
"." for missing
- ``sample_maternal_id``: Within-family identifier for mother of sample,
"." for missing
- ``sample_sex``: Sex code equal to 1 for male, 2 for female, and -1
for missing
- ``sample_phenotype``: Phenotype code equal to 1 for control, 2 for case,
and -1 for missing


See https://www.cog-genomics.org/plink/1.9/formats#fam for more details.

Raises
Expand Down Expand Up @@ -295,6 +302,4 @@ def read_plink(
)

# Assign PLINK-specific pedigree fields
return ds.assign(
{f"sample_{f}": (DIM_SAMPLE, arr_fam[f]) for f in arr_fam if f != "member_id"}
)
return ds.assign({f"sample_{f}": (DIM_SAMPLE, arr_fam[f]) for f in arr_fam})
39 changes: 36 additions & 3 deletions sgkit/io/plink/plink_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,19 @@
from sgkit.typing import ArrayLike, PathType

BED_READER_MISSING_INT_VALUE = -127
INT_MISSING = -1
STR_MISSING = "."

FAM_VARIABLE_TO_BED_READER = {
# sgkit variable name : bed reader properties name
"sample_family_id": "fid",
"sample_member_id": "iid",
"sample_paternal_id": "father",
"sample_maternal_id": "mother",
"sample_sex": "sex",
"sample_phenotype": "pheno",
"variant_id": "sid",
}


def write_plink(
Expand All @@ -20,6 +33,20 @@ def write_plink(
) -> None:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this have a missing: Hashable ='.' parameter? To specify the missing parent value and default to the new value from read_plink.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure this is needed. We don't parameterise the missing value in VCF for example, so I think it's ok to have the sgkit default of "." used here too.

Copy link
Collaborator

@timothymillar timothymillar Feb 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't parameterise the missing value in VCF

Good point, but "." is the missing value in the VCF spec where as "0" is the missing value in a plink fam file. Although, a "0" will stay as a "0" so I guess both cases are covered anyway 🤷

the sgkit default of "."

We should note that in the main documentation (not needed in this PR)

"""Convert a dataset to a PLINK file.

If any of the following pedigree-specific variables are defined in the dataset
they will be included in the PLINK fam file. Otherwise, the PLINK fam file will
contain missing values for these fields, except for the within-family identifier
for samples, which will be taken from the dataset ``sample_id``.

- ``sample_family_id``: Family identifier commonly referred to as FID
- ``sample_member_id``: Within-family identifier for sample
- ``sample_paternal_id``: Within-family identifier for father of sample
- ``sample_maternal_id``: Within-family identifier for mother of sample
- ``sample_sex``: Sex code equal to 1 for male, 2 for female, and -1
for missing
- ``sample_phenotype``: Phenotype code equal to 1 for control, 2 for case,
and -1 for missing

Parameters
----------
ds
Expand Down Expand Up @@ -78,11 +105,17 @@ def write_plink(
"bp_position": ds.variant_position.values,
"allele_1": ds.variant_allele.values[:, 0],
"allele_2": ds.variant_allele.values[:, 1],
"iid": ds.sample_id,
"iid": ds.sample_id.values, # may be overridden by sample_member_id below (if present)
}

if "variant_id" in ds:
properties["sid"] = ds.variant_id
for var, prop in FAM_VARIABLE_TO_BED_READER.items():
if var in ds:
values = ds[var].values
if values.dtype.kind in ("O", "S", "U"):
values = np.where(values == STR_MISSING, "0", values)
elif values.dtype.kind in ("i", "u"):
values = np.where(values == INT_MISSING, 0, values)
properties[prop] = values

output_file = bed_path
val = call_g.T
Expand Down
Binary file added sgkit/tests/io/plink/data/example_with_fam.bed
Binary file not shown.
2 changes: 2 additions & 0 deletions sgkit/tests/io/plink/data/example_with_fam.bim
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
1 1_10 0 10 A G
1 1_20 0 20 T C
10 changes: 10 additions & 0 deletions sgkit/tests/io/plink/data/example_with_fam.fam
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
ind0 ind0 0 0 0 -9
ind1 ind1 0 0 0 -9
ind2 ind2 ind1 ind0 2 1
ind3 ind3 ind1 ind0 1 2
ind4 ind4 0 0 0 -9
ind5 ind5 0 0 0 -9
ind6 ind6 0 0 0 -9
ind7 ind7 0 0 0 -9
ind8 ind8 0 0 0 -9
ind9 ind9 0 0 0 -9
6 changes: 2 additions & 4 deletions sgkit/tests/io/plink/test_plink_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,8 @@ def test_fixlen_str_variable(ds1):
assert ds1["variant_id"].dtype == np.dtype("<U13")
assert ds1["variant_allele"].dtype == np.dtype("|S6")
assert ds1["sample_family_id"].dtype == np.dtype("<U1")
# TODO: Remove 'None' strings https://github.com/pystatgen/sgkit-plink/issues/16
# which should make these <U1
assert ds1["sample_maternal_id"].dtype == np.dtype("<U4")
assert ds1["sample_paternal_id"].dtype == np.dtype("<U4")
assert ds1["sample_maternal_id"].dtype == np.dtype("<U1")
assert ds1["sample_paternal_id"].dtype == np.dtype("<U1")


def test_read_slicing(ds1):
Expand Down
28 changes: 19 additions & 9 deletions sgkit/tests/io/plink/test_plink_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from sgkit.testing import simulate_genotype_call_dataset

example_dataset_1 = "plink_sim_10s_100v_10pmiss"
example_dataset_2 = "example"
example_dataset_3 = "example_with_fam"


@pytest.fixture(params=[dict()])
Expand All @@ -15,28 +17,36 @@ def ds1(shared_datadir, request):
return read_plink(path=path, bim_sep="\t", fam_sep="\t", **request.param)


def test_write_plink(shared_datadir, tmp_path, ds1):
@pytest.mark.parametrize(
"plink_in, fam_sep",
[
(example_dataset_1, "\t"),
(example_dataset_2, " "),
(example_dataset_3, " "),
],
)
def test_write_plink(shared_datadir, tmp_path, plink_in, fam_sep):
# read plink file as a dataset then write it out again
ds = read_plink(path=shared_datadir / plink_in, fam_sep=fam_sep)
path = tmp_path / "plink_out"
path.mkdir(parents=True, exist_ok=False)
write_plink(ds1, path=path)
write_plink(ds, path=path)

# check bed files are the same
bed_path = path.with_suffix(".bed")
with open(
(shared_datadir / example_dataset_1).with_suffix(".bed"), "rb"
) as expected, open(bed_path, "rb") as actual:
with open((shared_datadir / plink_in).with_suffix(".bed"), "rb") as expected, open(
bed_path, "rb"
) as actual:
assert expected.read() == actual.read()

# check bim files are the same
bim_expected = read_bim(
(shared_datadir / example_dataset_1).with_suffix(".bim")
).compute()
bim_expected = read_bim((shared_datadir / plink_in).with_suffix(".bim")).compute()
bim_actual = read_bim(path.with_suffix(".bim")).compute()
pd.testing.assert_frame_equal(bim_expected, bim_actual)

# check fam files are the same
fam_expected = read_fam(
(shared_datadir / example_dataset_1).with_suffix(".fam"), sep="\t"
(shared_datadir / plink_in).with_suffix(".fam"), sep=fam_sep
).compute()
fam_actual = read_fam(path.with_suffix(".fam")).compute()
pd.testing.assert_frame_equal(fam_expected, fam_actual)
Expand Down