sgkit-dev · mergify · Feb 22, 2023 · Feb 9, 2023 · Feb 21, 2023 · Feb 22, 2023
diff --git a/sgkit/io/plink/plink_reader.py b/sgkit/io/plink/plink_reader.py
@@ -39,6 +39,9 @@
 BIM_DF_DTYPE = dict([(f[0], f[1]) for f in BIM_FIELDS])
 BIM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in BIM_FIELDS])
 
+INT_MISSING = -1
+STR_MISSING = "."
+
 
 class BedReader(object):
     def __init__(
@@ -81,8 +84,8 @@ def __getitem__(self, idx: Tuple[Any, ...]) -> NDArray:
         #       (missing would then be NaN)
         arr = arr.astype(self.dtype)
         # Add a ploidy dimension, so allele counts of 0, 1, 2 correspond to 00, 10, 11
-        call0 = np.where(arr < 0, -1, np.where(arr == 0, 0, 1))
-        call1 = np.where(arr < 0, -1, np.where(arr == 2, 1, 0))
+        call0 = np.where(arr < 0, INT_MISSING, np.where(arr == 0, 0, 1))
+        call1 = np.where(arr < 0, INT_MISSING, np.where(arr == 2, 1, 0))
         arr = np.stack([call0, call1], axis=-1)
         # Apply final slice to 3D result
         return arr[:, :, idx[-1]]
@@ -104,10 +107,12 @@ def coerce_code(v: dd.Series, codes: List[int]) -> dd.Series:
         # Set non-ints and unexpected codes to missing (-1)
         v = dd.to_numeric(v, errors="coerce")
         v = v.where(v.isin(codes), np.nan)
-        return v.fillna(-1).astype("int8")
+        return v.fillna(INT_MISSING).astype("int8")
 
-    df["paternal_id"] = df["paternal_id"].where(df["paternal_id"] != "0", None)
-    df["maternal_id"] = df["maternal_id"].where(df["maternal_id"] != "0", None)
+    # replace fam "0" with sgkit missing values (".")
+    df["family_id"] = df["family_id"].where(df["family_id"] != "0", STR_MISSING)
+    df["paternal_id"] = df["paternal_id"].where(df["paternal_id"] != "0", STR_MISSING)
+    df["maternal_id"] = df["maternal_id"].where(df["maternal_id"] != "0", STR_MISSING)
     df["sex"] = coerce_code(df["sex"], [1, 2])
     df["phenotype"] = coerce_code(df["phenotype"], [1, 2])
 
@@ -208,16 +213,18 @@ def read_plink(
 
     The following pedigree-specific fields are also included:
 
-    - ``sample_family_id``: Family identifier commonly referred to as FID
-    - ``sample_id``: Within-family identifier for sample
-    - ``sample_paternal_id``: Within-family identifier for father of sample
-    - ``sample_maternal_id``: Within-family identifier for mother of sample
+    - ``sample_family_id``: Family identifier commonly referred to as FID,
+        "." for missing
+    - ``sample_member_id`` and ``sample_id``: Within-family identifier for sample
+    - ``sample_paternal_id``: Within-family identifier for father of sample,
+        "." for missing
+    - ``sample_maternal_id``: Within-family identifier for mother of sample,
+        "." for missing
     - ``sample_sex``: Sex code equal to 1 for male, 2 for female, and -1
         for missing
     - ``sample_phenotype``: Phenotype code equal to 1 for control, 2 for case,
         and -1 for missing
 
-
     See https://www.cog-genomics.org/plink/1.9/formats#fam for more details.
 
     Raises
@@ -295,6 +302,4 @@ def read_plink(
     )
 
     # Assign PLINK-specific pedigree fields
-    return ds.assign(
-        {f"sample_{f}": (DIM_SAMPLE, arr_fam[f]) for f in arr_fam if f != "member_id"}
-    )
+    return ds.assign({f"sample_{f}": (DIM_SAMPLE, arr_fam[f]) for f in arr_fam})
diff --git a/sgkit/io/plink/plink_writer.py b/sgkit/io/plink/plink_writer.py
@@ -8,6 +8,19 @@
 from sgkit.typing import ArrayLike, PathType
 
 BED_READER_MISSING_INT_VALUE = -127
+INT_MISSING = -1
+STR_MISSING = "."
+
+FAM_VARIABLE_TO_BED_READER = {
+    # sgkit variable name : bed reader properties name
+    "sample_family_id": "fid",
+    "sample_member_id": "iid",
+    "sample_paternal_id": "father",
+    "sample_maternal_id": "mother",
+    "sample_sex": "sex",
+    "sample_phenotype": "pheno",
+    "variant_id": "sid",
+}
 
 
 def write_plink(
@@ -20,6 +33,20 @@ def write_plink(
 ) -> None:
     """Convert a dataset to a PLINK file.
 
+    If any of the following pedigree-specific variables are defined in the dataset
+    they will be included in the PLINK fam file. Otherwise, the PLINK fam file will
+    contain missing values for these fields, except for the within-family identifier
+    for samples, which will be taken from the dataset ``sample_id``.
+
+    - ``sample_family_id``: Family identifier commonly referred to as FID
+    - ``sample_member_id``: Within-family identifier for sample
+    - ``sample_paternal_id``: Within-family identifier for father of sample
+    - ``sample_maternal_id``: Within-family identifier for mother of sample
+    - ``sample_sex``: Sex code equal to 1 for male, 2 for female, and -1
+        for missing
+    - ``sample_phenotype``: Phenotype code equal to 1 for control, 2 for case,
+        and -1 for missing
+
     Parameters
     ----------
     ds
@@ -78,11 +105,17 @@ def write_plink(
         "bp_position": ds.variant_position.values,
         "allele_1": ds.variant_allele.values[:, 0],
         "allele_2": ds.variant_allele.values[:, 1],
-        "iid": ds.sample_id,
+        "iid": ds.sample_id.values,  # may be overridden by sample_member_id below (if present)
     }
 
-    if "variant_id" in ds:
-        properties["sid"] = ds.variant_id
+    for var, prop in FAM_VARIABLE_TO_BED_READER.items():
+        if var in ds:
+            values = ds[var].values
+            if values.dtype.kind in ("O", "S", "U"):
+                values = np.where(values == STR_MISSING, "0", values)
+            elif values.dtype.kind in ("i", "u"):
+                values = np.where(values == INT_MISSING, 0, values)
+            properties[prop] = values
 
     output_file = bed_path
     val = call_g.T

diff --git a/sgkit/tests/io/plink/data/example_with_fam.bed b/sgkit/tests/io/plink/data/example_with_fam.bed
diff --git a/sgkit/tests/io/plink/data/example_with_fam.bim b/sgkit/tests/io/plink/data/example_with_fam.bim
@@ -0,0 +1,2 @@
+1	1_10	0	10	A	G
+1	1_20	0	20	T	C
diff --git a/sgkit/tests/io/plink/data/example_with_fam.fam b/sgkit/tests/io/plink/data/example_with_fam.fam
@@ -0,0 +1,10 @@
+ind0 ind0 0 0 0 -9
+ind1 ind1 0 0 0 -9
+ind2 ind2 ind1 ind0 2 1
+ind3 ind3 ind1 ind0 1 2
+ind4 ind4 0 0 0 -9
+ind5 ind5 0 0 0 -9
+ind6 ind6 0 0 0 -9
+ind7 ind7 0 0 0 -9
+ind8 ind8 0 0 0 -9
+ind9 ind9 0 0 0 -9
diff --git a/sgkit/tests/io/plink/test_plink_reader.py b/sgkit/tests/io/plink/test_plink_reader.py
@@ -80,10 +80,8 @@ def test_fixlen_str_variable(ds1):
     assert ds1["variant_id"].dtype == np.dtype("<U13")
     assert ds1["variant_allele"].dtype == np.dtype("|S6")
     assert ds1["sample_family_id"].dtype == np.dtype("<U1")
-    # TODO: Remove 'None' strings https://github.com/pystatgen/sgkit-plink/issues/16
-    # which should make these <U1
-    assert ds1["sample_maternal_id"].dtype == np.dtype("<U4")
-    assert ds1["sample_paternal_id"].dtype == np.dtype("<U4")
+    assert ds1["sample_maternal_id"].dtype == np.dtype("<U1")
+    assert ds1["sample_paternal_id"].dtype == np.dtype("<U1")
 
 
 def test_read_slicing(ds1):

diff --git a/sgkit/tests/io/plink/test_plink_writer.py b/sgkit/tests/io/plink/test_plink_writer.py
@@ -7,6 +7,8 @@
 from sgkit.testing import simulate_genotype_call_dataset
 
 example_dataset_1 = "plink_sim_10s_100v_10pmiss"
+example_dataset_2 = "example"
+example_dataset_3 = "example_with_fam"
 
 
 @pytest.fixture(params=[dict()])
@@ -15,28 +17,36 @@ def ds1(shared_datadir, request):
     return read_plink(path=path, bim_sep="\t", fam_sep="\t", **request.param)
 
 
-def test_write_plink(shared_datadir, tmp_path, ds1):
+@pytest.mark.parametrize(
+    "plink_in, fam_sep",
+    [
+        (example_dataset_1, "\t"),
+        (example_dataset_2, " "),
+        (example_dataset_3, " "),
+    ],
+)
+def test_write_plink(shared_datadir, tmp_path, plink_in, fam_sep):
+    # read plink file as a dataset then write it out again
+    ds = read_plink(path=shared_datadir / plink_in, fam_sep=fam_sep)
     path = tmp_path / "plink_out"
     path.mkdir(parents=True, exist_ok=False)
-    write_plink(ds1, path=path)
+    write_plink(ds, path=path)
 
     # check bed files are the same
     bed_path = path.with_suffix(".bed")
-    with open(
-        (shared_datadir / example_dataset_1).with_suffix(".bed"), "rb"
-    ) as expected, open(bed_path, "rb") as actual:
+    with open((shared_datadir / plink_in).with_suffix(".bed"), "rb") as expected, open(
+        bed_path, "rb"
+    ) as actual:
         assert expected.read() == actual.read()
 
     # check bim files are the same
-    bim_expected = read_bim(
-        (shared_datadir / example_dataset_1).with_suffix(".bim")
-    ).compute()
+    bim_expected = read_bim((shared_datadir / plink_in).with_suffix(".bim")).compute()
     bim_actual = read_bim(path.with_suffix(".bim")).compute()
     pd.testing.assert_frame_equal(bim_expected, bim_actual)
 
     # check fam files are the same
     fam_expected = read_fam(
-        (shared_datadir / example_dataset_1).with_suffix(".fam"), sep="\t"
+        (shared_datadir / plink_in).with_suffix(".fam"), sep=fam_sep
     ).compute()
     fam_actual = read_fam(path.with_suffix(".fam")).compute()
     pd.testing.assert_frame_equal(fam_expected, fam_actual)