Skip to content

Commit cac9a35

Browse files
Merge pull request #169 from hyanwong/allow-missing-data-in-samples
Allow missing data in samples
2 parents e01b15f + 5bb2089 commit cac9a35

File tree

10 files changed

+316
-136
lines changed

10 files changed

+316
-136
lines changed

docs/inference.rst

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,13 @@ Data requirements
2323
Input haplotype data for tskit must satisfy the following requirements:
2424

2525
- Data must be *phased*.
26-
- For each site used for inference we must know the *ancestral state*. Note that this is
27-
not necessarily the same as the REF column from VCF, and ancestral
26+
- For each site used for inference we must know the *ancestral state*. Note that
27+
this is not necessarily the same as the REF column from VCF, and ancestral
2828
states must be computed using some other method.
2929
- Only biallelic sites can be used for inference.
30+
- Missing data can be included in the haplotypes using the value
31+
``tskit.MISSING_DATA`` (-1). The inferred tree sequence will have values
32+
imputed for the haplotypes at these sites.
3033

3134

3235
.. _sec_inference_data_model:
@@ -36,7 +39,7 @@ Data model
3639
**********
3740

3841
The data model for ``tsinfer`` is tightly integrated with
39-
``tskit``'s `data model <https://tskit.readthedocs.io/en/latest/data-model.html>`_
42+
``tskit``'s :ref:`data model <sec_data_model>`
4043
and uses the same concepts throughout. The intermediate file formats and APIs
4144
described here provide a bridge between this model and existing data sources. For
4245
convenience, we provide a brief description of concepts needed for importing
@@ -183,10 +186,8 @@ paths, "path compression".
183186

184187
The copying path for each ancestor then describes its ancestry at every
185188
point in the sequence: from a genealogical perspective, we know its
186-
parent node. This information is encoded precisely as an `edge
187-
<https://tskit.readthedocs.io/en/latest/data-model.html#edge-table>`_ in a
188-
`tree sequence
189-
<https://tskit.readthedocs.io/en/latest/data-model.html>`_.
189+
parent node. This information is encoded precisely as an :ref:`edge
190+
<sec_edge_table_definition>` in a :ref:`tree sequence <sec_data_model>`.
190191
Thus, we refer to the output of this step as the "ancestors tree sequence",
191192
which is conventionally stored in a file ending with ``.ancestors.trees``.
192193

@@ -204,18 +205,14 @@ The final phase of a ``tsinfer`` inference consists of a number steps:
204205
way as for ancestors, and the path compression algorithm can be equally
205206
applied here.
206207

207-
208208
2. As we only use a subset of the available sites for inference
209209
(excluding by default any sites that are fixed or singletons)
210210
we then place mutations on the inferred trees in order to
211-
represent the information at these sites. This is done using the tskit
212-
`map_mutations <https://tskit.readthedocs.io/en/latest/python-api.html#tskit.Tree.map_mutations>`_.
213-
method.
214-
211+
represent the information at these sites. This is done using
212+
:meth:`tskit.Tree.map_mutations`.
215213

216214
3. Remove ancestral paths that do not lead to any of the samples by
217-
`simplifying
218-
<https://tskit.readthedocs.io/en/latest/python-api.html#tskit.TreeSequence.simplify>`_
215+
:meth:`simplifying <tskit.TreeSequence.simplify>`
219216
the final tree sequence. When simplifying, we keep non-branching ("unary")
220217
nodes, as they indicate ancestors which we have actively inferred, and
221218
for technical reasons keeping unary ancestors can also lead to better

lib/ancestor_builder.c

Lines changed: 69 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -238,45 +238,57 @@ ancestor_builder_compute_ancestral_states(ancestor_builder_t *self,
238238

239239
genotypes = self->sites[l].genotypes;
240240
ones = 0;
241+
zeros = 0;
241242
for (j = 0; j < sample_set_size; j++) {
242-
ones += (size_t) genotypes[sample_set[j]];
243+
switch (genotypes[sample_set[j]]) {
244+
case 0:
245+
zeros++;
246+
break;
247+
case 1:
248+
ones++;
249+
break;
250+
}
243251
}
244-
zeros = sample_set_size - ones;
245252
consensus = 0;
246-
if (ones >= zeros) {
247-
consensus = 1;
248-
}
249-
/* printf("\t:ones=%d, consensus=%d\n", (int) ones, consensus); */
250-
/* fflush(stdout); */
251-
for (j = 0; j < sample_set_size; j++) {
252-
u = sample_set[j];
253-
if (disagree[u] && genotypes[u] != consensus) {
254-
/* This sample has disagreed with consensus twice in a row,
255-
* so remove it */
256-
/* printf("\t\tremoving %d\n", sample_set[j]); */
257-
sample_set[j] = -1;
253+
if (ones + zeros > 0) {
254+
/* We have valid (non-missing) data in the tracked samples at site l */
255+
if (ones >= zeros) {
256+
consensus = 1;
258257
}
259-
}
260-
/* Repack the sample set */
261-
tmp_size = 0;
262-
for (j = 0; j < sample_set_size; j++) {
263-
if (sample_set[j] != -1) {
264-
sample_set[tmp_size] = sample_set[j];
265-
tmp_size++;
258+
/* printf("\t:ones=%d, consensus=%d\n", (int) ones, consensus); */
259+
/* fflush(stdout); */
260+
for (j = 0; j < sample_set_size; j++) {
261+
u = sample_set[j];
262+
if (disagree[u] && (genotypes[u] != consensus) &&
263+
(genotypes[u] != TSK_MISSING_DATA)) {
264+
/* This sample has disagreed with consensus twice in a row,
265+
* so remove it */
266+
/* printf("\t\tremoving %d\n", sample_set[j]); */
267+
sample_set[j] = -1;
268+
}
269+
}
270+
/* Repack the sample set */
271+
tmp_size = 0;
272+
for (j = 0; j < sample_set_size; j++) {
273+
if (sample_set[j] != -1) {
274+
sample_set[tmp_size] = sample_set[j];
275+
tmp_size++;
276+
}
277+
}
278+
sample_set_size = tmp_size;
279+
if (sample_set_size <= min_sample_set_size) {
280+
/* printf("BREAK\n"); */
281+
break;
282+
}
283+
/* For the remaining sample set, set the disagree flags based
284+
* on whether they agree with the consensus for this site. */
285+
for (j = 0; j < sample_set_size; j++) {
286+
u = sample_set[j];
287+
disagree[u] = ((genotypes[u] != consensus) &&
288+
(genotypes[u] != TSK_MISSING_DATA));
266289
}
267-
}
268-
sample_set_size = tmp_size;
269-
if (sample_set_size <= min_sample_set_size) {
270-
/* printf("BREAK\n"); */
271-
break;
272290
}
273291
ancestor[l] = consensus;
274-
/* For the remaining sample set, set the disagree flags based
275-
* on whether they agree with the consensus for this site. */
276-
for (j = 0; j < sample_set_size; j++) {
277-
u = sample_set[j];
278-
disagree[u] = genotypes[u] != consensus;
279-
}
280292
}
281293
}
282294
*last_site_ret = last_site;
@@ -312,12 +324,22 @@ ancestor_builder_compute_between_focal_sites(ancestor_builder_t *self,
312324
/* } */
313325
genotypes = self->sites[l].genotypes;
314326
ones = 0;
327+
zeros = 0;
315328
for (k = 0; k < sample_set_size; k++) {
316-
ones += (size_t) genotypes[sample_set[k]];
329+
switch (genotypes[sample_set[k]]) {
330+
case 0:
331+
zeros++;
332+
break;
333+
case 1:
334+
ones++;
335+
break;
336+
}
317337
}
318-
zeros = sample_set_size - ones;
319-
if (ones >= zeros) {
320-
ancestor[l] = 1;
338+
if (ones + zeros > 0) {
339+
/* We have valid (non-missing) data at this site */
340+
if (ones >= zeros) {
341+
ancestor[l] = 1;
342+
}
321343
}
322344
}
323345
}
@@ -445,15 +467,23 @@ ancestor_builder_break_ancestor(ancestor_builder_t *self, tsk_id_t a,
445467
{
446468
bool ret = false;
447469
tsk_id_t j, k;
448-
size_t ones;
470+
size_t ones, missing;
449471

450472
for (j = a + 1; j < b && !ret; j++) {
451473
if (self->sites[j].time > self->sites[a].time) {
452474
ones = 0;
475+
missing = 0;
453476
for (k = 0; k < (tsk_id_t) num_samples; k++) {
454-
ones += (size_t) self->sites[j].genotypes[samples[k]];
477+
switch (self->sites[j].genotypes[samples[k]]) {
478+
case TSK_MISSING_DATA:
479+
missing++;
480+
break;
481+
case 1:
482+
ones++;
483+
break;
484+
}
455485
}
456-
if (ones != num_samples && ones != 0) {
486+
if (ones != (num_samples - missing) && ones != 0) {
457487
ret = true;
458488
}
459489
}

lib/tsinfer.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
** Copyright (C) 2018 University of Oxford
2+
** Copyright (C) 2018-2020 University of Oxford
33
**
44
** This file is part of tsinfer.
55
**
@@ -27,6 +27,9 @@
2727
#include "object_heap.h"
2828
#include "avl.h"
2929

30+
/* TODO remove this when we update tskit version. */
31+
#define TSK_MISSING_DATA (-1)
32+
3033
/* NULL_LIKELIHOOD represents a compressed path and NONZERO_ROOT_LIKELIHOOD
3134
* marks a node that is not in the current tree. */
3235
#define NULL_LIKELIHOOD (-1)

requirements/conda-minimal.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ numpy
33
six
44
tqdm
55
humanize
6-
tskit
6+
tskit>=0.2.3
77
msprime
88
zarr
99
lmdb

tests/test_formats.py

Lines changed: 47 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -514,17 +514,16 @@ def test_provenance(self):
514514

515515
def test_variant_errors(self):
516516
input_file = formats.SampleData(sequence_length=10)
517-
genotypes = np.zeros(2, np.int8)
517+
genotypes = [0, 0]
518518
input_file.add_site(0, alleles=["0", "1"], genotypes=genotypes)
519519
for bad_position in [-1, 10, 100]:
520520
self.assertRaises(
521521
ValueError, input_file.add_site, position=bad_position,
522522
alleles=["0", "1"], genotypes=genotypes)
523-
for bad_genotypes in [[0, 2], [-1, 0], [], [0], [0, 0, 0]]:
524-
genotypes = np.array(bad_genotypes, dtype=np.int8)
523+
for bad_genotypes in [[0, 2], [-2, 0], [], [0], [0, 0, 0]]:
525524
self.assertRaises(
526525
ValueError, input_file.add_site, position=1,
527-
alleles=["0", "1"], genotypes=genotypes)
526+
alleles=["0", "1"], genotypes=bad_genotypes)
528527
self.assertRaises(
529528
ValueError, input_file.add_site, position=1, inference=True,
530529
alleles=["0", "1", "2"], genotypes=[0, 1])
@@ -533,28 +532,41 @@ def test_variant_errors(self):
533532
alleles=["0"], genotypes=[0, 1])
534533
self.assertRaises(
535534
ValueError, input_file.add_site, position=1,
536-
alleles=["0", "1"], genotypes=np.array([0, 2], dtype=np.int8))
535+
alleles=["0", "1"], genotypes=[0, 2])
537536
self.assertRaises(
538537
ValueError, input_file.add_site, position=1,
539-
alleles=["0", "0"], genotypes=np.array([0, 2], dtype=np.int8))
538+
alleles=["0", "0"], genotypes=[0, 2])
540539

541540
def test_invalid_inference_sites(self):
542541
# Trying to add singletons or fixed sites as inference sites
543542
# raise and error
544543
input_file = formats.SampleData()
545544
# Make sure this is OK
546-
input_file.add_site(0, [0, 1, 1], inference=True)
545+
input_file.add_site(0, [0, 1, 1, tskit.MISSING_DATA], inference=True)
546+
self.assertRaises(
547+
ValueError, input_file.add_site,
548+
position=1, genotypes=[0, 0, 0, 0], inference=True)
549+
self.assertRaises(
550+
ValueError, input_file.add_site,
551+
position=1, genotypes=[1, 0, 0, 0], inference=True)
547552
self.assertRaises(
548553
ValueError, input_file.add_site,
549-
position=1, genotypes=[0, 0, 0], inference=True)
554+
position=1, genotypes=[1, 1, 1, 1], inference=True)
550555
self.assertRaises(
551556
ValueError, input_file.add_site,
552-
position=1, genotypes=[1, 0, 0], inference=True)
557+
position=1, genotypes=[tskit.MISSING_DATA, 0, 0, 0], inference=True)
553558
self.assertRaises(
554559
ValueError, input_file.add_site,
555-
position=1, genotypes=[1, 1, 1], inference=True)
560+
position=1, genotypes=[tskit.MISSING_DATA, 1, 1, 1], inference=True)
561+
self.assertRaises(
562+
ValueError, input_file.add_site,
563+
position=1, genotypes=[tskit.MISSING_DATA, 0, 1, 0], inference=True)
564+
self.assertRaises(
565+
ValueError, input_file.add_site,
566+
position=1, genotypes=[tskit.MISSING_DATA] * 4, inference=True)
567+
# Check we can still add at pos 1
556568
input_file.add_site(
557-
position=1, genotypes=[1, 0, 1], inference=True)
569+
position=1, genotypes=[1, 0, 1, tskit.MISSING_DATA], inference=True)
558570

559571
def test_duplicate_sites(self):
560572
# Duplicate sites are not accepted.
@@ -1146,6 +1158,30 @@ def test_too_many_alleles(self):
11461158
sample_data.add_site(
11471159
0, [0, 0], alleles=[str(x) for x in range(num_alleles)])
11481160

1161+
def test_missing_data(self):
1162+
u = tskit.MISSING_DATA
1163+
sites_by_samples = np.array([
1164+
[u, u, u, 1, 1, 0, 1, 1, 1],
1165+
[u, u, u, 1, 1, 0, 1, 1, 0],
1166+
[u, u, u, 1, 0, 1, 1, 0, 1],
1167+
[u, 0, 0, 1, 1, 1, 1, u, u],
1168+
[u, 0, 1, 1, 1, 0, 1, u, u],
1169+
[u, 1, 1, 0, 0, 0, 0, u, u]
1170+
], dtype=np.int8)
1171+
with tsinfer.SampleData() as data:
1172+
for col in range(sites_by_samples.shape[1]):
1173+
data.add_site(col, sites_by_samples[:, col])
1174+
1175+
self.assertEqual(data.sequence_length, 9.0)
1176+
self.assertEqual(data.num_sites, 9)
1177+
# First site is a entirely missing, second is singleton with missing data =>
1178+
# neither should be marked for inference
1179+
inference_sites = data.sites_inference[:]
1180+
self.assertEqual(inference_sites[0], 0) # Entirely missing data
1181+
self.assertEqual(inference_sites[1], 0) # Singleton with missing data
1182+
for i in inference_sites[2:]:
1183+
self.assertEqual(i, 1)
1184+
11491185

11501186
class TestAncestorData(unittest.TestCase, DataContainerMixin):
11511187
"""

0 commit comments

Comments
 (0)