Skip to content

Warn zero #685

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
[0.2.4] - 2022-06-xx
********************

**Features**

- matching routines warn if no inference sites
(:pr:`685`, :issue:`683`, :user:`hyanwong`)

**Fixes**

- sample_data.subset() now accepts a sequence_length (:pr:`681`, :user:`hyanwong`)
Expand Down
16 changes: 15 additions & 1 deletion docs/tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,9 @@ variants from chromosome 24 of ten Norwegian and French house sparrows,
Read the sites in the vcf and add them to the samples object, reordering the
alleles to put the ancestral allele first, if it is available.
"""
# You may want to change the following line, e.g. here we allow * (a spanning
# deletion) to be a valid allele state
allowed_allele_chars = set("ATGCatgc*")
pos = 0
for variant in vcf: # Loop over variants, each assumed at a unique site
if pos == variant.POS:
Expand All @@ -387,9 +390,20 @@ variants from chromosome 24 of ten Norwegian and French house sparrows,
if any([not phased for _, _, phased in variant.genotypes]):
raise ValueError("Unphased genotypes for variant at position", pos)
alleles = [variant.REF] + variant.ALT
ancestral = variant.INFO.get("AA", variant.REF)
ancestral = variant.INFO.get("AA", ".") # "." means unknown
# some VCFs (e.g. from 1000G) have many values in the AA field: take the 1st
ancestral = ancestral.split("|")[0]
if ancestral == ".":
# use the reference as ancestral, if unknown (NB: you may not want this)
ancestral = variant.REF
# Ancestral state must be first in the allele list.
ordered_alleles = [ancestral] + list(set(alleles) - {ancestral})
# Check that every allele consists only of allowed characters (by default
# A, T, G, C in either case, plus "*" for a spanning deletion).
for allele in ordered_alleles:
    if len(set(allele) - allowed_allele_chars) > 0:
        # The message must be an f-string, otherwise the placeholders are
        # printed literally instead of being filled in.
        raise ValueError(
            f"Site at pos {pos}: allele {allele} not in {allowed_allele_chars}"
        )
allele_index = {
old_index: ordered_alleles.index(allele)
for old_index, allele in enumerate(alleles)
Expand Down
21 changes: 21 additions & 0 deletions tests/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import io
import itertools
import json
import logging
import os.path
import random
import string
Expand Down Expand Up @@ -729,6 +730,14 @@ class TestZeroInferenceSites:
Tests for the degenerate case in which we have no inference sites.
"""

@classmethod
def setup_class(cls):
    """Silence log output up to and including CRITICAL for this class's tests."""
    logging.disable(level=logging.CRITICAL)

@classmethod
def teardown_class(cls):
    """Re-enable logging once this class's tests have finished."""
    logging.disable(level=logging.NOTSET)

def verify(self, genotypes):
genotypes = np.array(genotypes, dtype=np.int8)
m = genotypes.shape[0]
Expand Down Expand Up @@ -774,6 +783,18 @@ def test_three_sites(self):
self.verify([[1, 1], [1, 1], [1, 1]])


class TestZeroInferenceSitesWarning:
    """
    Checks that the matching routines log a warning when no sites are
    used for inference.
    """

    def test_warning_match_ancestors(self, caplog):
        # Build a sample data file with one site added with genotypes [0, 0].
        with tsinfer.SampleData(sequence_length=10) as sample_data:
            sample_data.add_site(1, [0, 0])
        ancestors = tsinfer.generate_ancestors(sample_data)

        with caplog.at_level(logging.WARNING):
            # The first warning is expected from matching the ancestors ...
            ancestors_ts = tsinfer.match_ancestors(sample_data, ancestors)
            assert caplog.text.count("No sites used") == 1
            # ... and a second one from matching the samples.
            tsinfer.match_samples(sample_data, ancestors_ts)
            assert caplog.text.count("No sites used") == 2


def random_string(rng, max_len=10):
"""
Uses the specified random generator to generate a random string.
Expand Down
2 changes: 2 additions & 0 deletions tsinfer/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -1027,6 +1027,8 @@ def __init__(
self.path_compression = path_compression
self.num_samples = self.sample_data.num_samples
self.num_sites = len(inference_site_position)
if self.num_sites == 0:
logging.warning("No sites used for inference")
num_intervals = max(self.num_sites - 1, 0)
self.progress_monitor = _get_progress_monitor(progress_monitor)
self.match_progress = None # Allocated by subclass
Expand Down