Skip to content

Commit 4f49c76

Browse files
Make simplify thread-safe in no-filter case
1 parent 8998021 commit 4f49c76

File tree

3 files changed

+207
-97
lines changed

3 files changed

+207
-97
lines changed

c/tskit/tables.c

+98-95
Original file line numberDiff line numberDiff line change
@@ -9542,71 +9542,115 @@ simplifier_finalise_site_references(
95429542
}
95439543

95449544
static int TSK_WARN_UNUSED
9545-
simplifier_finalise_population_references(
9546-
simplifier_t *self, const bool *population_referenced, tsk_id_t *population_id_map)
9545+
simplifier_finalise_population_references(simplifier_t *self)
95479546
{
95489547
int ret = 0;
9549-
tsk_id_t ret_id;
95509548
tsk_size_t j;
9549+
tsk_id_t pop_id, ret_id;
95519550
tsk_population_t pop;
9551+
tsk_id_t *node_population = self->tables->nodes.population;
9552+
const tsk_size_t num_nodes = self->tables->nodes.num_rows;
95529553
const tsk_size_t num_populations = self->input_tables.populations.num_rows;
9554+
bool *population_referenced
9555+
= tsk_calloc(num_populations, sizeof(*population_referenced));
9556+
tsk_id_t *population_id_map
9557+
= tsk_malloc(num_populations * sizeof(*population_id_map));
95539558

9554-
if (self->options & TSK_SIMPLIFY_FILTER_POPULATIONS) {
9555-
for (j = 0; j < num_populations; j++) {
9556-
tsk_population_table_get_row_unsafe(
9557-
&self->input_tables.populations, (tsk_id_t) j, &pop);
9558-
population_id_map[j] = TSK_NULL;
9559-
if (population_referenced[j]) {
9560-
ret_id = tsk_population_table_add_row(
9561-
&self->tables->populations, pop.metadata, pop.metadata_length);
9562-
if (ret_id < 0) {
9563-
ret = (int) ret_id;
9564-
goto out;
9565-
}
9566-
population_id_map[j] = ret_id;
9559+
tsk_bug_assert(self->options & TSK_SIMPLIFY_FILTER_POPULATIONS);
9560+
9561+
if (population_referenced == NULL || population_id_map == NULL) {
9562+
ret = TSK_ERR_NO_MEMORY;
9563+
goto out;
9564+
}
9565+
9566+
for (j = 0; j < num_nodes; j++) {
9567+
pop_id = node_population[j];
9568+
if (pop_id != TSK_NULL) {
9569+
population_referenced[pop_id] = true;
9570+
}
9571+
}
9572+
9573+
for (j = 0; j < num_populations; j++) {
9574+
tsk_population_table_get_row_unsafe(
9575+
&self->input_tables.populations, (tsk_id_t) j, &pop);
9576+
population_id_map[j] = TSK_NULL;
9577+
if (population_referenced[j]) {
9578+
ret_id = tsk_population_table_add_row(
9579+
&self->tables->populations, pop.metadata, pop.metadata_length);
9580+
if (ret_id < 0) {
9581+
ret = (int) ret_id;
9582+
goto out;
95679583
}
9584+
population_id_map[j] = ret_id;
95689585
}
9569-
} else {
9570-
tsk_bug_assert(self->tables->populations.num_rows == num_populations);
9571-
for (j = 0; j < num_populations; j++) {
9572-
population_id_map[j] = (tsk_id_t) j;
9586+
}
9587+
9588+
/* Remap the IDs in the node table */
9589+
for (j = 0; j < num_nodes; j++) {
9590+
pop_id = node_population[j];
9591+
if (pop_id != TSK_NULL) {
9592+
node_population[j] = population_id_map[pop_id];
95739593
}
95749594
}
95759595
out:
9596+
tsk_safe_free(population_id_map);
9597+
tsk_safe_free(population_referenced);
95769598
return ret;
95779599
}
95789600

95799601
static int TSK_WARN_UNUSED
9580-
simplifier_finalise_individual_references(
9581-
simplifier_t *self, const bool *individual_referenced, tsk_id_t *individual_id_map)
9602+
simplifier_finalise_individual_references(simplifier_t *self)
95829603
{
95839604
int ret = 0;
9584-
tsk_id_t ret_id;
95859605
tsk_size_t j;
9606+
tsk_id_t pop_id, ret_id;
95869607
tsk_individual_t ind;
9587-
const tsk_size_t num_individuals = self->input_tables.individuals.num_rows;
9608+
tsk_id_t *node_individual = self->tables->nodes.individual;
95889609
tsk_id_t *parents;
9610+
const tsk_size_t num_nodes = self->tables->nodes.num_rows;
9611+
const tsk_size_t num_individuals = self->input_tables.individuals.num_rows;
9612+
bool *individual_referenced
9613+
= tsk_calloc(num_individuals, sizeof(*individual_referenced));
9614+
tsk_id_t *individual_id_map
9615+
= tsk_malloc(num_individuals * sizeof(*individual_id_map));
95899616

9590-
if (self->options & TSK_SIMPLIFY_FILTER_INDIVIDUALS) {
9591-
for (j = 0; j < num_individuals; j++) {
9592-
tsk_individual_table_get_row_unsafe(
9593-
&self->input_tables.individuals, (tsk_id_t) j, &ind);
9594-
individual_id_map[j] = TSK_NULL;
9595-
if (individual_referenced[j]) {
9596-
ret_id = tsk_individual_table_add_row(&self->tables->individuals,
9597-
ind.flags, ind.location, ind.location_length, ind.parents,
9598-
ind.parents_length, ind.metadata, ind.metadata_length);
9599-
if (ret_id < 0) {
9600-
ret = (int) ret_id;
9601-
goto out;
9602-
}
9603-
individual_id_map[j] = ret_id;
9617+
tsk_bug_assert(self->options & TSK_SIMPLIFY_FILTER_INDIVIDUALS);
9618+
9619+
if (individual_referenced == NULL || individual_id_map == NULL) {
9620+
ret = TSK_ERR_NO_MEMORY;
9621+
goto out;
9622+
}
9623+
9624+
for (j = 0; j < num_nodes; j++) {
9625+
pop_id = node_individual[j];
9626+
if (pop_id != TSK_NULL) {
9627+
individual_referenced[pop_id] = true;
9628+
}
9629+
}
9630+
9631+
for (j = 0; j < num_individuals; j++) {
9632+
tsk_individual_table_get_row_unsafe(
9633+
&self->input_tables.individuals, (tsk_id_t) j, &ind);
9634+
individual_id_map[j] = TSK_NULL;
9635+
if (individual_referenced[j]) {
9636+
/* Can't remap the parents inline here because we have no
9637+
* guarantees about sortedness */
9638+
ret_id = tsk_individual_table_add_row(&self->tables->individuals, ind.flags,
9639+
ind.location, ind.location_length, ind.parents, ind.parents_length,
9640+
ind.metadata, ind.metadata_length);
9641+
if (ret_id < 0) {
9642+
ret = (int) ret_id;
9643+
goto out;
96049644
}
9645+
individual_id_map[j] = ret_id;
96059646
}
9606-
} else {
9607-
tsk_bug_assert(self->tables->individuals.num_rows == num_individuals);
9608-
for (j = 0; j < num_individuals; j++) {
9609-
individual_id_map[j] = (tsk_id_t) j;
9647+
}
9648+
9649+
/* Remap the IDs in the node table */
9650+
for (j = 0; j < num_nodes; j++) {
9651+
pop_id = node_individual[j];
9652+
if (pop_id != TSK_NULL) {
9653+
node_individual[j] = individual_id_map[pop_id];
96109654
}
96119655
}
96129656

@@ -9619,7 +9663,10 @@ simplifier_finalise_individual_references(
96199663
parents[j] = individual_id_map[parents[j]];
96209664
}
96219665
}
9666+
96229667
out:
9668+
tsk_safe_free(individual_id_map);
9669+
tsk_safe_free(individual_referenced);
96239670
return ret;
96249671
}
96259672

@@ -9687,26 +9734,6 @@ static int TSK_WARN_UNUSED
96879734
simplifier_flush_output(simplifier_t *self)
96889735
{
96899736
int ret = 0;
9690-
const tsk_size_t num_nodes = self->tables->nodes.num_rows;
9691-
const tsk_size_t num_populations = self->input_tables.populations.num_rows;
9692-
const tsk_size_t num_individuals = self->input_tables.individuals.num_rows;
9693-
tsk_id_t *node_population = self->tables->nodes.population;
9694-
bool *population_referenced
9695-
= tsk_calloc(num_populations, sizeof(*population_referenced));
9696-
tsk_id_t *population_id_map
9697-
= tsk_malloc(num_populations * sizeof(*population_id_map));
9698-
tsk_id_t *node_individual = self->tables->nodes.individual;
9699-
bool *individual_referenced
9700-
= tsk_calloc(num_individuals, sizeof(*individual_referenced));
9701-
tsk_id_t *individual_id_map
9702-
= tsk_malloc(num_individuals * sizeof(*individual_id_map));
9703-
tsk_size_t j;
9704-
tsk_id_t pop_id, ind_id;
9705-
9706-
if (population_referenced == NULL || population_id_map == NULL
9707-
|| individual_referenced == NULL || individual_id_map == NULL) {
9708-
goto out;
9709-
}
97109737

97119738
/* TODO Migrations fit reasonably neatly into the pattern that we have here. We
97129739
* can consider references to populations from migration objects in the same way
@@ -9727,44 +9754,20 @@ simplifier_flush_output(simplifier_t *self)
97279754
goto out;
97289755
}
97299756

9730-
for (j = 0; j < num_nodes; j++) {
9731-
pop_id = node_population[j];
9732-
if (pop_id != TSK_NULL) {
9733-
population_referenced[pop_id] = true;
9734-
}
9735-
ind_id = node_individual[j];
9736-
if (ind_id != TSK_NULL) {
9737-
individual_referenced[ind_id] = true;
9757+
if (self->options & TSK_SIMPLIFY_FILTER_POPULATIONS) {
9758+
ret = simplifier_finalise_population_references(self);
9759+
if (ret != 0) {
9760+
goto out;
97389761
}
97399762
}
9740-
9741-
ret = simplifier_finalise_population_references(
9742-
self, population_referenced, population_id_map);
9743-
if (ret != 0) {
9744-
goto out;
9745-
}
9746-
ret = simplifier_finalise_individual_references(
9747-
self, individual_referenced, individual_id_map);
9748-
if (ret != 0) {
9749-
goto out;
9750-
}
9751-
9752-
/* Remap node IDs referencing the above */
9753-
for (j = 0; j < num_nodes; j++) {
9754-
pop_id = node_population[j];
9755-
if (pop_id != TSK_NULL) {
9756-
node_population[j] = population_id_map[pop_id];
9757-
}
9758-
ind_id = node_individual[j];
9759-
if (ind_id != TSK_NULL) {
9760-
node_individual[j] = individual_id_map[ind_id];
9763+
if (self->options & TSK_SIMPLIFY_FILTER_INDIVIDUALS) {
9764+
ret = simplifier_finalise_individual_references(self);
9765+
if (ret != 0) {
9766+
goto out;
97619767
}
97629768
}
9769+
97639770
out:
9764-
tsk_safe_free(population_referenced);
9765-
tsk_safe_free(population_id_map);
9766-
tsk_safe_free(individual_referenced);
9767-
tsk_safe_free(individual_id_map);
97689771
return ret;
97699772
}
97709773

c/tskit/tables.h

+19-2
Original file line numberDiff line numberDiff line change
@@ -686,6 +686,9 @@ reference them. */
686686
#define TSK_SIMPLIFY_FILTER_POPULATIONS (1 << 1)
687687
/** Remove individuals from the output if there are no nodes that reference them.*/
688688
#define TSK_SIMPLIFY_FILTER_INDIVIDUALS (1 << 2)
689+
/** Do not remove nodes from the output if there are no edges that reference
690+
them. */
691+
#define TSK_SIMPLIFY_NO_FILTER_NODES (1 << 7)
689692
/**
690693
Reduce the topological information in the tables to the minimum necessary to
691694
represent the trees that contain sites. If there are zero sites this will
@@ -719,7 +722,6 @@ flag). It keeps unary nodes, but only if the unary node is referenced from an in
719722
this flag is negated compare to other filtering options because the default
720723
behaviour is remove unreferenced nodes.
721724
*/
722-
#define TSK_SIMPLIFY_NO_FILTER_NODES (1 << 7)
723725
/** @} */
724726

725727
/**
@@ -3923,6 +3925,21 @@ input tables, but not in the specified list of sample IDs (if provided). The
39233925
``node_map[u] == u`` for all nodes. Note also that the order of the list of
39243926
samples is not important in this case.
39253927
3928+
When table is not filtered (i.e., if the `TSK_SIMPLIFY_NO_FILTER_NODES`
3929+
option is provided or the `TSK_SIMPLIFY_FILTER_SITES`,
3930+
`TSK_SIMPLIFY_FILTER_POPULATIONS` or `TSK_SIMPLIFY_FILTER_INDIVIDUALS`
3931+
options are *not* provided) the corresponding table is modified as
3932+
little as possible, and all pointers are guaranteed to remain valid
3933+
after simplification. The only changes made to an unfiltered table are
3934+
to update any references to tables that may have changed (for example,
3935+
remapping population IDs in the node table if
3936+
`TSK_SIMPLIFY_FILTER_POPULATIONS` was specified).
3937+
3938+
.. note:: It is possible for populations and individuals to be filtered
3939+
even if `TSK_SIMPLIFY_NO_FILTER_NODES` is specified because there
3940+
may be entirely unreferences entities in the input tables, which
3941+
are not affected by whether we filter nodes or not.
3942+
39263943
The table collection will always be unindexed after simplify successfully completes.
39273944
39283945
.. note:: Migrations are currently not supported by simplify, and an error will
@@ -3937,11 +3954,11 @@ Options can be specified by providing one or more of the following bitwise
39373954
- :c:macro:`TSK_SIMPLIFY_FILTER_SITES`
39383955
- :c:macro:`TSK_SIMPLIFY_FILTER_POPULATIONS`
39393956
- :c:macro:`TSK_SIMPLIFY_FILTER_INDIVIDUALS`
3957+
- :c:macro:`TSK_SIMPLIFY_NO_FILTER_NODES`
39403958
- :c:macro:`TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY`
39413959
- :c:macro:`TSK_SIMPLIFY_KEEP_UNARY`
39423960
- :c:macro:`TSK_SIMPLIFY_KEEP_INPUT_ROOTS`
39433961
- :c:macro:`TSK_SIMPLIFY_KEEP_UNARY_IN_INDIVIDUALS`
3944-
- :c:macro:`TSK_SIMPLIFY_NO_FILTER_NODES`
39453962
@endrst
39463963
39473964
@param self A pointer to a tsk_table_collection_t object.

python/tests/test_topology.py

+90
Original file line numberDiff line numberDiff line change
@@ -5655,6 +5655,96 @@ def test_many_trees_recurrent_mutations_internal_samples(self):
56555655
self.verify_simplify_haplotypes(ts, samples, keep_unary=keep)
56565656

56575657

5658+
class TestSimplifyUnreferencedPopulations:
5659+
def example(self):
5660+
tables = tskit.TableCollection(1)
5661+
tables.populations.add_row()
5662+
tables.populations.add_row()
5663+
# No references to population 0
5664+
tables.nodes.add_row(time=0, population=1, flags=1)
5665+
tables.nodes.add_row(time=0, population=1, flags=1)
5666+
tables.nodes.add_row(time=1, population=1, flags=0)
5667+
# Unreference node
5668+
tables.nodes.add_row(time=1, population=1, flags=0)
5669+
tables.edges.add_row(0, 1, parent=2, child=0)
5670+
tables.edges.add_row(0, 1, parent=2, child=1)
5671+
tables.sort()
5672+
return tables
5673+
5674+
def test_no_filter_populations(self):
5675+
tables = self.example()
5676+
tables.simplify(filter_populations=False)
5677+
assert len(tables.populations) == 2
5678+
assert len(tables.nodes) == 3
5679+
assert np.all(tables.nodes.population == 1)
5680+
5681+
def test_no_filter_populations_nodes(self):
5682+
tables = self.example()
5683+
tables.simplify(filter_populations=False, filter_nodes=False)
5684+
assert len(tables.populations) == 2
5685+
assert len(tables.nodes) == 4
5686+
assert np.all(tables.nodes.population == 1)
5687+
5688+
def test_filter_populations_no_filter_nodes(self):
5689+
tables = self.example()
5690+
tables.simplify(filter_populations=True, filter_nodes=False)
5691+
assert len(tables.populations) == 1
5692+
assert len(tables.nodes) == 4
5693+
assert np.all(tables.nodes.population == 0)
5694+
5695+
def test_remapped_default(self):
5696+
tables = self.example()
5697+
tables.simplify()
5698+
assert len(tables.populations) == 1
5699+
assert len(tables.nodes) == 3
5700+
assert np.all(tables.nodes.population == 0)
5701+
5702+
5703+
class TestSimplifyUnreferencedIndividuals:
5704+
def example(self):
5705+
tables = tskit.TableCollection(1)
5706+
tables.individuals.add_row()
5707+
tables.individuals.add_row()
5708+
# No references to individual 0
5709+
tables.nodes.add_row(time=0, individual=1, flags=1)
5710+
tables.nodes.add_row(time=0, individual=1, flags=1)
5711+
tables.nodes.add_row(time=1, individual=1, flags=0)
5712+
# Unreference node
5713+
tables.nodes.add_row(time=1, individual=1, flags=0)
5714+
tables.edges.add_row(0, 1, parent=2, child=0)
5715+
tables.edges.add_row(0, 1, parent=2, child=1)
5716+
tables.sort()
5717+
return tables
5718+
5719+
def test_no_filter_individuals(self):
5720+
tables = self.example()
5721+
tables.simplify(filter_individuals=False)
5722+
assert len(tables.individuals) == 2
5723+
assert len(tables.nodes) == 3
5724+
assert np.all(tables.nodes.individual == 1)
5725+
5726+
def test_no_filter_individuals_nodes(self):
5727+
tables = self.example()
5728+
tables.simplify(filter_individuals=False, filter_nodes=False)
5729+
assert len(tables.individuals) == 2
5730+
assert len(tables.nodes) == 4
5731+
assert np.all(tables.nodes.individual == 1)
5732+
5733+
def test_filter_individuals_no_filter_nodes(self):
5734+
tables = self.example()
5735+
tables.simplify(filter_individuals=True, filter_nodes=False)
5736+
assert len(tables.individuals) == 1
5737+
assert len(tables.nodes) == 4
5738+
assert np.all(tables.nodes.individual == 0)
5739+
5740+
def test_remapped_default(self):
5741+
tables = self.example()
5742+
tables.simplify()
5743+
assert len(tables.individuals) == 1
5744+
assert len(tables.nodes) == 3
5745+
assert np.all(tables.nodes.individual == 0)
5746+
5747+
56585748
class TestSimplifyKeepInputRoots(SimplifyTestBase, ExampleTopologyMixin):
56595749
"""
56605750
Tests for the keep_input_roots option to simplify.

0 commit comments

Comments
 (0)