Skip to content

Commit 51797aa

Browse files
authored
Improve handling of intersecting groups in SVStratify (#9185)
1 parent 845fa58 commit 51797aa

File tree

2 files changed

+71
-19
lines changed

2 files changed

+71
-19
lines changed

src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify.java

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import htsjdk.variant.variantcontext.VariantContextBuilder;
77
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
88
import htsjdk.variant.vcf.VCFHeader;
9+
import htsjdk.variant.vcf.VCFHeaderLineCount;
910
import htsjdk.variant.vcf.VCFHeaderLineType;
1011
import htsjdk.variant.vcf.VCFInfoHeaderLine;
1112
import org.broadinstitute.barclay.argparser.Argument;
@@ -40,7 +41,8 @@
4041
* <li>Reference track overlap</li>
4142
* </ul>
4243
* Records are annotated with their respective strata names in the {@link GATKSVVCFConstants#STRATUM_INFO_KEY} INFO
43-
* field. Users must provide a stratification configuration .tsv file (tab-delimited table) with the following column
44+
* field. SVs that do not match any of the groups will be annotated with the {@link SVStratify#DEFAULT_STRATUM} group.
45+
* Users must provide a stratification configuration .tsv file (tab-delimited table) with the following column
4446
* header on the first line:
4547
* <ol>
4648
* <li>NAME</li>
@@ -78,7 +80,7 @@
7880
* {@link SVStratificationEngineArgumentsCollection#trackNameList} parameters. For example,
7981
* </p>
8082
* <pre>
81-
* gatk GroupedSVCluster \
83+
* gatk SVStratify \
8284
* --track-name RM \
8385
* --track-intervals repeatmasker.bed \
8486
* --track-name SD \
@@ -104,12 +106,11 @@
104106
* overlap is only defined by {@link SVStratificationEngineArgumentsCollection#numBreakpointOverlapsInterchrom}.
105107
* </p>
106108
*
107-
* <p>By default, each stratification group must be mutually exclusive, meaning that any given SV can only belong to
109+
* <p>If using the --split-output option then each stratification group must be mutually exclusive, meaning that any given SV can only belong to
108110
* one group. An error is thrown if the tool encounters a variant that meets the criteria for more than one group.
109-
* This restriction can be overridden with the {@link SVStratify#ALLOW_MULTIPLE_MATCHES_LONG_NAME} argument, in which
110-
* case the record will be written out multiple times: once for each matching stratification group with the corresponding
111-
* {@link GATKSVVCFConstants#STRATUM_INFO_KEY} value. Furthermore, SVs that do not match any of the groups will be
112-
* annotated with the {@link SVStratify#DEFAULT_STRATUM} group.</p>
111+
* This restriction can be overridden with the {@link SVStratify#ALLOW_MULTIPLE_MATCHES_LONG_NAME} argument, in which case
112+
* records belonging to multiple stratification groups will be written to each corresponding file (hence possibly
113+
* resulting in duplicated records).</p>
113114
*
114115
* <p>If using {@link #SPLIT_OUTPUT_LONG_NAME} then the tool generates a set of VCFs as output with each VCF containing
115116
* the records of each group.</p>
@@ -242,7 +243,7 @@ protected void createGroupWriter(final String name, final Path path) {
242243
}
243244

244245
public static void addStratifyMetadata(final VCFHeader header) {
245-
header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.STRATUM_INFO_KEY, 1,
246+
header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.STRATUM_INFO_KEY, VCFHeaderLineCount.UNBOUNDED,
246247
VCFHeaderLineType.String, "Stratum ID"));
247248
}
248249

@@ -322,16 +323,23 @@ public void apply(final VariantContext variant, final ReadsContext readsContext,
322323
if (stratifications.isEmpty()) {
323324
writers.get(DEFAULT_STRATUM).add(builder.attribute(GATKSVVCFConstants.STRATUM_INFO_KEY, DEFAULT_STRATUM).make());
324325
} else {
325-
if (!allowMultipleMatches && stratifications.size() > 1) {
326-
final String matchesString = String.join(", ", stratifications.stream().map(SVStratificationEngine.Stratum::getName).collect(Collectors.toList()));
327-
throw new GATKException("Record " + record.getId() + " matched multiple groups: " + matchesString + ". Bypass this error using the --" + ALLOW_MULTIPLE_MATCHES_LONG_NAME + " argument");
328-
}
329-
for (final SVStratificationEngine.Stratum stratum : stratifications) {
330-
final VariantContextWriter writer = splitOutput ? writers.get(stratum.getName()) : writers.get(DEFAULT_STRATUM);
331-
if (writer == null) {
332-
throw new GATKException("Writer not found for group: " + stratum.getName());
326+
final List<String> stratumNames = new ArrayList<>(stratifications).stream().map(SVStratificationEngine.Stratum::getName).sorted().collect(Collectors.toUnmodifiableList());
327+
final VariantContext outputVariant = builder.attribute(GATKSVVCFConstants.STRATUM_INFO_KEY, stratumNames).make();
328+
if (splitOutput) {
329+
if (!allowMultipleMatches && stratifications.size() > 1) {
330+
final String matchesString = String.join(", ", stratumNames);
331+
throw new GATKException("Record " + record.getId() + " matched multiple groups: " + matchesString + ". Bypass this error using the --" + ALLOW_MULTIPLE_MATCHES_LONG_NAME + " argument");
332+
}
333+
for (final SVStratificationEngine.Stratum stratum : stratifications) {
334+
final VariantContextWriter writer = writers.get(stratum.getName());
335+
if (writer == null) {
336+
throw new GATKException("Writer not found for group: " + stratum.getName());
337+
}
338+
writer.add(outputVariant);
333339
}
334-
writer.add(builder.attribute(GATKSVVCFConstants.STRATUM_INFO_KEY, stratum.getName()).make());
340+
} else {
341+
final VariantContextWriter writer = writers.get(DEFAULT_STRATUM);
342+
writer.add(outputVariant);
335343
}
336344
}
337345
}

src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVStratifyIntegrationTest.java

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,8 +124,42 @@ public void testBwaMeltCohortSingleOutput() {
124124
Assert.assertEquals(outputVcf.getRight().size(), inputVcf.getRight().size());
125125
}
126126

127-
@Test(expectedExceptions = GATKException.class)
127+
@Test
128128
public void testBwaMeltCohortRedundant() {
129+
final File outputDir = createTempDir("stratify");
130+
final File outputFile = outputDir.toPath().resolve("out.vcf.gz").toFile();
131+
final String inputVcfPath = getToolTestDataDir() + "bwa_melt.chr22.vcf.gz";
132+
final String configFile = getToolTestDataDir() + "test_config_redundant.tsv";
133+
134+
final String segdupFile = getToolTestDataDir() + "hg38.SegDup.chr22.bed";
135+
final String segdupName = "SD";
136+
final String repeatmaskerFile = getToolTestDataDir() + "hg38.RM.chr22_subsampled.bed";
137+
final String repeatmaskerName = "RM";
138+
139+
final ArgumentsBuilder args = new ArgumentsBuilder()
140+
.addOutput(outputFile)
141+
.add(SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME, configFile)
142+
.add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, segdupName)
143+
.add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, segdupFile)
144+
.add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, repeatmaskerName)
145+
.add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, repeatmaskerFile)
146+
.add(SVStratificationEngineArgumentsCollection.OVERLAP_FRACTION_LONG_NAME, 0.5)
147+
.add(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, GATKBaseTest.FULL_HG38_DICT)
148+
.add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVcfPath);
149+
150+
runCommandLine(args, SVStratify.class.getSimpleName());
151+
152+
final List<File> outputFiles = Lists.newArrayList(outputDir.listFiles()).stream().filter(VcfUtils::isVariantFile).collect(Collectors.toUnmodifiableList());
153+
Assert.assertEquals(outputFiles.size(), 1);
154+
Assert.assertEquals(outputFiles.get(0).getAbsolutePath(), outputFile.getAbsolutePath());
155+
final Pair<VCFHeader, List<VariantContext>> inputVcf = VariantContextTestUtils.readEntireVCFIntoMemory(inputVcfPath);
156+
final Pair<VCFHeader, List<VariantContext>> outputVcf = VariantContextTestUtils.readEntireVCFIntoMemory(outputFile.getAbsolutePath());
157+
// No duplicated records
158+
Assert.assertEquals(outputVcf.getRight().size(), inputVcf.getRight().size());
159+
}
160+
161+
@Test(expectedExceptions = GATKException.class)
162+
public void testBwaMeltCohortRedundantFails() {
129163
final File outputDir = createTempDir("stratify");
130164
final String inputVcfPath = getToolTestDataDir() + "bwa_melt.chr22.vcf.gz";
131165
final String configFile = getToolTestDataDir() + "test_config_redundant.tsv";
@@ -153,7 +187,7 @@ public void testBwaMeltCohortRedundant() {
153187

154188
@Test
155189
public void testBwaMeltCohortBypassRedundant() {
156-
final File outputDir = createTempDir("stratify");
190+
final File outputDir = createTempDir("stratify_bypass_redundant");
157191
final String inputVcfPath = getToolTestDataDir() + "bwa_melt.chr22.vcf.gz";
158192
final String configFile = getToolTestDataDir() + "test_config_redundant.tsv";
159193

@@ -177,6 +211,16 @@ public void testBwaMeltCohortBypassRedundant() {
177211
.addFlag(SVStratify.ALLOW_MULTIPLE_MATCHES_LONG_NAME);
178212

179213
runCommandLine(args, SVStratify.class.getSimpleName());
214+
215+
final List<File> outputFiles = Lists.newArrayList(outputDir.listFiles()).stream().filter(VcfUtils::isVariantFile).collect(Collectors.toUnmodifiableList());
216+
Assert.assertEquals(outputFiles.size(), 8);
217+
long totalRecords = 0;
218+
for (final File outputFile : outputFiles) {
219+
final Pair<VCFHeader, List<VariantContext>> inputVcf = VariantContextTestUtils.readEntireVCFIntoMemory(inputVcfPath);
220+
final Pair<VCFHeader, List<VariantContext>> outputVcf = VariantContextTestUtils.readEntireVCFIntoMemory(outputFile.getAbsolutePath());
221+
totalRecords += outputVcf.getRight().size();
222+
}
223+
Assert.assertEquals(totalRecords, 1458);
180224
}
181225

182226
@Test(expectedExceptions = {GATKException.class})

0 commit comments

Comments
 (0)